# Semi-Supervised Learning with Support Vector Machines


In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Load Data

In [2]:
# Location of the GSE58606 expression table, relative to this notebook.
data = "./data/GSE58606_data.csv"

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,4040 : hsa-miR-9-5p,4610 : hsa-miR-126-3p,4700 : hsa-miR-140-5p,5250 : hsa-miR-105-5p,6880 : hsa-miR-297,9938 : hsa-let-7i-5p,10138 : hsa-miR-130a-3p,10306 : hsa-miR-146b-5p,10901 : hsa_negative_control_6,10902 : hsa_negative_control_7,...,169411 : hsa-miR-205-3p,169412 : hsa-miR-1260a,169414 : hsa-miR-525-5p,169415 : hsa-miR-187-5p,169416 : hsa-miRPlus-A1086,169417 : hsa-miR-551b-5p,169419 : hsa-miR-300,169420 : hsa-miR-193b-5p,target,target_actual
0,6.877643,8.080215,6.245265,6.304782,6.686064,7.80595,6.814781,7.146955,6.17068,6.129586,...,7.176684,7.724408,6.035476,6.255048,6.572019,6.251777,5.873303,6.362097,1,primary breast cancer
1,6.544287,6.882722,5.874236,6.324987,6.638048,6.946567,6.488574,6.690773,6.160272,6.104929,...,7.249094,7.30889,6.021796,6.902473,6.347183,6.625569,6.004899,6.560329,1,primary breast cancer
2,6.879713,6.885387,5.958748,6.317687,6.664956,7.266728,6.965078,6.969818,6.226957,6.124874,...,7.404104,7.466418,5.977962,6.484571,6.259788,6.226957,5.815442,6.354622,1,primary breast cancer
3,6.943464,7.972342,6.026106,6.425266,6.761433,8.010618,6.441945,6.679714,6.471518,6.264685,...,6.811846,8.601204,6.02179,5.91264,6.002302,5.999349,5.811117,6.306704,1,primary breast cancer
4,7.097877,9.014977,7.127656,6.566482,6.990273,8.74636,8.087122,7.680249,6.499892,6.22656,...,6.361862,7.570751,5.967819,5.821186,6.550078,5.947994,5.828413,6.288596,1,primary breast cancer


In [3]:
# Distinct human-readable class names present in the data.
df.loc[:, "target_actual"].unique()

array(['primary breast cancer', 'normal breast tissue'], dtype=object)

In [4]:
# Distinct numeric class codes present in the data.
df.loc[:, 'target'].unique()

array([1, 0])

breast cancer tissue: 1  
normal tissue: 0

In [5]:
# Columns that carry the class label (numeric code and its text form).
targets = ['target', 'target_actual']

# Labels: the numeric class code.
labels = df['target']

# Features: every remaining column once both label columns are removed.
gene_exp_df = df.drop(columns=targets)
gene_exp_df

Unnamed: 0,4040 : hsa-miR-9-5p,4610 : hsa-miR-126-3p,4700 : hsa-miR-140-5p,5250 : hsa-miR-105-5p,6880 : hsa-miR-297,9938 : hsa-let-7i-5p,10138 : hsa-miR-130a-3p,10306 : hsa-miR-146b-5p,10901 : hsa_negative_control_6,10902 : hsa_negative_control_7,...,169409 : hsa-miR-4286,169410 : hsa-miR-556-5p,169411 : hsa-miR-205-3p,169412 : hsa-miR-1260a,169414 : hsa-miR-525-5p,169415 : hsa-miR-187-5p,169416 : hsa-miRPlus-A1086,169417 : hsa-miR-551b-5p,169419 : hsa-miR-300,169420 : hsa-miR-193b-5p
0,6.877643,8.080215,6.245265,6.304782,6.686064,7.805950,6.814781,7.146955,6.170680,6.129586,...,8.196471,5.970980,7.176684,7.724408,6.035476,6.255048,6.572019,6.251777,5.873303,6.362097
1,6.544287,6.882722,5.874236,6.324987,6.638048,6.946567,6.488574,6.690773,6.160272,6.104929,...,7.063283,5.893273,7.249094,7.308890,6.021796,6.902473,6.347183,6.625569,6.004899,6.560329
2,6.879713,6.885387,5.958748,6.317687,6.664956,7.266728,6.965078,6.969818,6.226957,6.124874,...,7.689806,5.965072,7.404104,7.466418,5.977962,6.484571,6.259788,6.226957,5.815442,6.354622
3,6.943464,7.972342,6.026106,6.425266,6.761433,8.010618,6.441945,6.679714,6.471518,6.264685,...,9.063759,6.043617,6.811846,8.601204,6.021790,5.912640,6.002302,5.999349,5.811117,6.306704
4,7.097877,9.014977,7.127656,6.566482,6.990273,8.746360,8.087122,7.680249,6.499892,6.226560,...,9.186942,5.996952,6.361862,7.570751,5.967819,5.821186,6.550078,5.947994,5.828413,6.288596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,7.069042,6.928371,6.361283,6.419449,7.047141,7.177212,6.653090,6.749214,6.404046,6.188651,...,7.694343,5.962416,7.267664,7.730168,6.023850,5.820426,6.118422,6.035515,5.776698,6.312001
129,7.220528,7.135147,6.358755,6.525857,7.054967,7.533287,6.645902,6.825213,6.416208,6.219274,...,7.150628,5.961384,6.557431,7.997207,6.125245,5.822034,6.008232,6.000131,5.807077,6.336421
130,6.753913,6.634270,6.167049,6.404217,6.760727,6.811819,6.340138,6.595280,6.340526,6.124168,...,6.443596,6.010598,7.216755,6.498206,6.160170,5.833019,5.949871,6.577482,6.395739,6.546783
131,6.949213,8.136970,6.530446,6.340408,6.839192,7.641705,7.269473,6.743930,6.403648,6.189620,...,6.352844,5.927407,7.165452,6.852333,5.975425,6.035830,6.323667,6.424863,5.780869,6.318444


## Supervised learning with SVM classification 

In [6]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    gene_exp_df,
    labels,
    test_size=0.20,
    random_state=77,
)

# SVC with scikit-learn defaults, plus probability estimates enabled.
svc = svm.SVC(probability=True)

# Fit on the training portion; the fitted estimator is the cell's output.
svc.fit(X=x_train, y=y_train)

SVC(probability=True)

In [8]:
# Predict class codes for the held-out 20% test samples.
y_pred = svc.predict(X=x_test)

In [9]:
# Show predictions next to the true labels, followed by the overall accuracy.
print("The predicted Data is :", y_pred, sep="\n")
print("The actual data is:", np.array(y_test), sep="\n")
print(f"The model is {accuracy_score(y_pred,y_test)*100}% accurate")

The predicted Data is :
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
The actual data is:
[0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1]
The model is 81.48148148148148% accurate


## Self-learning with a 50/50 split

In [35]:
# Split the data in half: the "train" half keeps its labels, the "test" half
# plays the role of the unlabeled pool in the self-learning experiment.
x_train_sl, x_test_sl, y_train_sl, y_test_sl = train_test_split(
    gene_exp_df,
    labels,
    test_size=0.50,
    random_state=77,
)

Initial model training with 50% of the data

In [36]:

# Initial self-learning model: same SVC configuration as the supervised run.
svc_sl = svm.SVC(probability=True)

# Fit on the labeled half only; the fitted estimator is the cell's output.
svc_sl.fit(X=x_train_sl, y=y_train_sl)

SVC(probability=True)

Generate predicted labels (pseudo-labels) for the "unlabeled" data, i.e. the held-out half whose true labels are withheld from training.

In [37]:
# Pseudo-labels for the "unlabeled" half, predicted by the current svc_sl.
# Reuse x_test_sl's index (and the label Series' name) so that each predicted
# label stays attached to its feature row by index as well as by position.
# A bare pd.Series(ndarray) would get a fresh 0..n-1 index, which no longer
# matches x_test_sl and can duplicate row labels already present in y_train_sl
# once the two Series are concatenated.
y_pred_sl = pd.Series(
    svc_sl.predict(x_test_sl),
    index=x_test_sl.index,
    name=y_train_sl.name,
)

Create the retraining dataset: the labeled half with its true labels, plus the "unlabeled" half with the labels the initial model predicted for it.

In [38]:
# Stack the labeled half on top of the pseudo-labeled half, features first ...
sl_data = pd.concat([x_train_sl, x_test_sl], axis=0)
# ... then the matching labels in the same order (true labels, then predicted ones).
sl_labels = pd.concat([y_train_sl, y_pred_sl], axis=0)

# Sanity check: there must be exactly one label per feature row.
sl_data.shape[0] == sl_labels.shape[0]

True

In [39]:
# Refit the same estimator on the combined true-labeled + pseudo-labeled data.
# This replaces the initial fit held by svc_sl.
svc_sl.fit(X=sl_data, y=sl_labels)

SVC(probability=True)

In [41]:
# Evaluate the retrained model on the "unlabeled" pool. The true labels of this
# pool (y_test_sl) were never used for fitting, but its features and
# pseudo-labels were part of the retraining set, so this is a transductive
# score rather than a score on unseen data.
sl_preds = svc_sl.predict(x_test_sl)
y_true_sl = np.asarray(y_test_sl)

print("The predicted Data is :")
print(sl_preds)
print("The actual data is:")
print(y_true_sl)
# accuracy_score is documented as (y_true, y_pred); the value is the same either way.
print(f"The model is {accuracy_score(y_true_sl, sl_preds)*100}% accurate")

# Baseline on the *same* samples. y_pred_sl holds the predictions made for this
# pool before retraining (assuming the cells were run top to bottom), so this
# is the only like-for-like "before vs. after" comparison. An accuracy measured
# on a different split says nothing about the effect of self-learning.
initial_preds = np.asarray(y_pred_sl)
baseline_acc = accuracy_score(y_true_sl, initial_preds)
n_changed = int(np.sum(initial_preds != sl_preds))
print(f"Before self-learning, the model was {baseline_acc*100}% accurate on the same samples")
print(f"Predictions changed by retraining: {n_changed} of {len(sl_preds)}")

The predicted Data is :
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
The actual data is:
[0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
The model is 89.55223880597015% accurate


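The reported accuracy went from 81.48% (supervised) to 89.55% (self-learning), but these two numbers are not comparable and do not show that self-learning helped. The first was measured on a 20% test split (27 samples), the second on the 50% split (67 samples), whose features and pseudo-labels were also used for retraining. In both runs the classifier predicts class 1 for every sample, so each accuracy is simply the share of cancer samples in that split (22/27 and 60/67). A fair comparison evaluates the initial and the retrained model on the same samples, ideally a held-out set that is never pseudo-labeled.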