### Package import

In [80]:
import csv
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

## Dataset loading and preprocessing



In [4]:
def _load_assay_features(assay):
    """Read the feature table for one assay.

    Parameters
    ----------
    assay : str
        Name of the assay sub-folder under ``Data/``.

    Returns
    -------
    pandas.DataFrame
        Contents of ``Data/<assay>/DeepSP_Feature.csv``.
    """
    return pd.read_csv('Data/{}/DeepSP_Feature.csv'.format(assay))


# One training table per assay; every file shares the same name and differs
# only in its assay folder.
dataset_ACSINS = _load_assay_features('ACSINS')
dataset_AS = _load_assay_features('AS')
dataset_BVP = _load_assay_features('BVP')
dataset_CIC = _load_assay_features('CIC')
dataset_CSI = _load_assay_features('CSI')
dataset_ELISA = _load_assay_features('ELISA')
dataset_HIC = _load_assay_features('HIC')
dataset_HEK = _load_assay_features('HEK')
dataset_PSR = _load_assay_features('PSR')
dataset_SGAC = _load_assay_features('SGAC')
dataset_SMAC = _load_assay_features('SMAC')
dataset_Tm = _load_assay_features('Tm')

In [5]:
# Descriptor columns used as model inputs, keyed by assay.
train_columns = {
    'ACSINS': ['SAP_pos_CDRH1', 'SAP_pos_CDRL3', 'SCM_pos_CDRH1', 'SCM_neg_CDR'],
    'AS': ['SAP_pos_CDRH2', 'SCM_pos_CDRL2', 'SCM_pos_CDRL3', 'SCM_neg_CDRL3'],
    'BVP': ['SAP_pos_CDRH1', 'SAP_pos_CDRH3', 'SCM_pos_CDR', 'SCM_neg_CDRH3'],
    'CIC': ['SAP_pos_CDRL2', 'SAP_pos_CDRL3', 'SAP_pos_Lv', 'SCM_neg_CDR'],
    'CSI': ['SAP_pos_CDRL1', 'SAP_pos_Lv', 'SCM_pos_CDRH2', 'SCM_neg_CDRL2'],
    'ELISA': ['SAP_pos_CDRH3', 'SCM_pos_CDR', 'SCM_neg_CDR'],
    'HIC': ['SAP_pos_CDRL3', 'SAP_pos_CDR', 'SAP_pos_Hv', 'SCM_pos_CDRH3'],
    'HEK': ['SAP_pos_CDRH2', 'SAP_pos_CDRL3', 'SCM_pos_Lv', 'SCM_neg_Lv'],
    'PSR': ['SAP_pos_Lv', 'SCM_pos_CDRH2', 'SCM_neg_CDRL2'],
    'SGAC': ['SAP_pos_CDRH1', 'SAP_pos_CDRL3', 'SCM_neg_CDRH2', 'SCM_neg_Lv'],
    'SMAC': ['SAP_pos_CDR', 'SAP_pos_Fv', 'SCM_neg_CDRL2', 'SCM_neg_Fv'],
    'Tm': ['SAP_pos_CDRH1', 'SAP_pos_CDRH2', 'SCM_pos_CDRH3'],
}

# Raw (unscaled) training inputs: each assay's table restricted to its columns.
X_train_ACSINS = dataset_ACSINS[train_columns['ACSINS']]
X_train_AS = dataset_AS[train_columns['AS']]
X_train_BVP = dataset_BVP[train_columns['BVP']]
X_train_CIC = dataset_CIC[train_columns['CIC']]
X_train_CSI = dataset_CSI[train_columns['CSI']]
X_train_ELISA = dataset_ELISA[train_columns['ELISA']]
X_train_HIC = dataset_HIC[train_columns['HIC']]
X_train_HEK = dataset_HEK[train_columns['HEK']]
X_train_PSR = dataset_PSR[train_columns['PSR']]
X_train_SGAC = dataset_SGAC[train_columns['SGAC']]
X_train_SMAC = dataset_SMAC[train_columns['SMAC']]
X_train_Tm = dataset_Tm[train_columns['Tm']]


In [6]:
# Standardize each assay's training features with its OWN scaler. A single
# shared scaler is re-fitted by every fit_transform call and therefore only
# remembers the last dataset it saw; keeping one fitted scaler per assay
# preserves the training mean/std of every feature set.
scaler_ACSINS = StandardScaler()
X_train_ACSINS = scaler_ACSINS.fit_transform(X_train_ACSINS)
scaler_AS = StandardScaler()
X_train_AS = scaler_AS.fit_transform(X_train_AS)
scaler_BVP = StandardScaler()
X_train_BVP = scaler_BVP.fit_transform(X_train_BVP)
scaler_CIC = StandardScaler()
X_train_CIC = scaler_CIC.fit_transform(X_train_CIC)
scaler_CSI = StandardScaler()
X_train_CSI = scaler_CSI.fit_transform(X_train_CSI)
scaler_ELISA = StandardScaler()
X_train_ELISA = scaler_ELISA.fit_transform(X_train_ELISA)
scaler_HIC = StandardScaler()
X_train_HIC = scaler_HIC.fit_transform(X_train_HIC)
scaler_HEK = StandardScaler()
X_train_HEK = scaler_HEK.fit_transform(X_train_HEK)
scaler_PSR = StandardScaler()
X_train_PSR = scaler_PSR.fit_transform(X_train_PSR)
scaler_SGAC = StandardScaler()
X_train_SGAC = scaler_SGAC.fit_transform(X_train_SGAC)
scaler_SMAC = StandardScaler()
X_train_SMAC = scaler_SMAC.fit_transform(X_train_SMAC)
scaler_Tm = StandardScaler()
X_train_Tm = scaler_Tm.fit_transform(X_train_Tm)

# `sc` stays defined because later cells still reference this name.
sc = StandardScaler()


In [7]:
# Regression target for each assay, taken from that assay's own table.
y_train_ACSINS = dataset_ACSINS.loc[:, 'ACSINS']
y_train_AS = dataset_AS.loc[:, 'AS']
y_train_BVP = dataset_BVP.loc[:, 'BVP']
y_train_CIC = dataset_CIC.loc[:, 'CIC']
y_train_CSI = dataset_CSI.loc[:, 'CSI']
y_train_ELISA = dataset_ELISA.loc[:, 'ELISA']
y_train_HIC = dataset_HIC.loc[:, 'HIC']
y_train_HEK = dataset_HEK.loc[:, 'HEK']
y_train_PSR = dataset_PSR.loc[:, 'PSR']
# The last assays below do not all follow the "<assay>" column naming:
# SGAC reads 'SGACSTD' and SMAC reads 'SMAC_nor'.
y_train_SGAC = dataset_SGAC.loc[:, 'SGACSTD']
y_train_SMAC = dataset_SMAC.loc[:, 'SMAC_nor']
y_train_Tm = dataset_Tm.loc[:, 'Tm']

### ACSINS SVR C=3.5 e=0.1

In [60]:
# ACSINS: RBF-kernel SVR (C=3.5, epsilon=0.1) fitted on the standardized
# training features. All metrics below are computed on the same rows the model
# was fitted on, i.e. they are training-set scores.
SVR_ACSINS = SVR(C=3.5, epsilon=0.1, kernel='rbf', gamma='scale')
SVR_ACSINS.fit(X_train_ACSINS, y_train_ACSINS)
y_pred_ACSINS = SVR_ACSINS.predict(X_train_ACSINS)

r2_ACSINS = r2_score(y_train_ACSINS, y_pred_ACSINS)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_ACSINS = sqrt(r2_ACSINS)
MSE_ACSINS = mean_squared_error(y_train_ACSINS, y_pred_ACSINS)
RMSE_ACSINS = sqrt(MSE_ACSINS)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_ACSINS = mean_absolute_error(y_train_ACSINS, y_pred_ACSINS)
print(r_ACSINS, MSE_ACSINS)

0.6809213107523885 0.6930730184448964


### AS LR

In [9]:
# AS: ordinary least-squares linear regression on the standardized training
# features. Metrics are computed on the training rows themselves.
lr_AS = LinearRegression()
lr_AS.fit(X_train_AS, y_train_AS)
y_pred_AS = lr_AS.predict(X_train_AS)

r2_AS = r2_score(y_train_AS, y_pred_AS)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_AS = sqrt(r2_AS)
MSE_AS = mean_squared_error(y_train_AS, y_pred_AS)
RMSE_AS = sqrt(MSE_AS)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_AS = mean_absolute_error(y_train_AS, y_pred_AS)
print(r_AS, MSE_AS)

0.27336639194637663 0.002742893761564698


### BVP KNN n=6

In [10]:
# BVP: k-nearest-neighbours regressor (k=6) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_BVP = KNeighborsRegressor(n_neighbors=6)
KNN_BVP.fit(X_train_BVP, y_train_BVP)
y_pred_BVP = KNN_BVP.predict(X_train_BVP)

r2_BVP = r2_score(y_train_BVP, y_pred_BVP)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_BVP = sqrt(r2_BVP)
MSE_BVP = mean_squared_error(y_train_BVP, y_pred_BVP)
RMSE_BVP = sqrt(MSE_BVP)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_BVP = mean_absolute_error(y_train_BVP, y_pred_BVP)
print(r_BVP, MSE_BVP)

0.7387908015347076 8.425241559023506


### CIC KNN n=6

In [11]:
# CIC: k-nearest-neighbours regressor (k=6) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_CIC = KNeighborsRegressor(n_neighbors=6)
KNN_CIC.fit(X_train_CIC, y_train_CIC)
y_pred_CIC = KNN_CIC.predict(X_train_CIC)

r2_CIC = r2_score(y_train_CIC, y_pred_CIC)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_CIC = sqrt(r2_CIC)
MSE_CIC = mean_squared_error(y_train_CIC, y_pred_CIC)
RMSE_CIC = sqrt(MSE_CIC)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_CIC = mean_absolute_error(y_train_CIC, y_pred_CIC)
print(r_CIC, MSE_CIC)

0.6943256767925905 0.5654896239220957


### CSI SVR C=1.0 e=0.2

In [12]:
# CSI: RBF-kernel SVR (C=1.0, epsilon=0.2) fitted on the standardized training
# features. Metrics are computed on the training rows themselves.
SVR_CSI = SVR(C=1.0, epsilon=0.2, kernel='rbf', gamma='scale')
SVR_CSI.fit(X_train_CSI, y_train_CSI)
y_pred_CSI = SVR_CSI.predict(X_train_CSI)

r2_CSI = r2_score(y_train_CSI, y_pred_CSI)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_CSI = sqrt(r2_CSI)
MSE_CSI = mean_squared_error(y_train_CSI, y_pred_CSI)
RMSE_CSI = sqrt(MSE_CSI)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_CSI = mean_absolute_error(y_train_CSI, y_pred_CSI)
print(r_CSI, MSE_CSI)

0.5552676269693703 0.8817649834855176


### ELISA KNN n=3

In [13]:
# ELISA: k-nearest-neighbours regressor (k=3) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_ELISA = KNeighborsRegressor(n_neighbors=3)
KNN_ELISA.fit(X_train_ELISA, y_train_ELISA)
y_pred_ELISA = KNN_ELISA.predict(X_train_ELISA)

r2_ELISA = r2_score(y_train_ELISA, y_pred_ELISA)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_ELISA = sqrt(r2_ELISA)
MSE_ELISA = mean_squared_error(y_train_ELISA, y_pred_ELISA)
RMSE_ELISA = sqrt(MSE_ELISA)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_ELISA = mean_absolute_error(y_train_ELISA, y_pred_ELISA)
print(r_ELISA, MSE_ELISA)

0.8473431985957309 1.9669604653838246


### HIC SVR C=2.0 e=0.01

In [14]:
# HIC: RBF-kernel SVR (C=2.0, epsilon=0.01) fitted on the standardized
# training features. Metrics are computed on the training rows themselves.
SVR_HIC = SVR(C=2.0, epsilon=0.01, kernel='rbf', gamma='scale')
SVR_HIC.fit(X_train_HIC, y_train_HIC)
y_pred_HIC = SVR_HIC.predict(X_train_HIC)

r2_HIC = r2_score(y_train_HIC, y_pred_HIC)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_HIC = sqrt(r2_HIC)
MSE_HIC = mean_squared_error(y_train_HIC, y_pred_HIC)
RMSE_HIC = sqrt(MSE_HIC)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_HIC = mean_absolute_error(y_train_HIC, y_pred_HIC)
print(r_HIC, MSE_HIC)

0.7283280020808955 0.5608490329707051


### HEK KNN n=5

In [15]:
# HEK: k-nearest-neighbours regressor (k=5) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_HEK = KNeighborsRegressor(n_neighbors=5)
KNN_HEK.fit(X_train_HEK, y_train_HEK)
y_pred_HEK = KNN_HEK.predict(X_train_HEK)

r2_HEK = r2_score(y_train_HEK, y_pred_HEK)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_HEK = sqrt(r2_HEK)
MSE_HEK = mean_squared_error(y_train_HEK, y_pred_HEK)
RMSE_HEK = sqrt(MSE_HEK)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_HEK = mean_absolute_error(y_train_HEK, y_pred_HEK)
print(r_HEK, MSE_HEK)

0.6524747770520979 2236.5250708465364


### PSR SVR C=0.4 e=0.1

In [16]:
# PSR: RBF-kernel SVR (C=0.4, epsilon=0.1) fitted on the standardized training
# features. Metrics are computed on the training rows themselves.
SVR_PSR = SVR(C=0.4, epsilon=0.1, kernel='rbf', gamma='scale')
SVR_PSR.fit(X_train_PSR, y_train_PSR)
y_pred_PSR = SVR_PSR.predict(X_train_PSR)

r2_PSR = r2_score(y_train_PSR, y_pred_PSR)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_PSR = sqrt(r2_PSR)
MSE_PSR = mean_squared_error(y_train_PSR, y_pred_PSR)
RMSE_PSR = sqrt(MSE_PSR)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_PSR = mean_absolute_error(y_train_PSR, y_pred_PSR)
print(r_PSR, MSE_PSR)

0.6948314590910862 0.021392297391148736


### SGAC SVR C=3.0 e=0.3

In [17]:
# SGAC: RBF-kernel SVR (C=3.0, epsilon=0.3) fitted on the standardized
# training features. Metrics are computed on the training rows themselves.
SVR_SGAC = SVR(C=3.0, epsilon=0.3, kernel='rbf', gamma='scale')
SVR_SGAC.fit(X_train_SGAC, y_train_SGAC)
y_pred_SGAC = SVR_SGAC.predict(X_train_SGAC)

r2_SGAC = r2_score(y_train_SGAC, y_pred_SGAC)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_SGAC = sqrt(r2_SGAC)
MSE_SGAC = mean_squared_error(y_train_SGAC, y_pred_SGAC)
RMSE_SGAC = sqrt(MSE_SGAC)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_SGAC = mean_absolute_error(y_train_SGAC, y_pred_SGAC)
print(r_SGAC, MSE_SGAC)

0.6887136980539098 0.5218364096484991


### SMAC KNN n=6

In [18]:
# SMAC: k-nearest-neighbours regressor (k=6) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_SMAC = KNeighborsRegressor(n_neighbors=6)
KNN_SMAC.fit(X_train_SMAC, y_train_SMAC)
y_pred_SMAC = KNN_SMAC.predict(X_train_SMAC)

r2_SMAC = r2_score(y_train_SMAC, y_pred_SMAC)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_SMAC = sqrt(r2_SMAC)
MSE_SMAC = mean_squared_error(y_train_SMAC, y_pred_SMAC)
RMSE_SMAC = sqrt(MSE_SMAC)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_SMAC = mean_absolute_error(y_train_SMAC, y_pred_SMAC)
print(r_SMAC, MSE_SMAC)

0.7354505583160753 0.6092917755781933


### Tm KNN n=6

In [19]:
# Tm: k-nearest-neighbours regressor (k=6) on the standardized training
# features. Metrics are computed on the training rows themselves.
KNN_Tm = KNeighborsRegressor(n_neighbors=6)
KNN_Tm.fit(X_train_Tm, y_train_Tm)
y_pred_Tm = KNN_Tm.predict(X_train_Tm)

r2_Tm = r2_score(y_train_Tm, y_pred_Tm)
# r is reported as sqrt(R^2); math.sqrt raises ValueError if R^2 is negative.
r_Tm = sqrt(r2_Tm)
MSE_Tm = mean_squared_error(y_train_Tm, y_pred_Tm)
RMSE_Tm = sqrt(MSE_Tm)
# MAE needs mean_absolute_error; mean_squared_error would just repeat the MSE.
MAE_Tm = mean_absolute_error(y_train_Tm, y_pred_Tm)
print(r_Tm, MSE_Tm)

0.625084304875398 20.779197080291972


## Model Testing

In [91]:
# Load the table that predictions are made for. Later cells select the
# SAP_*/SCM_* descriptor columns from it and use its 'Name' column as the row
# label in the exported results.
dataset_test = pd.read_csv('SAPSCM.csv')
# Bare last expression: shows the frame as this cell's output.
dataset_test

Unnamed: 0,Name,SAP_pos_CDRH1,SAP_pos_CDRH2,SAP_pos_CDRH3,SAP_pos_CDRL1,SAP_pos_CDRL2,SAP_pos_CDRL3,SAP_pos_CDR,SAP_pos_Hv,SAP_pos_Lv,...,SCM_pos_CDRH1,SCM_pos_CDRH2,SCM_pos_CDRH3,SCM_pos_CDRL1,SCM_pos_CDRL2,SCM_pos_CDRL3,SCM_pos_CDR,SCM_pos_Hv,SCM_pos_Lv,SCM_pos_Fv
0,abituzumab,3.86,4.35,8.72,2.15,4.70,6.79,30.68,45.53,36.43,...,33.36,50.21,58.55,37.38,99.53,19.15,301.27,877.35,1176.29,2044.64
1,abrilumab,3.14,0.95,1.52,1.63,3.66,5.10,17.65,43.00,29.55,...,4.73,1.59,8.02,34.77,5.28,23.33,70.42,921.07,925.66,1834.76
2,adalimumab,2.13,2.52,14.45,1.90,3.59,3.17,27.50,58.42,30.52,...,3.18,19.58,29.51,116.77,41.76,55.55,263.84,907.11,1219.44,2109.09
3,alemtuzumab,2.10,3.39,4.52,1.68,3.61,3.74,20.84,51.51,32.00,...,31.36,117.38,112.29,59.08,40.17,161.99,528.97,1438.97,1261.75,2691.13
4,alirocumab,2.34,0.49,5.75,6.34,2.48,4.47,23.01,52.35,43.46,...,87.39,30.73,22.05,163.36,64.86,19.05,393.55,1270.47,966.63,2228.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,vedolizumab,2.90,0.38,11.47,6.61,5.52,1.48,28.12,51.99,53.38,...,35.55,-3.78,-0.06,125.56,43.83,73.31,267.23,969.92,1093.80,2053.88
133,veltuzumab,2.51,4.15,14.46,3.16,4.07,2.54,31.23,43.67,37.64,...,34.90,0.02,39.57,32.31,35.55,30.64,166.97,1078.54,1059.89,2115.08
134,visilizumab,6.16,3.60,15.23,2.19,3.86,2.65,34.40,57.11,32.81,...,100.69,106.18,75.53,-2.44,69.55,18.26,372.93,1269.88,1078.14,2312.87
135,zalutumumab,1.99,5.14,18.61,0.97,2.46,5.58,34.69,63.19,32.75,...,29.38,5.33,43.21,1.04,2.67,29.45,108.28,1160.79,941.98,2081.37


In [66]:
# For every assay, take from the prediction table the same descriptor columns
# its model was trained on (the lists mirror the training-side selection).
feature_ACSINS = dataset_test.loc[:, ['SAP_pos_CDRH1', 'SAP_pos_CDRL3', 'SCM_pos_CDRH1', 'SCM_neg_CDR']]
feature_AS = dataset_test.loc[:, ['SAP_pos_CDRH2', 'SCM_pos_CDRL2', 'SCM_pos_CDRL3', 'SCM_neg_CDRL3']]
feature_BVP = dataset_test.loc[:, ['SAP_pos_CDRH1', 'SAP_pos_CDRH3', 'SCM_pos_CDR', 'SCM_neg_CDRH3']]
feature_CIC = dataset_test.loc[:, ['SAP_pos_CDRL2', 'SAP_pos_CDRL3', 'SAP_pos_Lv', 'SCM_neg_CDR']]
feature_CSI = dataset_test.loc[:, ['SAP_pos_CDRL1', 'SAP_pos_Lv', 'SCM_pos_CDRH2', 'SCM_neg_CDRL2']]
feature_ELISA = dataset_test.loc[:, ['SAP_pos_CDRH3', 'SCM_pos_CDR', 'SCM_neg_CDR']]
feature_HIC = dataset_test.loc[:, ['SAP_pos_CDRL3', 'SAP_pos_CDR', 'SAP_pos_Hv', 'SCM_pos_CDRH3']]
feature_HEK = dataset_test.loc[:, ['SAP_pos_CDRH2', 'SAP_pos_CDRL3', 'SCM_pos_Lv', 'SCM_neg_Lv']]
feature_PSR = dataset_test.loc[:, ['SAP_pos_Lv', 'SCM_pos_CDRH2', 'SCM_neg_CDRL2']]
feature_SGAC = dataset_test.loc[:, ['SAP_pos_CDRH1', 'SAP_pos_CDRL3', 'SCM_neg_CDRH2', 'SCM_neg_Lv']]
feature_SMAC = dataset_test.loc[:, ['SAP_pos_CDR', 'SAP_pos_Fv', 'SCM_neg_CDRL2', 'SCM_neg_Fv']]
feature_Tm = dataset_test.loc[:, ['SAP_pos_CDRH1', 'SAP_pos_CDRH2', 'SCM_pos_CDRH3']]


In [67]:
def _scale_with_training_stats(train_df, test_features):
    """Standardize prediction inputs with the mean/std of the training data.

    The models were fitted on training features standardized by their own
    mean and standard deviation, so new rows must go through the SAME
    transformation. Re-fitting the scaler on the prediction table would shift
    and rescale the inputs relative to what the models learned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training table of the assay; must contain every column of
        ``test_features``.
    test_features : pandas.DataFrame
        Selected descriptor columns of the rows to predict.

    Returns
    -------
    numpy.ndarray
        ``test_features`` standardized with the training statistics, same
        row and column order.
    """
    columns = list(test_features.columns)
    # Plain arrays on both sides, matching the arrays the models were fitted on.
    scaler = StandardScaler().fit(train_df[columns].values)
    return scaler.transform(test_features.values)


X_ACSINS = _scale_with_training_stats(dataset_ACSINS, feature_ACSINS)
X_AS = _scale_with_training_stats(dataset_AS, feature_AS)
X_BVP = _scale_with_training_stats(dataset_BVP, feature_BVP)
X_CIC = _scale_with_training_stats(dataset_CIC, feature_CIC)
X_CSI = _scale_with_training_stats(dataset_CSI, feature_CSI)
X_ELISA = _scale_with_training_stats(dataset_ELISA, feature_ELISA)
X_HIC = _scale_with_training_stats(dataset_HIC, feature_HIC)
X_HEK = _scale_with_training_stats(dataset_HEK, feature_HEK)
X_PSR = _scale_with_training_stats(dataset_PSR, feature_PSR)
X_SGAC = _scale_with_training_stats(dataset_SGAC, feature_SGAC)
X_SMAC = _scale_with_training_stats(dataset_SMAC, feature_SMAC)
X_Tm = _scale_with_training_stats(dataset_Tm, feature_Tm)

In [73]:
# One fresh, independent empty list per assay to collect predictions in.
(ACSINS_transformed, AS, BVP, CIC_transformed, CSI_transformed, ELISA,
 HIC, HEK, PSR, SGAC_transformed, SMAC_transformed, Tm) = ([] for _ in range(12))

In [74]:
# Predict each assay for every row of the prediction table with one batched
# call per model. Compared with a row-by-row loop this
#   * does not use the DataFrame index label as a positional index into the
#     scaled arrays (only valid for a default 0..n-1 index),
#   * is idempotent: re-running the cell replaces the results instead of
#     appending a second copy to the lists,
#   * leaves the feature_* DataFrames untouched instead of rebinding them to
#     single rows.
# Each result is a 1-D array with one prediction per row of the X_* input.
ACSINS_transformed = SVR_ACSINS.predict(X_ACSINS)
AS = lr_AS.predict(X_AS)
BVP = KNN_BVP.predict(X_BVP)
CIC_transformed = KNN_CIC.predict(X_CIC)
CSI_transformed = SVR_CSI.predict(X_CSI)
ELISA = KNN_ELISA.predict(X_ELISA)
HIC = SVR_HIC.predict(X_HIC)
HEK = KNN_HEK.predict(X_HEK)
PSR = SVR_PSR.predict(X_PSR)
SGAC_transformed = SVR_SGAC.predict(X_SGAC)
SMAC_transformed = KNN_SMAC.predict(X_SMAC)
Tm = KNN_Tm.predict(X_Tm)



In [99]:
# Keep 'Name' as a one-column 2-D array (list selector, not a scalar label)
# so it can be stacked next to the prediction columns.
Name = dataset_test.loc[:, ['Name']].to_numpy()

In [100]:
# Columns of the result table, in the same order as the header text below.
result_columns = (
    Name,
    ACSINS_transformed,
    AS,
    BVP,
    CIC_transformed,
    CSI_transformed,
    ELISA,
    HIC,
    HEK,
    PSR,
    SGAC_transformed,
    SMAC_transformed,
    Tm,
)
data = np.column_stack(result_columns)

# Written as plain text: every cell is formatted with '%s', and comments=''
# stops numpy from prefixing the header line with '# '.
header_line = 'Name,ACSINS_transformed,AS,BVP,CIC_transformed,CSI_transformed,ELISA,HIC,HEK,PSR,SGAC_transformed,SMAC_transformed,Tm'
np.savetxt('Prediction_Result.csv', data, fmt='%s', delimiter=',', header=header_line, comments='')
