In [None]:
import numpy as np
import pandas as pd
import warnings
import time
import pickle
from sklearn.svm import OneClassSVM
from sklearn import metrics 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from pathlib import Path
import multiprocessing
# Environment report: number of CPU cores visible to Python and the directory
# that relative paths (the CSV inputs and the output files) resolve against.
available_cores = multiprocessing.cpu_count()
print(available_cores)
working_dir = Path.cwd()
print(working_dir)

In [None]:
# Load the protein feature table and attach a binary Tclin label `y`.
#
# Rows without a UniProt accession are dropped. The index is renumbered right
# after the drop so that every frame derived from `data` shares the same
# 0..n-1 index; otherwise the column-wise concat below would align rows on
# mismatched labels whenever a row had been removed.
data = (
    pd.read_csv("v5data.csv")  # read in the appropriate dataset
    .dropna(subset=['uniprot'])
    .reset_index(drop=True)
)

idcols = ["uniprot", "protein_id", "name", "sym", "tdl"]
# Everything that is not an identifier column is treated as a feature;
# missing feature values are replaced with 0.
contcols = data.drop(columns=idcols).fillna(0)
df1 = pd.concat([data[idcols], contcols], axis=1)

# changing the 5 proteins to tclins
renameids = ["P02787", "O60840", "P13639", "Q16637", "Q9UM01"] # these 5 proteins are marked nontclins on latest Pharos
df1.loc[df1["uniprot"].isin(renameids), 'tdl'] = 'Tclin'

# y = 1 for Tclin proteins, 0 for every other target development level.
df1['y'] = (df1['tdl'] == "Tclin").astype(int)

In [None]:
# Build the train/test split from the curated Tclin list.
#
# Proteins flagged NewTClin == "Y" (plus the five manually relabelled
# accessions in `renameids`) form the test set; the remaining proteins with
# y == 1 form the one-class training set.
warnings.filterwarnings(action='ignore', category=UserWarning)

tclin = pd.read_csv("Tclin_list.csv") # Tclins to train on
tclin = tclin[['UniProt', 'Symbol', 'NewTClin', 'Quinquennal']].rename(
    columns={'UniProt': 'uniprot', 'Symbol': 'sym'}
)
tclinuniprotid = tclin["uniprot"]
# Independent frame holding only the merge key and the split flag, so `tclin`
# itself is not stripped of its columns as a side effect.
tlist = tclin.drop(columns=["sym", "Quinquennal"])

ids = tlist["uniprot"]
df = df1.loc[df1['uniprot'].isin(ids)]
df = pd.merge(df, tlist, on='uniprot')
df = df.sort_values("uniprot", ascending=False)

# Force the relabelled proteins into the test set. They are appended after the
# other rows (same row order as a split-and-concat), and `df1` is left
# untouched so this cell can be re-run without breaking the merge above.
is_renamed = df['uniprot'].isin(renameids)
df_renamed = df.loc[is_renamed].assign(NewTClin="Y")
df = pd.concat([df.loc[~is_renamed], df_renamed])

is_new_tclin = df['NewTClin'] == "Y"
# Every held-out protein is labelled Tclin, hence y = 1 for all test rows.
# `.assign` builds a new frame instead of writing into a slice of `df`.
test = df.loc[is_new_tclin].assign(tdl='Tclin', y=1)

train = df.loc[~is_new_tclin]
train = train.loc[train["y"] == 1] # the 611 rows (proteins) for training

In [None]:
# Split train/test into feature matrices and label vectors. Identifier and
# label columns are removed so only the feature columns reach the model.
non_feature_cols = ["uniprot", "protein_id", "name", "sym", "tdl", "y", "NewTClin"]

X_train = train.drop(columns=non_feature_cols)
y_train = train["y"]
X_test = test.drop(columns=non_feature_cols)
y_test = test["y"]

print("Total Features columns = ", X_train.shape[1])
print("Training rows = ", X_train.shape[0])
print("Testing rows = ", X_test.shape[0])

In [None]:
# 1svm model training
#
# Randomized search over `nu` for a linear one-class SVM, scored with 5-fold
# cross-validation on the training rows and refit on the best accuracy.
I = 1000  # upper bound on the number of sampled parameter settings
# UserWarnings (which include scikit-learn's undefined-metric warnings) are
# silenced to keep the verbose search log readable.
warnings.filterwarnings(action='ignore', category=UserWarning)
param_grid = [
    # `gamma` is not searched: it only parameterises the rbf/poly/sigmoid
    # kernels, so with a linear kernel every gamma value yields the same model.
    {'kernel': ['linear'],
     'nu': [0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1]},
]
# Number of distinct settings in the grid; n_iter is capped at this value so
# the search never requests more candidates than exist.
n_candidates = sum(
    int(np.prod([len(choices) for choices in grid.values()])) for grid in param_grid
)
scoring = {'Accuracy': 'accuracy', 'f1': 'f1'}
svm1 = OneClassSVM()
kfold = KFold(n_splits=5, shuffle=True, random_state=1998)
clf = RandomizedSearchCV(estimator=svm1,
                         cv=kfold,
                         param_distributions=param_grid, scoring=scoring,
                         n_iter=min(I, n_candidates),
                         verbose=2,
                         n_jobs=1, refit='Accuracy', return_train_score=False, random_state=1998)
# Fit with all training data. The labels must be passed: OneClassSVM ignores
# them while fitting, but the accuracy/f1 scorers need y_true on each
# validation fold. Without them every CV score is NaN and the selected
# parameters are arbitrary.
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print("Running time", end - start)

In [None]:
# Predict the held-out proteins with the refit best estimator.
# OneClassSVM.predict returns +1 for inliers and -1 for outliers; every test
# row is labelled Tclin (y == 1), so each -1 is counted as a wrong prediction.
bestparams = clf.best_params_
testpredictions = clf.predict(X_test)
scores = clf.score_samples(X_test)
ypred = testpredictions
print("Best params are : ", bestparams)
print("Predictions are : ", ypred)

# Denominators come from the actual test size rather than a hard-coded count,
# so the report stays correct if the test set changes.
n_test = len(X_test)
n_error_test = testpredictions[testpredictions == -1].size
n_correct_test = testpredictions[testpredictions == 1].size
print("Test error = ", n_error_test/n_test)
print("Wrong predictions = ", n_error_test, f"/{n_test}")
print("Correct predictions = ", n_correct_test, f"/{n_test}")

In [None]:
# Accuracy and F1 of the test predictions against the test labels.
test_accuracy = accuracy_score(y_test, ypred, normalize=True)
print("Accuracy = ", test_accuracy)
test_f1 = f1_score(y_test, ypred)
print("F1 score = ", test_f1)

## Get predictions and the probabilities for the test set

In [None]:
# Assemble one row per test protein: identifiers, predicted label, raw
# score_samples value and a rescaled decision score, then write it to CSV.
dfpred = test[["uniprot", "sym", "tdl", "NewTClin"]].reset_index(level=0)
dfpred['predictions'] = pd.Series(ypred)

scores = pd.DataFrame(scores).rename(columns={0: 'scores'})

# "Derivedprobability" is the decision function min-max scaled to [0, 1]
# over the test set.
probs_svc = clf.decision_function(X_test)
lowest = probs_svc.min()
highest = probs_svc.max()
probs_svm = pd.Series(probs_svc - lowest) / (highest - lowest)
#probs_svm

# The unnamed Series lands in column 0 after the concat; it is renamed below.
dfpred = (
    pd.concat([dfpred, scores, probs_svm], axis=1)
    .sort_values("uniprot")
    .rename(columns={'predictions': 'predictions(1=tclin)', 0: "Derivedprobability"})
)
# Recode the outlier label -1 as 0 so the column reads 1 = tclin, 0 = not.
dfpred['predictions(1=tclin)'] = dfpred['predictions(1=tclin)'].replace(-1, 0)
dfpred.to_csv("svm_74preds.csv", index=False)
dfpred

In [None]:
# Best one-class SVM found by the randomized search (clf.best_estimator_).
best_model = clf.best_estimator_
# Save model. The context manager closes the file handle even if pickling
# fails, instead of leaving it open until garbage collection.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open("1svm_best_model.pickle", "wb") as model_file:
    pickle.dump(best_model, model_file)