In [None]:
import numpy as np
import pandas as pd
import warnings
import pickle
from sklearn import metrics 
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine
import pyarrow.feather as feather
from pathlib import Path
import multiprocessing
# Report the number of CPU cores and the working directory that the relative
# data/model paths used below are resolved against.
for env_value in (multiprocessing.cpu_count(), Path.cwd()):
    print(env_value)

## Get predictions for the remaining ~20k proteins

In [None]:
# Open database connections to two TCRD releases and pull the same
# protein listing (id, uniprot, name, sym, tdl) from each of them.
# The query text is defined once so both snapshots are guaranteed to be
# extracted with identical SQL.
protein_tdl_query = """
  select 
  protein.id,
  protein.uniprot,
  protein.name,
  protein.sym,
  target.tdl
from
  protein
  join t2tc on protein.id=t2tc.protein_id
  join target on t2tc.target_id=target.id
order by
  protein.id
 """

engine = create_engine("mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd6134pharos2") # latest version
dfprotein2020 = pd.read_sql_query(protein_tdl_query, engine)
# Release the pooled connections before `engine` is rebound to another database.
engine.dispose()

engine = create_engine("mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd540") # 2018 version
dfprotein2018 = pd.read_sql_query(protein_tdl_query, engine)
# dispose() only closes the pool; the engine object itself remains usable.
engine.dispose()

In [None]:
# Proteins whose uniprot id appears in both database snapshots.
tdllist = dfprotein2020[(dfprotein2020["uniprot"].isin(dfprotein2018["uniprot"]))]
data = pd.read_feather("v5data.feather") # read in data for predictions
data = data.dropna(subset=['uniprot'])

# Overwrite each row's 'tdl' with the value from the latest snapshot when the
# protein is listed in `tdllist`; otherwise keep the value already stored.
# A single vectorised lookup replaces the former per-row scan of `tdllist`.
# drop_duplicates keeps the first row per uniprot, i.e. the same row the
# per-row `.values[0]` lookup used to pick.
latest_tdl = tdllist.drop_duplicates(subset="uniprot").set_index("uniprot")["tdl"]
in_latest = data["uniprot"].isin(latest_tdl.index)
refreshed_tdl = data["uniprot"].map(latest_tdl).where(in_latest, data["tdl"])
# Drop-then-assign leaves 'tdl' as the last column, matching the previous
# drop/rename sequence.
data = data.drop(columns=["tdl"]).assign(tdl=refreshed_tdl)
datadummy = data  # kept as an alias of `data`, as before

# Proteins of the latest snapshot that are absent from the feature table loaded
# above (presumably built from the 2018 release - confirm); features for these
# still have to be supplied.
only_in_dfprotein2020 = dfprotein2020[~(dfprotein2020['uniprot'].isin(data['uniprot']))]

In [None]:
# Fetch the pre-computed features of the proteins listed in
# `only_in_dfprotein2020` and keep a copy of that subset on disk.
file_path = 'v5data2020.feather'  
output_file_path = 'only_in_dfprotein2020.feather'  # Feather file receiving the selected rows
list_of_strings = only_in_dfprotein2020['uniprot'].unique()
# Note: the whole feature file is loaded into memory; filtering happens afterwards.
df = feather.read_feather(file_path)
keep_row = df['uniprot'].isin(list_of_strings)
selected_df = df.loc[keep_row]
# Persist the subset, then read it back as the frame used downstream.
feather.write_feather(selected_df, output_file_path)
selected_rows = feather.read_feather(output_file_path)

In [None]:
# Columns present in `data` but absent from the newly selected proteins.
missing_cols = list(set(data.columns) - set(selected_rows.columns))
# reindex adds those columns filled with 0 and, in the same step, puts the
# columns in exactly the order used by `data`.
selected_rows = selected_rows.reindex(columns=data.columns, fill_value=0)
# Stack both sets of proteins into one table with a fresh 0..n-1 index.
combined_df = pd.concat([data, selected_rows], ignore_index=True)
testntrain = pd.read_csv("testandtrainlist.csv")
data = combined_df

# Identifier columns first, then every remaining column with NaN replaced by 0.
meta_cols = ["uniprot", "protein_id",  "name", "sym", "tdl"]
contcols = data.drop(columns=meta_cols).fillna(0)
df1 = pd.concat([data[meta_cols].reset_index(drop=True), contcols], axis=1)
# Binary label: 1 for Tclin, 0 for everything else.
df1['y'] = np.where(df1['tdl'] == "Tclin", 1, 0)

# Evaluation pool: proteins of the latest snapshot that are not listed in the
# train/test file.
in_latest_snapshot = df1["uniprot"].isin(dfprotein2020["uniprot"])
in_train_or_test = df1["uniprot"].isin(testntrain["uniprot"])
all_actual_test = df1.loc[in_latest_snapshot & ~in_train_or_test]

# Split the pool by label.
alltest = all_actual_test.loc[all_actual_test["tdl"] != "Tclin"]
tclintest = all_actual_test.loc[all_actual_test["tdl"] == "Tclin"]

In [None]:
# load the model from disk
# NOTE(security): unpickling executes code embedded in the file - only load
# model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open("1svm_best_model.pickle", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Feature matrix = every column except the identifiers and the label.
non_feature_cols = ["uniprot", "protein_id",  "sym", "tdl", "y", "name" ]
X_alltest = alltest.drop(columns=non_feature_cols)
y_alltest = alltest["y"]

In [None]:
# Predict on the non-Tclin evaluation set. Every row here is non-Tclin, so a
# prediction of 1 is counted as wrong and a prediction of -1 as correct.
testpredictions = loaded_model.predict(X_alltest)
scores = loaded_model.score_samples(X_alltest)
ypred = testpredictions
print("Predictions are : ", ypred)
n_total_test = len(X_alltest)
n_error_test = testpredictions[testpredictions == 1].size
n_correct_test = testpredictions[testpredictions == -1].size
print("Test error = ", n_error_test/n_total_test)
# The denominator is taken from the data instead of a hard-coded count
# (19708 in the author's run), so the report stays correct if inputs change.
# Author's note: 5 proteins are in the set of 24 for 2020 pharos version
print("Wrong predictions = ", n_error_test, f"/{n_total_test}")
print("Correct predictions = ", n_correct_test, f"/{n_total_test}")

In [None]:
# Recode the 0 labels as -1 so they use the same -1/1 convention as the
# model predictions before comparing them.
y_alltest = y_alltest.replace({0: -1})
accuracy = accuracy_score(y_alltest, ypred, normalize=True)
print("Accuracy = ", accuracy)

In [None]:
# Assemble the per-protein prediction table for the non-Tclin set.
# reset_index moves the original row labels into an "index" column and gives
# a 0..n-1 index that lines up with the positional model outputs.
dfpred = alltest[["uniprot", "sym", "tdl"]].reset_index(level=0)
dfpred['predictions'] = pd.Series(ypred)

scores = pd.DataFrame(scores).rename(columns={0: 'scores'})

probs_svc = loaded_model.decision_function(X_alltest)
dfpred['decision_probs'] = pd.Series(probs_svc)
# Min-max rescaling of the decision values to the [0, 1] range.
decision_span = probs_svc.max() - probs_svc.min()
probs_svm = pd.Series(probs_svc - probs_svc.min()) / decision_span

# The unnamed rescaled series arrives as column 0 and is renamed afterwards.
dfpred = (
    pd.concat([dfpred, scores, probs_svm], axis=1)
    .sort_values("uniprot")
    .rename(columns={'predictions': 'predictions(1=tclin)', 0: "Derivedprobability"})
)

# Report predictions as 1/0 instead of 1/-1.
dfpred['predictions(1=tclin)'] = dfpred['predictions(1=tclin)'].replace([-1], 0)
dfpred.to_csv("svm_allpreds.csv", index=False)
def assign_value(row):
    """Return 1 when the row's 'tdl' entry equals 'Tclin', otherwise 0."""
    return 1 if row['tdl'] == 'Tclin' else 0
# Add the 1/0 Tclin label per row (computed after the CSV above was written).
dfpred['y'] = dfpred.apply(assign_value, axis=1)

In [None]:
# Predict on the Tclin evaluation set. Every row here is labelled Tclin, so a
# prediction of -1 is counted as wrong and a prediction of 1 as correct.
X_tclintest = tclintest.drop(["uniprot", "protein_id",  "sym", "tdl", "y", "name" ], axis=1)
y_tclintest = tclintest["y"]
testpredictions = loaded_model.predict(X_tclintest)
scores = loaded_model.score_samples(X_tclintest)
ypred = testpredictions
print("Predictions are : ", ypred)
n_total_test = len(X_tclintest)
n_error_test = testpredictions[testpredictions == -1].size
n_correct_test = testpredictions[testpredictions == 1].size
print("Test error = ", n_error_test/n_total_test)
# The denominator is taken from the data instead of a hard-coded count
# (24 in the author's run), so the report stays correct if inputs change.
# Author's note: 5 proteins from here are non-tclins
print("Wrong predictions = ", n_error_test, f"/{n_total_test}")
print("Correct predictions = ", n_correct_test, f"/{n_total_test}")

In [None]:
# Recode the 0 labels as -1, the same -1/1 convention the predictions use.
y_tclintest = y_tclintest.replace({0: -1})

# Assemble the per-protein prediction table for the Tclin set.
# reset_index moves the original row labels into an "index" column and gives
# a 0..n-1 index that lines up with the positional model outputs.
dfpred = tclintest[["uniprot", "sym", "tdl"]].reset_index(level=0)
dfpred['predictions'] = pd.Series(ypred)

scores = pd.DataFrame(scores).rename(columns={0: 'scores'})

probs_svc = loaded_model.decision_function(X_tclintest)
dfpred['decision_probs'] = pd.Series(probs_svc)
# Min-max rescaling of the decision values to the [0, 1] range.
decision_span = probs_svc.max() - probs_svc.min()
probs_svm = pd.Series(probs_svc - probs_svc.min()) / decision_span

# The unnamed rescaled series arrives as column 0 and is renamed afterwards.
dfpred = (
    pd.concat([dfpred, scores, probs_svm], axis=1)
    .sort_values("uniprot")
    .rename(columns={'predictions': 'predictions(1=tclin)', 0: "Derivedprobability"})
)

# Report predictions as 1/0 instead of 1/-1.
dfpred['predictions(1=tclin)'] = dfpred['predictions(1=tclin)'].replace([-1], 0)
dfpred.to_csv("svm_24tclins.csv", index=False)