In [None]:
import pandas as pd
import numpy as np
import time
import warnings
import pickle
from sklearn.metrics import accuracy_score
from pathlib import Path
from sqlalchemy import create_engine
import pyarrow.feather as feather
import multiprocessing
# Report the CPU count and the current working directory; the relative file
# paths used by later cells are resolved against this directory.
print(multiprocessing.cpu_count())
print(Path.cwd())

## Get predictions for the remaining ~20k proteins

In [None]:
# One row per protein with its target development level (tdl), ordered by
# protein id. The same statement is run against both database releases.
protein_tdl_sql = """
  select 
  protein.id,
  protein.uniprot,
  protein.name,
  protein.sym,
  target.tdl
from
  protein
  join t2tc on protein.id=t2tc.protein_id
  join target on t2tc.target_id=target.id
order by
  protein.id
 """

# Open database connection: latest version
engine = create_engine("mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd6134pharos2")
dfprotein2020 = pd.read_sql_query(protein_tdl_sql, engine)

# 2018 version; `engine` is rebound, so after this cell it points at this database.
engine = create_engine("mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd540")
dfprotein2018 = pd.read_sql_query(protein_tdl_sql, engine)

In [None]:
# Proteins of the latest release whose UniProt accession also occurs in the 2018 release.
tdllist = dfprotein2020[dfprotein2020["uniprot"].isin(dfprotein2018["uniprot"])]

# Feature table; rows without a UniProt accession cannot be matched and are dropped.
data = pd.read_feather("v5data.feather").dropna(subset=['uniprot'])

# Replace each row's `tdl` with the value from `tdllist` when its accession is
# listed there, otherwise keep the existing value. A single indexed lookup is
# used instead of re-filtering `tdllist` for every row; `drop_duplicates`
# keeps the first occurrence, matching the previous `.values[0]` choice.
latest_tdl = tdllist.drop_duplicates(subset="uniprot").set_index("uniprot")["tdl"]
has_latest_tdl = data["uniprot"].isin(latest_tdl.index)
refreshed_tdl = data["uniprot"].map(latest_tdl).where(has_latest_tdl, data["tdl"])

# Dropping and re-adding leaves `tdl` as the last column, as the earlier
# drop/rename sequence did.
data = data.drop(columns=["tdl"]).assign(tdl=refreshed_tdl)
datadummy = data  # same object as `data`; name kept for any code that still refers to it

# these proteins are in the latest pharos version but not in the 2018 version, so need to develop features for these
only_in_dfprotein2020 = dfprotein2020[~dfprotein2020['uniprot'].isin(data['uniprot'])]

In [None]:
# Fetch the precomputed feature rows for the proteins listed in
# `only_in_dfprotein2020` (in the latest Pharos version but not in `data`).
file_path = 'v5data2020.feather'
output_file_path = 'only_in_dfprotein2020.feather'
list_of_strings = only_in_dfprotein2020['uniprot'].unique()

# The whole Feather file is loaded into memory first; the row filter is
# applied afterwards in pandas.
df = feather.read_feather(file_path)
selected_df = df.loc[df['uniprot'].isin(list_of_strings)]

# Persist the subset to its own Feather file, then load that file back as
# the DataFrame used by the following cells.
feather.write_feather(selected_df, output_file_path)
selected_rows = feather.read_feather(output_file_path)

In [None]:
# Columns that `data` has but the newly loaded rows lack; they are zero-filled below.
missing_cols = [col for col in data.columns if col not in selected_rows.columns]

# Align `selected_rows` to the exact column set and order of `data` in one
# step: absent columns are created filled with 0 (instead of being inserted
# one at a time), and columns that `data` does not have are dropped.
selected_rows = selected_rows.reindex(columns=data.columns, fill_value=0)

# Concatenate both DataFrames row-wise
combined_df = pd.concat([data, selected_rows], ignore_index=True)
testntrain = pd.read_csv("testandtrainlist.csv")
data = combined_df

# Separate identifier/label columns from the remaining (feature) columns;
# missing feature values are replaced with 0 before the two parts are rejoined.
id_cols = ["uniprot", "protein_id", "name", "sym", "tdl"]
contcols = data.drop(columns=id_cols).fillna(0)
df1 = pd.concat([data[id_cols].reset_index(drop=True), contcols], axis=1)

# Force the label of these five UniProt accessions to Tclin.
renameids = ["P02787", "O60840", "P13639", "Q16637", "Q9UM01"]
df1.loc[df1["uniprot"].isin(renameids), 'tdl'] = 'Tclin'

# Binary target: 1 when tdl is exactly "Tclin", 0 for every other value.
df1['y'] = (df1['tdl'] == "Tclin").astype(int)
# Candidate pool: rows whose accession is in the latest release and is not
# listed in `testntrain`.
in_latest_release = df1["uniprot"].isin(dfprotein2020["uniprot"])
listed_in_testntrain = df1["uniprot"].isin(testntrain["uniprot"])
unseen = df1.loc[in_latest_release & ~listed_in_testntrain]

# Tclin rows of the pool.
test19 = unseen.loc[unseen["tdl"] == "Tclin"]

# Non-Tclin rows of the pool, capped at the first 18418 rows.
alltest = unseen.loc[unseen["tdl"] != "Tclin"].head(18418)

## Predictions for all non-Tclin proteins

In [None]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
with open("xgb_best_model.pickle", 'rb') as model_file:  # handle is closed on exit
    loaded_model = pickle.load(model_file)

# Model inputs are all columns except the identifier/label ones.
X_alltest = alltest.drop(["uniprot", "protein_id", "name" ,"sym", "tdl", "y" ], axis=1)
y_alltest = alltest["y"]
allypred = loaded_model.predict(X_alltest)
allprobs = loaded_model.predict_proba(X_alltest)

# reset_index gives a 0..n-1 index so the positional prediction arrays line up
# with the rows. The previous index is kept as an `index` column and is
# therefore also written to the CSV.
dfpredall = alltest[["uniprot", "sym", "tdl", "y"]]
dfpredall = dfpredall.reset_index(level=0)
dfpredall['predictions'] = pd.Series(allypred)
allprobdf = pd.DataFrame(allprobs)
dfpredall = pd.concat([dfpredall, allprobdf], axis=1).sort_values("uniprot")
# Probability columns 0 and 1 are given readable names.
dfpredall.rename(columns = {'predictions':'predictions(1=tclin)', 0: "probability_nontclin",
                          1: "probability_tclin"}, inplace=True)
dfpredall.to_csv("xgb_allpreds.csv", index=False)

# Rows labelled 0 that were also predicted 0, out of all rows labelled 0.
print("Correct predictions for 18418 non-tclins = ", 
      len(dfpredall.loc[(dfpredall["y"] == 0) & 
                    (dfpredall["predictions(1=tclin)"] == 0)]), "/", 
      len(dfpredall.loc[dfpredall["y"] == 0]))
print("Accuracy = ", accuracy_score(y_alltest,allypred, normalize=True))

## Predictions for all Tclin proteins

In [None]:
# Labels and model inputs for `test19`; inputs are every column except the
# identifier/label ones.
y_test = test19["y"]
X_test = test19.drop(columns=["uniprot", "protein_id", "name", "sym", "tdl", "y"])

# Hard class predictions and per-class probabilities from the loaded model.
ypred = loaded_model.predict(X_test)
probs = loaded_model.predict_proba(X_test)

In [None]:
# Identifier/label columns of `test19` on a fresh 0..n-1 index, so that the
# positional prediction arrays align row by row. reset_index keeps the
# previous index as an `index` column, which also ends up in the CSV.
dfpred = test19[["uniprot", "sym", "tdl", "y"]].reset_index(level=0)
dfpred['predictions'] = pd.Series(ypred)

# Append the per-class probability columns (named 0 and 1) and sort by accession.
probdf = pd.DataFrame(probs)
dfpred = pd.concat([dfpred, probdf], axis=1).sort_values("uniprot")

# Give the prediction and probability columns readable names before export.
dfpred = dfpred.rename(columns={
    'predictions': 'predictions(1=tclin)',
    0: "probability_nontclin",
    1: "probability_tclin",
})
dfpred.to_csv("xgb_19preds.csv", index=False)

In [None]:
# Rows labelled 1, and the subset of them that was also predicted 1.
labelled_tclin = dfpred["y"] == 1
predicted_tclin = dfpred["predictions(1=tclin)"] == 1
n_correct = len(dfpred.loc[labelled_tclin & predicted_tclin])
n_labelled = len(dfpred.loc[labelled_tclin])

# NOTE(review): both counts are reduced by 1 before printing, while the
# accuracy on the next line is computed on all rows without that adjustment.
# The reason for the -1 is not visible in this notebook -- confirm.
print("Correct predictions for 19 tclins = ", n_correct-1, "/", n_labelled-1)
print("Accuracy = ", accuracy_score(y_test,ypred, normalize=True))