In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time
import warnings
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score 
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from pathlib import Path
import multiprocessing
# Report the CPU count (useful when choosing n_jobs for the search below)
# and the working directory that the relative data paths resolve against.
print(multiprocessing.cpu_count(), Path.cwd(), sep="\n")

In [None]:
# Build the train / test tables.
#   * train: proteins from the first 611 rows of nontclin1290.csv plus the
#            Tclin_list.csv entries whose NewTClin flag is not "Y" (Tchem removed)
#   * test : Tclin_list.csv entries flagged NewTClin == "Y" plus the five
#            manually re-labelled proteins in `renameids`
META_COLS = ["uniprot", "protein_id", "name", "sym", "tdl"]
N_NONTCLIN = 611  # Using 611 non-tclins from the 1SVM models

data = pd.read_feather("v5data.feather")
# Drop rows without a UniProt id AND renumber the index. Metadata and feature
# columns are re-joined below with an index-aligned concat, so both halves
# must share the same index; resetting only one half would pair features with
# the wrong protein whenever a row was dropped.
data = data.dropna(subset=["uniprot"]).reset_index(drop=True)
contcols = data.drop(columns=META_COLS).fillna(0)
df1 = pd.concat([data[META_COLS], contcols], axis=1)

# changing the 5 proteins to tclins
renameids = ["P02787", "O60840", "P13639", "Q16637", "Q9UM01"]  # these 5 proteins are marked nontclins in latest Pharos
df1.loc[df1["uniprot"].isin(renameids), "tdl"] = "Tclin"

# Binary target: 1 for Tclin, 0 for everything else.
df1["y"] = np.where(df1["tdl"] == "Tclin", 1, 0)

dummy = pd.read_csv("nontclin1290.csv")  # these proteins are in a random order
nontclin = dummy.head(N_NONTCLIN).drop_duplicates()
# Remaining rows of the file; not used in the visible cells, kept in case
# later cells reference it.
nontclin_bottom = dummy.tail(dummy.shape[0] - N_NONTCLIN)

# Non-Tclin proteins get placeholder values for the two Tclin-list columns so
# they can be stacked with the Tclin list.
tdark = (
    nontclin[["sym", "uniprot"]]
    .drop_duplicates()
    .assign(NewTClin="blank", Quinquennal="blank")
)
tdarkuniprotid = tdark["uniprot"]

tclin = pd.read_csv("Tclin_list.csv")
tclin = tclin[["UniProt", "Symbol", "NewTClin", "Quinquennal"]].rename(
    columns={"UniProt": "uniprot", "Symbol": "sym"}
)
tclinuniprotid = tclin["uniprot"]

# Lookup table uniprot -> NewTClin flag for every protein used in the study.
tlist = pd.concat([tdark, tclin]).drop(columns=["sym", "Quinquennal"])
ids = tlist["uniprot"]

# NOTE(review): if a uniprot id appears in both input lists, this merge will
# duplicate its row - confirm the two lists are disjoint.
df = df1.loc[df1["uniprot"].isin(ids)]
df = pd.merge(df, tlist, on="uniprot")
df = df.sort_values("uniprot", ascending=False)
warnings.filterwarnings(action="ignore", category=UserWarning)

# Force the five re-labelled proteins into the held-out set. `.copy()` makes
# the assignment act on an independent frame instead of a slice of `df`.
df1 = df.loc[~df["uniprot"].isin(renameids)]
df2 = df.loc[df["uniprot"].isin(renameids)].copy()
df2["NewTClin"] = "Y"
df = pd.concat([df1, df2])

test = df.loc[df["NewTClin"] == "Y"].copy()
train = df.loc[df["NewTClin"] != "Y"]

# Every held-out protein is treated as Tclin, i.e. the positive class.
test["tdl"] = "Tclin"
test["y"] = 1

# Tchem proteins are excluded from training.
train = train.loc[train["tdl"] != "Tchem"]

In [None]:
# Identifier, annotation and label columns that are not model features.
non_feature_cols = ["uniprot", "protein_id", "name", "sym", "tdl", "y", "NewTClin"]

# Feature matrices and binary targets for the training and held-out sets.
X_train, y_train = train.drop(columns=non_feature_cols), train["y"]
X_test, y_test = test.drop(columns=non_feature_cols), test["y"]

print("Total Features columns = ", X_train.shape[1])
print("Training rows = ", X_train.shape[0])
print("Testing rows = ", X_test.shape[0])

In [None]:
# Randomized hyper-parameter search for an XGBoost classifier.
# I parameter settings are sampled and each is scored with 2-fold CV; the
# setting with the best mean accuracy is refit on the full training set.
I = 1000
warnings.filterwarnings(action="ignore", category=UserWarning)

# Candidate values for each hyper-parameter.
params = dict(
    max_depth=np.arange(3, 20),
    learning_rate=np.arange(0.01, 0.5, 0.01),
    subsample=np.arange(0.5, 1.0, 0.1),
    colsample_bytree=np.arange(0.4, 1.0, 0.1),
    colsample_bylevel=np.arange(0.4, 1.0, 0.1),
    gamma=np.arange(0, 10, 0.05),
    reg_alpha=np.arange(80),
    reg_lambda=np.arange(0, 1, 0.1),
    n_estimators=[100, 500, 1000],
)

# Both metrics are recorded in cv_results_; 'Accuracy' selects the model (see refit).
scoring = dict(Accuracy="accuracy", f1="f1")

start = time.time()
xgbc = xgb.XGBClassifier(seed=40)
kfold = KFold(n_splits=2, shuffle=True, random_state=40)
clf = RandomizedSearchCV(
    estimator=xgbc,
    param_distributions=params,
    scoring=scoring,
    n_iter=I,
    verbose=2,
    n_jobs=1,  # change n_jobs to use more cores
    refit="Accuracy",
    return_train_score=False,
    random_state=40,
    cv=kfold,
)
clf.fit(X_train, y_train)
end = time.time()

# Wall-clock duration of the search, in seconds.
print(end - start)
allresults = clf.cv_results_

In [None]:
# Score the held-out proteins with the refit best model.
probs = clf.predict_proba(X_test)
ypred = clf.predict(X_test)
print("Best parameters:", clf.best_params_)
print("Predictions = ", ypred)

# Count predictions that actually match the labels. Summing ypred only counts
# predicted positives, and the test-set size is taken from the data rather
# than hard-coded.
n_correct = int((ypred == y_test.to_numpy()).sum())
print("Correct predictions = ", n_correct, "/" + str(len(y_test)))
print("Accuracy = ", accuracy_score(y_test, ypred, normalize=True))
print("F1 score = ", f1_score(y_test, ypred))

## Get predictions and the probabilities for the test set

In [None]:
# Assemble one row per held-out protein: identifiers, predicted class and the
# two class probabilities, then write the table to CSV.
# reset_index(level=0) renumbers the rows 0..n-1 so the positional prediction
# arrays line up; the previous row index is carried along as an extra column.
dfpred = test[["uniprot", "sym", "tdl", "NewTClin"]].reset_index(level=0)
dfpred["predictions"] = pd.Series(ypred)

# Column 0 / 1 of predict_proba output, renamed below.
probdf = pd.DataFrame(probs)

dfpred = (
    pd.concat([dfpred, probdf], axis=1)
    .sort_values("uniprot")
    .rename(
        columns={
            "predictions": "predictions(1=tclin)",
            0: "probablity_nontclin",
            1: "probability_tclin",
        }
    )
)
dfpred.to_csv("xgb_74preds.csv", index=False)

In [None]:
# Best XGB model
best_model = clf.best_estimator_
# Save model. The context manager guarantees the file is flushed and closed
# even if pickling fails.
with open("xgb_best_model.pickle", "wb") as model_file:
    pickle.dump(best_model, model_file)

## GET THE FEATURE IMPORTANCE PLOTS

In [None]:
def _booster_scores(scores, column):
    """Turn a {feature name: score} mapping into a two-column frame.

    Parameters
    ----------
    scores : dict
        Mapping as returned by Booster.get_fscore() / Booster.get_score().
    column : str
        Name of the column that will hold the score values.

    Returns
    -------
    pandas.DataFrame
        Columns 'features' and `column`, one row per key in `scores`.
    """
    frame = pd.DataFrame()
    frame[column] = pd.Series(scores)
    frame = frame.reset_index(level=0)
    return frame.rename(columns={"index": "features"})


# The fitted search object is `clf`; the name `prd` used here previously is
# not defined anywhere in the notebook and raised NameError.
best_estimator = clf.best_estimator_
booster = best_estimator.get_booster()

# Base table: one row per training feature with the estimator's importance
# score, highest first.
imp = pd.DataFrame()
imp["features"] = pd.Series(X_train.columns.values)
imp["importancescore"] = pd.Series(best_estimator.feature_importances_)
imp = imp.sort_values("importancescore", ascending=False)

# Additional per-feature scores taken from the underlying booster.
df44 = _booster_scores(booster.get_fscore(), "importancefscore")
gain = _booster_scores(booster.get_score(importance_type="gain"), "gain")
cover = _booster_scores(booster.get_score(importance_type="cover"), "cover")

# Left joins keep every feature; features missing from a booster score table
# come back as NaN and are set to 0 below.
for extra in (df44, gain, cover):
    imp = pd.merge(imp, extra, on="features", how="left")

score_cols = ["importancescore", "importancefscore", "gain", "cover"]
imp[score_cols] = imp[score_cols].fillna(0)
imp.to_csv("feat_imp.csv", index=False)