In [None]:
import pickle
import numpy as np
import pandas as pd
import warnings
import shap
from pathlib import Path
import multiprocessing
# Report the execution environment: available CPU cores and working directory.
print(multiprocessing.cpu_count())
print(Path.cwd())

# Load the model from disk.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
# SECURITY: pickle.load executes arbitrary code embedded in the file -- only
# unpickle model files that come from a trusted source.
with open("1svm_best_model.pickle", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model

In [None]:
# Read the feature table and discard every row that has no UniProt accession.
# The original row labels are kept (no index reset happens here).
data = pd.read_feather("v5data.feather").dropna(subset=['uniprot'])

In [None]:
# ---------------------------------------------------------------------------
# Assemble the modelling table
# ---------------------------------------------------------------------------
# Identifier / label columns; every remaining column is treated as a feature.
meta_cols = ["uniprot", "protein_id", "name", "sym", "tdl"]

# Feature columns with missing values replaced by 0.
# Both halves are re-indexed 0..n-1 before the column-wise concat: `data` keeps
# its original (possibly gapped) row labels after rows were dropped, and
# pd.concat(axis=1) aligns on index labels. Resetting only one side would pair
# identifiers with a different row's features and append all-NaN rows.
contcols = data.drop(columns=meta_cols).fillna(0).reset_index(drop=True)
df1 = pd.concat([data[meta_cols].reset_index(drop=True), contcols], axis=1)

# changing the 5 proteins to tclins
renameids = ["P02787", "O60840", "P13639", "Q16637", "Q9UM01"]
df1.loc[df1["uniprot"].isin(renameids), "tdl"] = "Tclin"

# Binary label: 1 when tdl is "Tclin", 0 otherwise (missing tdl also maps to 0).
df1["y"] = (df1["tdl"] == "Tclin").astype(int)

# ---------------------------------------------------------------------------
# Tclin reference list
# ---------------------------------------------------------------------------
tclin = pd.read_csv("Tclin_list.csv")
tclin = tclin[["UniProt", "Symbol", "NewTClin", "Quinquennal"]].rename(
    columns={"UniProt": "uniprot", "Symbol": "sym"}
)
tclinuniprotid = tclin["uniprot"]
# Only the accession and the NewTClin flag are needed for the merge below.
# `tclin` and `tlist` end up referring to the same two-column frame.
tclin = tclin.drop(columns=["sym", "Quinquennal"])
tlist = tclin  # First 709 are tclins

# ---------------------------------------------------------------------------
# Restrict to listed proteins and attach the NewTClin flag
# ---------------------------------------------------------------------------
ids = tlist["uniprot"]
df = df1.loc[df1["uniprot"].isin(ids)]
df = pd.merge(df, tlist, on="uniprot")
df = df.sort_values("uniprot", ascending=False)

# The five relabelled proteins are forced into the "new Tclin" group. They are
# split off, flagged, and appended back, so they end up at the bottom of `df`.
# .copy() makes df2 an independent frame, so the assignment is not a write into
# a slice of `df`.
df1 = df.loc[~df["uniprot"].isin(renameids)]
df2 = df.loc[df["uniprot"].isin(renameids)].copy()
df2["NewTClin"] = "Y"

df = pd.concat([df1, df2])
# Process-wide filter: silences every UserWarning from this point on,
# including in the cells that follow.
warnings.filterwarnings(action="ignore", category=UserWarning)

# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------
# Test set: proteins flagged NewTClin == "Y"; all are labelled as Tclin (y = 1).
test = df.loc[df["NewTClin"] == "Y"].copy()
train = df.loc[df["NewTClin"] != "Y"]
test["tdl"] = "Tclin"
test["y"] = 1

# Training set: only the positive (Tclin) rows among the remaining proteins.
train = train.loc[train["y"] == 1]

In [None]:
# Feature matrices: strip the identifier, label and split-flag columns from
# each split; whatever remains is used as model input.
X_train, X_test = (
    split.drop(columns=["uniprot", "protein_id", "name", "sym", "tdl", "y", "NewTClin"])
    for split in (train, test)
)
# Targets are the binary `y` column of each split.
y_train, y_test = train["y"], test["y"]

# Quick size report: feature count, then row counts per split.
print("Total Features columns = ", X_train.shape[1])
print("Training rows = ", X_train.shape[0])
print("Testing rows = ", X_test.shape[0])

In [None]:
# Build a SHAP explainer for the unpickled model, passing X_train as the second
# argument. Per the original note, loaded_model is the trained 1SVM model
# (presumably a one-class SVM -- confirm against the training notebook).
# NOTE(review): shap.Explainer selects its algorithm from the model object --
# confirm that the pickled estimator is accepted directly; if it is not, a
# callable such as its scoring function may have to be passed instead.
explainer = shap.Explainer(loaded_model, X_train)

# Calculate SHAP values for the training rows.
# NOTE(review): presumably a 2-D (rows x features) array; the feature ranking
# that follows averages over axis 0 and relies on that -- TODO confirm.
shap_values = explainer.shap_values(X_train)

# Summary plot of SHAP values, using X_train for the feature values/names.
shap.summary_plot(shap_values, X_train)

In [None]:
# Importance score per feature: mean of |SHAP| taken over axis 0.
mean_abs_shap_values = np.mean(np.abs(shap_values), axis=0)

# Sort ascending, flip to descending, then keep the first 50 positions
# (fewer if there are fewer than 50 features).
top_50_indices = np.argsort(mean_abs_shap_values)[::-1][:50]

# Map the positions back to the column labels of X_train.
top_50_features = X_train.columns[top_50_indices]

# Show the heading followed by the selected feature names.
print("Top 50 Features:", top_50_features, sep="\n")

In [None]:
# Wrap the selected feature names in a single-column table.
top_50_features_df = pd.DataFrame({'9_1TopFeatures': top_50_features})

# Persist the list as CSV, omitting the row index.
top_50_features_df.to_csv('top_50_features.csv', index=False)