In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
# Import feature set for set8 ml model
s8_fset = pd.read_csv('./datasets/feature_data.csv')
msight_scores = pd.read_csv('./big_datasets/MS-full-proteome.csv')
ms_test = pd.read_csv('./datasets/MS-test-set.csv')
pplus = pd.read_csv('./datasets/methylated_sites_info.csv')

In [4]:
# Format s8 feature set 
# Identify site position from MOD_RSD
pplus['Site'] = pplus['MOD_RSD'].str[1:-3]
pplus['adj_Site'] = pd.to_numeric(pplus['Site'])
pplus['combo'] = pplus['ACC_ID'] + '_' + pplus['adj_Site'].astype(str)
pplus['ID'] = pplus['ACC_ID'] + '_K' + pplus['SITE_GRP_ID'].astype(str)

s8_fset['ID']= s8_fset['ID'].str[:-3]
s8_fset = s8_fset.set_index('ID')

s8_fset = s8_fset.drop(['Gs(U)_NO'], axis=1)

In [5]:
# Pull ms test set from s8 feature set
ms_test['ID'] = ms_test['uniprot_id'] + '_K' + ms_test['position'].astype(str)
ms = ms_test[['ID', 'ground_truth']]
ms = ms.set_index('ID')

In [6]:
# Merge to generate test set
holdout_set = pd.merge(ms, s8_fset, left_index=True, right_index=True, how='inner')
#holdout_set.to_csv('./holdout_set.csv')

In [7]:
s8_fset

Unnamed: 0_level_0,Mw_NO,HP_NO,IP_NO,ECI_NO,L1-9_NO,DHf_NO,Z1_NO,Z2_NO,Z3_NO,ISA_NO,...,158_maccs,159_maccs,160_maccs,161_maccs,162_maccs,163_maccs,164_maccs,165_maccs,166_maccs,METHYLATED
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P35579_K8,1738.0,-12.6,83.91,9.50,271.93,-6867.66,10.40,2.93,-3.25,1159.91,...,1,1,1,1,1,1,1,1,0,1.0
Q6ZMI0_K9,1644.0,-14.8,87.74,9.18,272.52,-6940.39,13.91,-6.87,-9.58,993.77,...,1,1,1,1,1,1,1,1,0,1.0
O60610_K42,1788.0,-10.7,104.86,8.19,263.35,-6794.58,3.03,2.31,-18.42,1439.53,...,1,1,1,1,1,1,1,1,0,1.0
Q8WY21_K700,1831.0,-30.2,121.04,10.67,270.00,-6631.72,24.16,6.09,-23.09,1166.05,...,1,1,1,1,1,1,1,1,0,1.0
P51957_K345,1607.0,-12.1,95.56,6.91,266.86,-6772.06,13.74,-3.47,-1.70,1168.01,...,1,1,1,1,1,0,1,1,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O60353_K532,1813.0,-14.1,100.45,8.37,261.23,-6650.34,8.83,5.03,-4.74,1333.46,...,1,1,1,1,1,1,1,1,0,0.0
P63215_K24,1784.0,-5.6,101.97,9.75,271.63,-6797.83,3.62,-1.56,-15.41,1264.19,...,1,1,1,1,0,0,1,0,0,0.0
Q13283_K5,1494.0,13.4,93.41,3.78,274.27,-6660.41,-11.76,-16.14,-4.04,1421.01,...,1,1,1,1,0,0,1,1,0,0.0
Q13283_K453,1523.0,-8.5,100.45,6.26,276.15,-6362.28,7.60,-22.23,-6.67,1136.08,...,1,1,1,1,1,1,1,1,0,0.0


In [8]:
# CREATE TRAINING AND TEST SETS FOR S8 ML MODEL
from sklearn.model_selection import train_test_split
# create x and y sets for set8 ml test data
s8_test_y = holdout_set['METHYLATED']
s8_test_x = holdout_set.drop(['METHYLATED'], axis=1)
# create x and y sets for set8 ml training data with test set removed
drop_rows = holdout_set.index
s8_fset.drop(drop_rows, inplace=True)
s8_train_y = s8_fset['METHYLATED']
s8_train_x = s8_fset.drop(['METHYLATED'], axis=1)
# split training sets in two for 1) training of set8 model then 2) training of combo model
s8_mod_train_x, s8_combo_train_x, s8_mod_train_y, s8_combo_train_y = train_test_split(
    s8_train_x, s8_train_y, test_size=0.5, random_state=8, stratify=s8_train_y)

In [9]:
# count training and test set sizes for 1) and 2)
from collections import Counter
print('Without Hold-Out Set:', Counter(s8_train_y))
print('S8 Model Training:', Counter(s8_mod_train_y))
print('Combo Model Training:', Counter(s8_combo_train_y))

Without Hold-Out Set: Counter({0.0: 4221, 1.0: 201})
S8 Model Training: Counter({0.0: 2111, 1.0: 100})
Combo Model Training: Counter({0.0: 2110, 1.0: 101})


In [10]:
# TEST S8 MODEL on S8 Model Training data
# Linear Discriminant Analysis produced the best results in testing - use it here
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from numpy import mean
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

discrim = LinearDiscriminantAnalysis(tol = 0.007070707070707071, store_covariance=False, solver='svd')
over = RandomOverSampler(sampling_strategy=0.35)
steps = [('sampling', over), ('model', discrim)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
scores = cross_val_score(pipeline, s8_mod_train_x, s8_mod_train_y, cv=cv, scoring="precision")
print("score:", mean(scores))

score: 0.08246700594237563


In [11]:
# TRAIN S8 MODEL
# fit model to training data
step2 = [('sampling', over)]
pip2 = Pipeline(steps=step2)
s8_X, s8_y = pip2.fit_resample(s8_mod_train_x, s8_mod_train_y)
model = discrim.fit(s8_X, s8_y)

In [12]:
# PREDICT SCORES FOR COMBO TRAINING SET USING S8 MODEL FOR INSERTION INTO COMBO MODEL
s8_scores = model.predict_proba(s8_combo_train_x)
combo_train_x = pd.DataFrame(s8_scores)
combo_train_x = combo_train_x.rename(columns={1:'s8_proba_methylated'})
combo_train_x = combo_train_x.drop([0], axis=1)
combo_train_x = combo_train_x.set_index(s8_combo_train_x.index)
combo_train_x['methylated'] = s8_combo_train_y

In [13]:
# ADD MS SCORES TO COMBO TRAINING SET
# formatting
ms_combo_train_x = pd.DataFrame(msight_scores['Score'])
msight_scores['ID'] = msight_scores['UniProtID'] + '_K' + msight_scores['Site'].astype(str)
ms_combo_train_x = ms_combo_train_x.set_index(msight_scores['ID'])
ms_combo_train_x = ms_combo_train_x.rename(columns={'Score':'ms_proba_methylated'})

# add to s8 data
X_combo_train = pd.merge(combo_train_x, ms_combo_train_x, left_index=True, right_index=True, how='inner')

# isolate methylated data for y combo train
y_combo_train = X_combo_train['methylated']
X_combo_train = X_combo_train.drop(['methylated'], axis=1)

In [14]:
# quick count of classes in combo train
print('Combination Model Training Set:', Counter(y_combo_train))

Combination Model Training Set: Counter({0.0: 2076, 1.0: 97})


In [15]:
holdout_set

Unnamed: 0_level_0,ground_truth,Mw_NO,HP_NO,IP_NO,ECI_NO,L1-9_NO,DHf_NO,Z1_NO,Z2_NO,Z3_NO,...,158_maccs,159_maccs,160_maccs,161_maccs,162_maccs,163_maccs,164_maccs,165_maccs,166_maccs,METHYLATED
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P55289_K642,1,1755.0,-14.4,103.83,9.87,265.91,-6791.30,11.31,-1.61,-19.81,...,1,1,1,1,0,0,1,0,0,0.0
Q9H0E3_K869,1,1636.0,-12.1,109.03,7.29,268.41,-6785.98,9.11,-3.83,-18.24,...,1,1,1,1,0,0,1,1,0,0.0
A2RTX5_K662,1,1687.0,-15.3,97.99,8.65,268.56,-6767.26,13.44,-2.65,-6.21,...,1,1,1,1,1,0,1,1,0,0.0
Q14683_K149,1,1715.0,-9.3,100.22,8.83,274.35,-6789.70,1.90,-0.13,-11.55,...,1,1,1,1,1,1,1,1,0,0.0
Q9UGJ0_K62,1,1566.0,-19.2,99.67,7.37,269.40,-6610.30,16.86,-8.50,0.25,...,1,1,1,1,1,1,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q2NKX8_K1156,1,1680.0,-11.2,90.18,10.10,276.36,-7040.12,6.44,-7.09,-6.05,...,1,1,1,1,1,1,1,1,0,0.0
O43293_K37,1,1680.0,-12.6,100.91,7.84,272.58,-6645.81,12.14,-7.63,-11.13,...,1,1,1,1,1,1,1,1,0,0.0
P10745_K1070,1,1832.0,-0.2,96.32,7.51,266.59,-6635.88,-8.18,0.33,-8.45,...,1,1,1,1,1,1,1,1,0,0.0
P11137-4_K412,1,1610.0,-15.8,95.31,9.24,276.16,-6743.65,22.67,-7.11,-1.57,...,1,1,1,1,1,0,1,1,0,0.0


In [16]:
# CREATE COMBO TEST SET
# y value is experimental methylation data
y_combo_test = holdout_set['METHYLATED']
# generate s8 scores for combo x data
X_s8_combo_test = holdout_set.drop(['ground_truth', 'METHYLATED'], axis=1)
s8_t_scores = model.predict_proba(X_s8_combo_test)
x_combo_test = pd.DataFrame(s8_t_scores)
x_combo_test = x_combo_test.rename(columns={1:'s8_proba_methylated'})
x_combo_test = x_combo_test.drop([0], axis=1)
x_combo_test = x_combo_test.set_index(X_s8_combo_test.index)
x_combo_test['methylated'] = y_combo_test

In [17]:
# pull out ms scores from ms data for combo test x data
X_combo_test = pd.merge(x_combo_test, ms_combo_train_x, left_index=True, right_index=True, how='inner')

# isolate methylated data for y combo train
y_combo_test = X_combo_test['methylated']
X_combo_test = X_combo_test.drop(['methylated'], axis=1)

In [28]:
# STACKING ~ LOGISTIC REGRESSION + SVMSMOTE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SVMSMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score

logit = LogisticRegression(max_iter=100)
smote = SVMSMOTE()
steps = [('sampling', smote), ('clf', logit)]
pipeline = Pipeline(steps=steps)
lmodel = pipeline.fit(X_combo_train, y_combo_train)

In [29]:
# CARRY ON WITH MEAN PROBABILITY VOTING + LOGISTIC REGRESSION WITH SVMSMOTE

In [30]:
# Import experimental featureset, as created by Feature_Set_Generation.ipynb 
exp = pd.read_csv('./big_datasets/surface_exposed_lysine_proteome.csv', index_col=0)

In [31]:
# Format feature set and exp set before run
# Drop Gs(U) from features since it's not in our example set
features = s8_fset
methylated = features['METHYLATED']
features = features.drop(['METHYLATED'], axis=1)

seqs = exp['sequence']
exp = exp.drop(['sequence'], axis=1)

# Reorder example columns to resemble training data
exp = exp[features.columns]

In [32]:
# Voting: Generate SET8ML Scoring 
# fit ML model to data
from collections import Counter
step2 = [('sampling', over)]
pip2 = Pipeline(steps=step2)
X, y = pip2.fit_resample(features, methylated)
model = discrim.fit(X, y)
print(Counter(y))
print(Counter(methylated))

Counter({0.0: 4221, 1.0: 1477})
Counter({0.0: 4221, 1.0: 201})


In [34]:
# format ms data for merging
msight_scores['ID'] = msight_scores['UniProtID'] + '_' + msight_scores['Site'].astype(str)
msight_scores.index = msight_scores['ID']
ms_exp = pd.DataFrame(msight_scores['Score'])

In [35]:
# predict with s8 model
s8_temp = model.predict_proba(exp)
s8_exp = pd.DataFrame(s8_temp)
s8_exp = s8_exp.rename(columns={1:'s8_proba_methylated'})
s8_exp = s8_exp.drop([0], axis=1)
s8_exp = s8_exp.set_index(s8_exp.index)
s8_exp.index = exp.index

In [36]:
# merge model scores
exp_comb = pd.merge(ms_exp, s8_exp, left_index=True, right_index=True, how='inner')

In [37]:
# Stacking: Generate SET8ML Scoring
# apply model to methylsight + SET8 ML scores
X = X_combo_train.append(X_combo_test)
y = y_combo_train.append(y_combo_test)
lmodel = pipeline.fit(X, y)

# make x experimental data resemble features
exp_comb = exp_comb[['s8_proba_methylated', 'Score']]
exp_comb = exp_comb.rename({'Score':'ms_proba_methylated'},axis=1)

  X = X_combo_train.append(X_combo_test)
  y = y_combo_train.append(y_combo_test)


In [38]:
exp_comb

Unnamed: 0,s8_proba_methylated,ms_proba_methylated
A0A075B759_118,0.698159,0.421
A0A075B759_125,0.016816,0.495
A0A075B759_15,0.037216,0.193
A0A075B759_151,0.509387,0.236
A0A075B759_154,0.920912,0.241
...,...,...
S4R3V1_185,0.641960,0.193
S4R3V1_193,0.264744,0.230
S4R3V1_204,0.014832,0.223
S4R3V1_87,0.262216,0.332


In [39]:
# get resulting probability scores and classification
stacked_classify = lmodel.predict(exp_comb)
stacked_proba = lmodel.predict_proba(exp_comb)

In [40]:
# format and export
stacked_out = pd.DataFrame(stacked_proba, columns=['proba_not_methylated', 'proba_methylated'])

In [41]:
stacked_out.index = exp_comb.index

In [42]:
stacked_out.loc[stacked_out['proba_methylated'] >= 0.82, 'pred_methylated'] = 1
stacked_out.loc[stacked_out['proba_methylated'] < 0.82, 'pred_methylated'] = 0

In [43]:
stacked_out = pd.merge(stacked_out, seqs, left_index=True, right_index=True, how='inner')
stacked_out = stacked_out.drop_duplicates()

In [44]:
stacked_out = stacked_out.reset_index()
stacked_out[['uniprot_id','position']] = stacked_out['index'].str.split('_', 1, expand=True)
stacked_out = stacked_out.drop(['index'], axis=1)

  stacked_out[['uniprot_id','position']] = stacked_out['index'].str.split('_', 1, expand=True)


In [45]:
stacked_out = stacked_out[['uniprot_id', 'position', 'sequence', 'proba_methylated', 'pred_methylated']]
stacked_out = stacked_out.sort_values(by=['proba_methylated'], ascending=False)
stacked_out = stacked_out.reset_index(drop=True)

In [46]:
stacked_out.to_csv('./ML-Hybrid_Ensemble_Results_SET8_v_Proteome.csv', sep='\t')