In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [None]:
# Import feature set for set8 ml model
# Loads the four input tables used by the rest of the notebook; all paths are
# relative to the notebook's working directory.
# Feature table: later cells read its 'ID', 'METHYLATED' and 'Gs(U)_NO' columns.
s8_fset = pd.read_csv('./datasets/feature_data.csv')
# Score table: later cells read its 'UniProtID', 'Site' and 'Score' columns.
msight_scores = pd.read_csv('./datasets/MS-full-proteome.csv')
# Test-set table: later cells read 'uniprot_id', 'position' and 'ground_truth'.
ms_test = pd.read_csv('./datasets/MS-test-set.csv')
# Site table: later cells read 'MOD_RSD', 'ACC_ID' and 'SITE_GRP_ID'.
pplus = pd.read_csv('./datasets/methylated_sites_info.csv')

In [None]:
# Format s8 feature set
# NOTE: this cell is not re-runnable on its own -- it consumes the 'ID' and
# 'Gs(U)_NO' columns of s8_fset, so reload the CSVs before running it again.

# Site position = MOD_RSD with its first character and last three characters removed
pplus['Site'] = pplus['MOD_RSD'].str.slice(1, -3)
pplus['adj_Site'] = pd.to_numeric(pplus['Site'])
site_as_text = pplus['adj_Site'].astype(str)
pplus['combo'] = pplus['ACC_ID'] + '_' + site_as_text
# NOTE(review): this key is built from SITE_GRP_ID, whereas every other '_K' key
# in the notebook is built from a site position -- confirm this is intended.
pplus['ID'] = pplus['ACC_ID'] + '_K' + pplus['SITE_GRP_ID'].astype(str)

# Trim the last three characters of each feature-set ID, index by it, and
# remove the Gs(U)_NO feature column.
s8_fset['ID'] = s8_fset['ID'].str.slice(stop=-3)
s8_fset = s8_fset.set_index('ID').drop(columns=['Gs(U)_NO'])

In [None]:
# Pull ms test set from s8 feature set
# Key each test-set row as '<uniprot_id>_K<position>' so it can be joined to the
# feature set by index, then keep only the ground-truth label.
ms_test['ID'] = ms_test['uniprot_id'] + '_K' + ms_test['position'].astype(str)
ms = ms_test.set_index('ID')[['ground_truth']]

In [None]:
# Merge to generate test set
# Inner join on the ID index: only sites present in both the MS test set and
# the feature set are kept; the result carries 'ground_truth' plus every
# feature-set column.
holdout_set = ms.merge(s8_fset, left_index=True, right_index=True, how='inner')
#holdout_set.to_csv('./holdout_set.csv')

In [None]:
# Inspect the formatted feature set (indexed by ID, 'Gs(U)_NO' removed).
s8_fset

In [None]:
# CREATE TRAINING AND TEST SETS FOR S8 ML MODEL
from sklearn.model_selection import train_test_split
# create x and y sets for set8 ml test data
# 'ground_truth' is a label column merged in from the MS test set, not a model
# feature, so it is removed from the test features together with 'METHYLATED'.
s8_test_y = holdout_set['METHYLATED']
s8_test_x = holdout_set.drop(columns=['METHYLATED', 'ground_truth'])
# create x and y sets for set8 ml training data with test set removed
# s8_fset is deliberately rebound without the hold-out rows: later cells reuse
# it as the training pool. errors='ignore' keeps this cell re-runnable after
# the hold-out rows have already been removed.
drop_rows = holdout_set.index
s8_fset = s8_fset.drop(index=drop_rows, errors='ignore')
s8_train_y = s8_fset['METHYLATED']
s8_train_x = s8_fset.drop(columns=['METHYLATED'])
# split training sets in two for 1) training of set8 model then 2) training of combo model
# (50/50 split, stratified on the label, fixed seed for reproducibility)
s8_mod_train_x, s8_combo_train_x, s8_mod_train_y, s8_combo_train_y = train_test_split(
    s8_train_x, s8_train_y, test_size=0.5, random_state=8, stratify=s8_train_y)

In [None]:
# count training and test set sizes for 1) and 2)
from collections import Counter
# Print the class counts of the full training pool and of each half of the split.
for caption, labels in (
    ('Without Hold-Out Set:', s8_train_y),
    ('S8 Model Training:', s8_mod_train_y),
    ('Combo Model Training:', s8_combo_train_y),
):
    print(caption, Counter(labels))

In [None]:
# TEST S8 MODEL on S8 Model Training data
# Linear Discriminant Analysis produced the best results in testing - use it here
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from numpy import mean
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

# NOTE(review): the tol value looks like the output of a hyper-parameter search -- confirm.
discrim = LinearDiscriminantAnalysis(tol = 0.007070707070707071, store_covariance=False, solver='svd')
# The over-sampler and the CV splitter are both stochastic; seed them (same seed
# as the train/test split) so the reported score is reproducible across
# Restart & Run All. `over` and `discrim` are reused by later cells.
over = RandomOverSampler(sampling_strategy=0.35, random_state=8)
# Over-sampling sits inside the pipeline so it is applied to the training folds only.
steps = [('sampling', over), ('model', discrim)]
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=8)
scores = cross_val_score(pipeline, s8_mod_train_x, s8_mod_train_y, cv=cv, scoring="precision")
# Mean precision over the 10 folds x 3 repeats
print("score:", mean(scores))

In [None]:
# TRAIN S8 MODEL
# Over-sample the S8 training half with the shared `over` sampler, then fit the
# shared LDA estimator on the resampled data.
step2 = [('sampling', over)]
pip2 = Pipeline(steps=step2)
s8_X, s8_y = pip2.fit_resample(s8_mod_train_x, s8_mod_train_y)
# fit() returns the estimator itself, so `model` and `discrim` are the same
# object: any later discrim.fit(...) also changes `model`.
discrim.fit(s8_X, s8_y)
model = discrim

In [None]:
# PREDICT SCORES FOR COMBO TRAINING SET USING S8 MODEL FOR INSERTION INTO COMBO MODEL
s8_scores = model.predict_proba(s8_combo_train_x)
# Keep only probability column 1 (assumed to be the methylated class -- confirm
# against model.classes_), indexed by the same IDs as the feature rows.
combo_train_x = pd.DataFrame(
    {'s8_proba_methylated': s8_scores[:, 1]},
    index=s8_combo_train_x.index,
)
# Carry the label along so it survives the later merge with the MS scores.
combo_train_x['methylated'] = s8_combo_train_y

In [None]:
# ADD MS SCORES TO COMBO TRAINING SET
# formatting
# Key every MS score as '<UniProtID>_K<Site>' (same key style as the feature-set index).
msight_scores['ID'] = msight_scores['UniProtID'] + '_K' + msight_scores['Site'].astype(str)
# Despite its name, ms_combo_train_x holds the score of EVERY site in
# msight_scores; it is reused when building the combo test set.
ms_combo_train_x = (
    msight_scores[['Score']]
    .set_index(msight_scores['ID'])
    .rename(columns={'Score': 'ms_proba_methylated'})
)

# add to s8 data
# Inner join on ID: sites without an MS score are dropped.
X_combo_train = combo_train_x.merge(
    ms_combo_train_x, left_index=True, right_index=True, how='inner')

# isolate methylated data for y combo train
# pop() returns the label column and removes it from the freshly merged frame.
y_combo_train = X_combo_train.pop('methylated')

In [None]:
# quick count of classes in combo train
# Counts are taken after the inner join with the MS scores, so they reflect only
# the sites that have both an S8 probability and an MS score.
print('Combination Model Training Set:', Counter(y_combo_train))

In [None]:
# Inspect the hold-out set ('ground_truth' plus the feature-set columns).
holdout_set

In [None]:
# CREATE COMBO TEST SET
# y value is experimental methylation data
y_combo_test = holdout_set['METHYLATED']
# generate s8 scores for combo x data
# Both label columns are removed so only model features are passed to the S8 model.
X_s8_combo_test = holdout_set.drop(columns=['ground_truth', 'METHYLATED'])
s8_t_scores = model.predict_proba(X_s8_combo_test)
# Keep only probability column 1 (assumed to be the methylated class -- confirm
# against model.classes_), indexed by the hold-out IDs.
x_combo_test = pd.DataFrame(
    {'s8_proba_methylated': s8_t_scores[:, 1]},
    index=X_s8_combo_test.index,
)
# Carry the label along so it survives the later merge with the MS scores.
x_combo_test['methylated'] = y_combo_test

In [None]:
# pull out ms scores from ms data for combo test x data
# ms_combo_train_x is the full MS score table keyed by ID (not only training
# rows); the inner join keeps hold-out sites that have an MS score.
X_combo_test = x_combo_test.merge(
    ms_combo_train_x, left_index=True, right_index=True, how='inner')

# isolate methylated data for y combo test
# pop() returns the label column and removes it from the freshly merged frame.
y_combo_test = X_combo_test.pop('methylated')

In [None]:
# FIT AND TEST MODELS : train with X_combo_train and y_combo_train,
# test with X_combo_test and y_combo_test

In [None]:
# STACKING ~ LOGISTIC REGRESSION + SVMSMOTE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SVMSMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score

logit = LogisticRegression(max_iter=100)
# SVMSMOTE is stochastic; seed it (same seed as the train/test split) so the
# reported precision is reproducible across Restart & Run All.
smote = SVMSMOTE(random_state=8)
# NOTE: `steps` and `pipeline` intentionally replace the LDA cross-validation
# pipeline defined earlier; the final stacking cell refits THIS pipeline.
steps = [('sampling', smote), ('clf', logit)]
pipeline = Pipeline(steps=steps)
lmodel = pipeline.fit(X_combo_train, y_combo_train)
logit_proba = lmodel.predict_proba(X_combo_test)
# convert to 1 or 0 values
# Column 1 is taken as the probability of the methylated class; a site is called
# methylated only when that probability reaches the 0.855 decision threshold.
y_proba = logit_proba[:, 1]
y_pred = (y_proba >= 0.855).astype(int)
print('logistic regression + SVMSMOTE precision', precision_score(y_combo_test, y_pred))

In [None]:
# CARRY ON WITH MEAN PROBABILITY VOTING + LOGISTIC REGRESSION WITH SVMSMOTE

In [None]:
# Import experimental featureset, as created by Feature_Set_Generation.ipynb 
# The first CSV column becomes the index; later cells split that index on '_'
# into uniprot_id and position, and read the 'sequence' column.
exp = pd.read_csv('./datasets/surface_exposed_lysine_proteome.csv', index_col=0)

In [None]:
# Format feature set and exp set before run
# s8_fset at this point no longer contains 'Gs(U)_NO' (removed in the formatting
# cell) nor the hold-out rows (removed when the train/test sets were created).
# NOTE: not re-runnable on its own -- 'sequence' is removed from `exp` below.
methylated = s8_fset['METHYLATED']
features = s8_fset.drop(columns=['METHYLATED'])

# Keep the sequences aside for the final report; they are not model features.
seqs = exp['sequence']

# Reorder example columns to resemble training data
exp = exp.drop(columns=['sequence'])[features.columns]

In [None]:
# Voting: Generate SET8ML Scoring 
# fit ML model to data
from collections import Counter
# Over-sample the whole remaining feature set with the shared `over` sampler and
# refit the shared LDA estimator on it.
step2 = [('sampling', over)]
pip2 = Pipeline(steps=step2)
X, y = pip2.fit_resample(features, methylated)
# fit() returns the estimator itself, so `model` is the same object as `discrim`.
discrim.fit(X, y)
model = discrim
# Class balance after and before over-sampling, in that order.
for labels in (y, methylated):
    print(Counter(labels))

In [None]:
# format ms data for merging
# Build a '<UniProtID>_<Site>' key (no 'K') and index the score table by it so it
# can be joined to the experimental predictions -- presumably this is the key
# format of `exp`'s index; confirm against the experimental CSV.
# The key is stored under 'combo': the previous code wrote it to 'ID' (clobbering
# the '_K'-style ID) and then indexed by a 'combo' column that this notebook
# never creates on msight_scores, which raises KeyError.
msight_scores['combo'] = msight_scores['UniProtID'] + '_' + msight_scores['Site'].astype(str)
msight_scores.index = msight_scores['combo']
ms_exp = pd.DataFrame(msight_scores['Score'])

In [None]:
# predict with s8 model
s8_temp = model.predict_proba(exp)
# Keep only probability column 1 (assumed to be the methylated class -- confirm
# against model.classes_), indexed exactly like the experimental feature set.
s8_exp = pd.DataFrame(
    {'s8_proba_methylated': s8_temp[:, 1]},
    index=exp.index,
)

In [None]:
# merge model scores
# Inner join on the index: only sites scored by both models are kept.
exp_comb = ms_exp.merge(s8_exp, left_index=True, right_index=True, how='inner')

In [None]:
# Stacking: Generate SET8ML Scoring
# apply model to methylsight + SET8 ML scores
# Refit the stacking pipeline on combo train + combo test together.
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is the
# supported equivalent and keeps the row order (train rows first, then test).
X = pd.concat([X_combo_train, X_combo_test])
y = pd.concat([y_combo_train, y_combo_test])
lmodel = pipeline.fit(X, y)

# make x experimental data resemble features
# Same column order and names as the training matrix: S8 probability first,
# then the MS score under its training-time name.
exp_comb = exp_comb[['s8_proba_methylated', 'Score']].rename(
    columns={'Score': 'ms_proba_methylated'})

In [None]:
# Inspect the stacked-model input (columns 's8_proba_methylated', 'ms_proba_methylated').
exp_comb

In [None]:
# get resulting probability scores and classification
# NOTE: stacked_classify (the pipeline's own class call) is not used by the
# cells below; the exported classification is re-derived from stacked_proba
# with an explicit threshold.
stacked_classify = lmodel.predict(exp_comb)
stacked_proba = lmodel.predict_proba(exp_comb)

In [None]:
# format and export
# Name the two probability columns; assumes column 0 is the not-methylated class
# and column 1 the methylated class -- TODO confirm against lmodel.classes_.
stacked_out = pd.DataFrame(stacked_proba, columns=['proba_not_methylated', 'proba_methylated'])

In [None]:
# Re-attach the site keys: the rows of stacked_proba are in exp_comb's row order.
stacked_out.index = exp_comb.index

In [None]:
# Call a site methylated (1.0) when its stacked probability is at least 0.82,
# otherwise not methylated (0.0).
# NOTE(review): the evaluation cell thresholds at 0.855 while this cell uses
# 0.82 -- confirm that the difference is intentional.
is_methylated = stacked_out['proba_methylated'] >= 0.82
stacked_out['pred_methylated'] = is_methylated.astype(float)

In [None]:
# Attach each site's sequence by index (inner join), then remove rows that are
# exact duplicates across all columns (the index is not considered).
stacked_out = (
    stacked_out
    .merge(seqs, left_index=True, right_index=True, how='inner')
    .drop_duplicates()
)

In [None]:
# Move the site key out of the index and split it into protein id and position.
# Assumes the index is unnamed, so reset_index() yields a column called 'index'
# -- TODO confirm.
stacked_out = stacked_out.reset_index()
# Split on the first '_' only. `n` must be passed by keyword: pandas 2.x rejects
# the old positional form str.split('_', 1, ...).
stacked_out[['uniprot_id','position']] = stacked_out['index'].str.split('_', n=1, expand=True)
stacked_out = stacked_out.drop(['index'], axis=1)

In [None]:
# Keep the report columns in their final order, rank sites from most to least
# likely methylated, and renumber the rows from 0.
report_columns = ['uniprot_id', 'position', 'sequence', 'proba_methylated', 'pred_methylated']
stacked_out = (
    stacked_out[report_columns]
    .sort_values(by=['proba_methylated'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Write the ranked predictions next to the notebook.
# NOTE(review): the file is TAB-separated despite its .csv extension; readers
# must pass sep='\t'. Confirm before renaming, since consumers may rely on this path.
stacked_out.to_csv('./ML-Hybrid_Ensemble_Results_SET8_v_Proteome.csv', sep='\t')