In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [None]:
# Seed every random number generator the notebook relies on.
# random.seed only covers Python's built-in `random` module; scikit-learn and
# imbalanced-learn fall back to NumPy's global RNG whenever random_state is
# left as None, so NumPy has to be seeded as well for runs to be repeatable.
random.seed(8)
np.random.seed(8)

In [None]:
# Load the four input tables used by this notebook (set8 feature set, MethylSight
# full-proteome scores, MethylSight test set, annotated methylome), read in this order.
s8_fset, msight_scores, ms_test, pplus = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/feature_data.csv',
        './datasets/MS-full-proteome.csv',
        './datasets/MS-test-set.csv',
        './datasets/annotated_methylome.csv',
    )
)

In [None]:
# Bring both tables onto a common site identifier of the form uniprotID_K#.

# Feature set: strip the trailing three characters from the existing ID.
# NOTE: this overwrites the column, so re-running the cell strips three more characters.
s8_fset['ID'] = s8_fset['ID'].str.slice(stop=-3)

# MethylSight hold-out set: build the ID from the protein accession and the site position.
holdout_positions = ms_test['position'].astype(str)
ms_test['ID'] = ms_test['uniprot_id'] + '_K' + holdout_positions

In [None]:
# Hold-out test set: MethylSight test sites that also appear in the feature set
# (default inner join on the shared ID column).
holdout_set = ms_test.merge(s8_fset, on='ID')

In [None]:
# CREATE TRAINING AND TEST SETS FOR S8 ML MODEL
from sklearn.model_selection import train_test_split

# Hold-out test data: index by site ID, then separate the label from the features.
holdout_set = holdout_set.set_index('ID')
s8_test_y = holdout_set['METHYLATED']
# 'ground_truth' is dropped along with the identifier and label columns: it is not
# one of the training features (the later predict_proba call on the hold-out set
# drops it too), so leaving it in would leak a label-like column into s8_test_x
# and make its columns inconsistent with s8_train_x.
s8_test_x = holdout_set.drop(
    ['ground_truth', 'uniprot_id', 'position', 'METHYLATED'], axis=1)

# Training data: the feature set with every hold-out site removed.
drop_rows = holdout_set.index
s8_fset = s8_fset.set_index('ID')
s8_fset = s8_fset.drop(index=drop_rows)
s8_train_y = s8_fset['METHYLATED']
s8_train_x = s8_fset.drop(['METHYLATED'], axis=1)

# Split the training data in half, stratified on the label:
# 1) one half trains the set8 model, 2) the other half trains the combo model.
s8_mod_train_x, s8_combo_train_x, s8_mod_train_y, s8_combo_train_y = train_test_split(
    s8_train_x, s8_train_y, test_size=0.5, random_state=8, stratify=s8_train_y)

In [None]:
# Report the class balance of the hold-out set and of each training split.
from collections import Counter

for split_label, split_targets in (
    ('Hold-Out Set:', s8_test_y),
    ('Without Hold-Out Set:', s8_train_y),
    ('S8 Model Training:', s8_mod_train_y),
    ('Combo Model Training:', s8_combo_train_y),
):
    print(split_label, Counter(split_targets))

In [None]:
# TEST S8 MODEL on S8 Model Training data
# Linear Discriminant Analysis produced the best results in testing - use it here
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from numpy import mean
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

discrim = LinearDiscriminantAnalysis(tol = 0.007070707070707071, store_covariance=False, solver='svd')
# random_state is fixed on both the sampler and the CV splitter: without it the
# resampled rows and the fold assignment change on every run, so the reported
# score would not be reproducible.
over = RandomOverSampler(sampling_strategy=0.35, random_state=8)
steps = [('sampling', over), ('model', discrim)]
# The imblearn Pipeline applies the sampler only when fitting, so each CV fold is
# oversampled on its training part only.
pipeline = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=8)
scores = cross_val_score(pipeline, s8_mod_train_x, s8_mod_train_y, cv=cv, scoring="f1")
# Mean F1 over the 10 folds x 3 repeats.
print("score:", mean(scores))

In [None]:
# TRAIN S8 MODEL
# Oversample the set8 training half with the `over` sampler (wrapped in a
# one-step pipeline), then fit the discriminant model on the resampled data.
step2 = [('sampling', over)]
pip2 = Pipeline(steps=step2)
resampled_s8 = pip2.fit_resample(s8_mod_train_x, s8_mod_train_y)
s8_X, s8_y = resampled_s8
model = discrim.fit(s8_X, s8_y)

In [None]:
# PREDICT SCORES FOR COMBO TRAINING SET USING S8 MODEL FOR INSERTION INTO COMBO MODEL
s8_scores = model.predict_proba(s8_combo_train_x)
# Keep only probability column 1 (named as the methylated-class probability),
# re-indexed by site ID so it lines up with the labels.
combo_train_x = (
    pd.DataFrame(s8_scores)
    .drop(columns=[0])
    .rename(columns={1: 's8_proba_methylated'})
    .set_index(s8_combo_train_x.index)
)
# Index-aligned assignment of the experimental label.
combo_train_x['methylated'] = s8_combo_train_y

In [None]:
# ADD MS SCORES TO COMBO TRAINING SET
# Build the uniprotID_K# identifier, then a one-column score table indexed by it.
msight_scores['ID'] = msight_scores['UniProtID'] + '_K' + msight_scores['Site'].astype(str)
ms_combo_train_x = (
    msight_scores[['Score']]
    .set_index(msight_scores['ID'])
    .rename(columns={'Score': 'ms_proba_methylated'})
)

# Join the two score sources on site ID; the inner join keeps only sites scored by both.
X_combo_train = combo_train_x.merge(
    ms_combo_train_x, left_index=True, right_index=True, how='inner')

# Separate the label (y) from the two probability features (X).
y_combo_train = X_combo_train['methylated']
X_combo_train = X_combo_train.drop(columns=['methylated'])

In [None]:
# Display the combo-model training features (columns s8_proba_methylated and
# ms_proba_methylated, indexed by site ID).
X_combo_train

In [None]:
# Count training sites that both predictors score at or above 0.5.
ms_positive = X_combo_train['ms_proba_methylated'] >= 0.5
s8_positive = X_combo_train['s8_proba_methylated'] >= 0.5
print('MethylSight and SET8ML Predicted Positives:', len(X_combo_train[ms_positive & s8_positive]))

In [None]:
# Count training sites the set8 model alone scores at or above 0.5.
s8_hits = X_combo_train['s8_proba_methylated'] >= 0.5
print('SET8ML Predicted Positives:', len(X_combo_train[s8_hits]))

In [None]:
# Class balance of the combo training labels after the inner join with the MS scores.
combo_class_counts = Counter(y_combo_train)
print('Combination Model Training Set:', combo_class_counts)

In [None]:
# Display the hold-out set (indexed by site ID after the earlier set_index('ID') call).
holdout_set

In [None]:
# CREATE COMBO TEST SET
# Label: the experimental methylation status of each hold-out site.
y_combo_test = holdout_set['METHYLATED']
# Features for the set8 model: everything except identifier and label columns.
X_s8_combo_test = holdout_set.drop(
    columns=['ground_truth', 'uniprot_id', 'METHYLATED', 'position'])
s8_t_scores = model.predict_proba(X_s8_combo_test)
# Keep only probability column 1, re-indexed by site ID.
x_combo_test = (
    pd.DataFrame(s8_t_scores)
    .drop(columns=[0])
    .rename(columns={1: 's8_proba_methylated'})
    .set_index(X_s8_combo_test.index)
)
# Index-aligned assignment of the label.
x_combo_test['methylated'] = y_combo_test

In [None]:
# Attach the MS score to each hold-out site. ms_combo_train_x is the ID-indexed
# score table; the inner join keeps only sites present in both tables.
X_combo_test = x_combo_test.merge(
    ms_combo_train_x, left_index=True, right_index=True, how='inner')

# Separate the label (y) from the two probability features (X) for the combo test set.
y_combo_test = X_combo_test['methylated']
X_combo_test = X_combo_test.drop(columns=['methylated'])

In [None]:
# FIT AND TEST MODELS : train with X_combo_train and y_combo_train,
# test with X_combo_test and y_combo_test

In [None]:
# VOTING ~ Mean Probabilities
# Average of the two predictors' probabilities for each test site.
s8_test_proba = X_combo_test['s8_proba_methylated']
ms_test_proba = X_combo_test['ms_proba_methylated']
mean_prob = (s8_test_proba + ms_test_proba) / 2

In [None]:
# VOTING ~ Sum Probabilities
# Sum of the two predictors' probabilities for each test site.
s8_vote = X_combo_test['s8_proba_methylated']
ms_vote = X_combo_test['ms_proba_methylated']
sum_prob = s8_vote + ms_vote

In [None]:
# GRAPH VOTING

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

# METRIC v. THRESHOLD
# Sweep the decision threshold over 0.00, 0.01, ..., 1.00 and record recall,
# precision and specificity of the mean-probability vote at each step.
rec = []
prec = []
spec = []
thresh = []

# Thresholds are derived from an integer counter instead of repeatedly adding
# 0.01: accumulated floating-point error makes the running sum overshoot 1.0,
# so a `while threshold <= 1` loop never evaluates the final threshold and
# stores drifting values such as 0.060000000000000005.
for step in range(101):
    threshold = step / 100

    # 1 when the vote reaches the threshold, else 0
    y_pred = (mean_prob >= threshold).astype(int)

    rec.append(recall_score(y_combo_test, y_pred))
    prec.append(precision_score(y_combo_test, y_pred))

    # labels=[0, 1] pins a 2x2 matrix so the unpacking cannot fail when only one
    # class is present (assumes labels are coded 0/1, as the binary scorers above do)
    tn, fp, fn, tp = confusion_matrix(y_combo_test, y_pred, labels=[0, 1]).ravel()
    spec.append(tn / (tn + fp))

    thresh.append(threshold)

# One row per threshold; column order matters for the plot legend.
voting_metrics = pd.DataFrame({
    'Threshold': thresh,
    'Recall': rec,
    'Specificity': spec,
    'Precision': prec,
})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Long format: one row per (threshold, metric) pair so seaborn can colour by metric.
voting_long = voting_metrics.melt(id_vars='Threshold')

sns.set_context('paper')
ax = sns.lineplot(data=voting_long, x='Threshold', y='value', hue='variable')
ax.set(ylabel='Performance')
ax.grid()
# Legend centred above the axes, laid out in three columns.
ax.legend(title='', bbox_to_anchor=(.5, 1), loc='lower center', ncol=3)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
plt.savefig('./mean_sum_ms_s8.pdf', dpi=300, bbox_inches="tight")

In [None]:
# FIT DIFFERENT ML MODELS

In [None]:
# Fit a range of default-configured classifiers on the combo training set
# (the two probability features) and report each one's F1 on the combo test set.
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import ComplementNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score

# Create our models
dummy = DummyClassifier(strategy='most_frequent')   # majority-class baseline
logit = LogisticRegression(max_iter=100)
nb = ComplementNB()
dtree = tree.DecisionTreeClassifier()
knn = KNeighborsClassifier()
svc = svm.SVC()
bagged = BaggingClassifier()
rand_forest = RandomForestClassifier()
ext_trees = ExtraTreesClassifier()
gboost = GradientBoostingClassifier()

models = [dummy, logit, nb, dtree, knn, svc, bagged, rand_forest, ext_trees, gboost]


for m in models:
    mod = m.fit(X_combo_train, y_combo_train)
    predicted_y = mod.predict(X_combo_test)
    # Report the F1 score: the output is labelled "f1" and the sampling searches
    # that follow rank models by f1_score, so the same metric is used here
    # (the label previously sat next to a precision value).
    print('Model_Type:', m, '\t f1:', f1_score(y_combo_test, predicted_y))

In [None]:
# nb + dtree + bagged gives highest score - proceed to sampling strategies with that
# OVERSAMPLING
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, KMeansSMOTE, ADASYN
from imblearn.pipeline import Pipeline

# One default-configured instance of each oversampler.
# NOTE: this rebinds `over`, replacing any sampler of that name created earlier.
over, smote, border_smote = RandomOverSampler(), SMOTE(), BorderlineSMOTE()
svm_smote, km_smote, adasyn = SVMSMOTE(), KMeansSMOTE(), ADASYN()
# km_smote is created but is not part of the search below.
samplings = [over, smote, border_smote, svm_smote, adasyn]

# Best (F1, model, sampler) seen so far.
top, topmodel, stype = 0, '', ''

# Try every model with every oversampler; score each pairing by F1 on the combo test set.
for m in models:
    for s in samplings:
        s3 = [('sampling', s), ('clf', m)]
        pip3 = Pipeline(steps=s3)
        m3 = pip3.fit(X_combo_train, y_combo_train)
        predicted_y = m3.predict(X_combo_test)
        f1 = f1_score(y_combo_test, predicted_y)
        # Strict comparison: on ties the earlier pairing is kept.
        if f1 > top:
            top, topmodel, stype = f1, m, s
print('Top scoring model is', topmodel, 'with', stype, 'sampling, and an f1 score of', top)

In [None]:
# UNDERSAMPLING
# Data Undersampling - random undersampling, condensed nearest neighbor, tomek links,
# edited nearest neighbors, neighborhood cleaning rule, one-sided selection
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours
from imblearn.under_sampling import NeighbourhoodCleaningRule, OneSidedSelection

# One default-configured instance of each undersampler.
under = RandomUnderSampler()
cnn = CondensedNearestNeighbour()
tomek = TomekLinks()
enn = EditedNearestNeighbours()
n_cleaning = NeighbourhoodCleaningRule()
onesided = OneSidedSelection()
u_samplings = [under, cnn, tomek, enn, n_cleaning, onesided]

# Best (F1, model, sampler) seen so far.
top = 0
topmodel = ''
stype = ''

# Try every model with every undersampler: resample the combo training set,
# fit the model on the resampled data, and score by F1 on the combo test set.
for m in models:
    for u in u_samplings:
        s4 = [('sampling', u)]
        pip4 = Pipeline(steps=s4)
        u_re_X, u_re_y = pip4.fit_resample(X_combo_train, y_combo_train)
        m4 = m.fit(u_re_X, u_re_y)
        predicted_y = m4.predict(X_combo_test)
        f1 = f1_score(y_combo_test, predicted_y)
        if f1 > top:
            top = f1
            topmodel=m
            # Record the undersampler used in THIS iteration. The loop variable
            # here is `u`; `s` is a leftover from the oversampling cell and would
            # always report that cell's last sampler.
            stype=u
print('Top scoring model is', topmodel, 'with', stype, 'sampling, and an f1 score of', top)

In [None]:
# OVERSAMPLING + UNDERSAMPLING
# Combined resamplers: SMOTE followed by edited nearest neighbours, and SMOTE
# followed by Tomek-link removal.
from imblearn.combine import SMOTEENN, SMOTETomek

smoteenn, smotetomek = SMOTEENN(), SMOTETomek()
combined = [smoteenn, smotetomek]

# Best (F1, model, sampler) seen so far.
top, topmodel, stype = 0, '', ''

# Try every model with every combined resampler: resample the combo training set,
# fit the model on the resampled data, and score by F1 on the combo test set.
for m in models:
    for c in combined:
        s5 = [('sampling', c)]
        pip5 = Pipeline(steps=s5)
        c_re_X, c_re_y = pip5.fit_resample(X_combo_train, y_combo_train)
        m5 = m.fit(c_re_X, c_re_y)
        predicted_y = m5.predict(X_combo_test)
        f1 = f1_score(y_combo_test, predicted_y)
        # Strict comparison: on ties the earlier pairing is kept.
        if f1 > top:
            top, topmodel, stype = f1, m, c
print('Top scoring model is', topmodel, 'with', stype, 'sampling, and an f1 score of', top)

In [None]:
# HYPERPARAMETER SEARCH - cannot be run on ensemble model

In [None]:
from sklearn.metrics import confusion_matrix
# Logistic Regression + SVMSMOTE oversampling produced highest score - proceed with those
svm_smote = SVMSMOTE()
steps = [('sampling', svm_smote), ('clf', logit)]
pipe = Pipeline(steps=steps)
# fit the model to the training data
f_model = pipe.fit(X_combo_train, y_combo_train)

# Probability of the positive case for every test site. It does not depend on
# the threshold, so it is computed once instead of on every loop iteration, and
# taken as a 1-D array (column 1) rather than an (n, 1) column.
y_proba = f_model.predict_proba(X_combo_test)[:, 1]

# GRAPHING: METRIC v. THRESHOLD
# Sweep the decision threshold over 0.00, 0.01, ..., 1.00 and record recall,
# precision and specificity of the fitted model at each step.
t_rec = []
t_prec = []
t_spec = []
thresh = []

# Thresholds are derived from an integer counter instead of repeatedly adding
# 0.01: accumulated floating-point error makes the running sum overshoot 1.0,
# so a `while threshold <= 1` loop never evaluates the final threshold.
for step in range(101):
    threshold = step / 100

    # 1 when the probability reaches the threshold, else 0
    y_pred = (y_proba >= threshold).astype(int)

    t_rec.append(recall_score(y_combo_test, y_pred))
    t_prec.append(precision_score(y_combo_test, y_pred))

    # labels=[0, 1] pins a 2x2 matrix so the unpacking cannot fail when only one
    # class is present (assumes labels are coded 0/1, as the binary scorers above do)
    tn, fp, fn, tp = confusion_matrix(y_combo_test, y_pred, labels=[0, 1]).ravel()
    t_spec.append(tn / (tn + fp))

    thresh.append(threshold)


# One row per threshold; column order matters for the plot legend.
metrics = pd.DataFrame({
    'Threshold': thresh,
    'Recall': t_rec,
    'Specificity': t_spec,
    'Precision': t_prec,
})

In [None]:
# Plot recall, specificity and precision of the fitted combo model against the threshold.
import seaborn as sns
import matplotlib.pyplot as plt

# Long format: one row per (threshold, metric) pair so seaborn can colour by metric.
metrics_long = metrics.melt(id_vars='Threshold')

sns.set_context('paper')
ax = sns.lineplot(data=metrics_long, x='Threshold', y='value', hue='variable')
ax.set(ylabel='Performance')
ax.grid()
# Legend centred above the axes, laid out in three columns.
ax.legend(title='', bbox_to_anchor=(.5, 1), loc='lower center', ncol=3)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
plt.savefig('./combined_ms_s8_logistic_regression_svmsmote.pdf', dpi=300, bbox_inches="tight")