In [13]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from q_value_calc_crosslinks import calcQ
import matplotlib.pyplot as plt
from functions import get_target_id
import re

In [14]:
# Path of the pickled PSM table processed by this notebook.
# The commented-out assignments below are alternative datasets; to switch,
# leave exactly one `input_file = ...` line active.
# NOTE(review): the `input_file_perc` variants are not referenced by any of the
# cells shown here -- confirm whether they are still needed.
input_file = "../data/crosslink_data/AChernev_080219_HeLa_RNA_UV.pkl"
#input_file_perc = "../data/crosslink_data/AChernev_080219_HeLa_RNA_UV_opti_.pkl"
#input_file = "../data/crosslink_data/M_Raabe_A_Wulf_220421_270421_Expl3_Ecoli_XL_UV_S30_LB_bRPfrac_11.pkl" # rank 0
#input_file_perc = "../data/crosslink_data/M_Raabe_A_Wulf_220421_270421_Expl3_Ecoli_XL_UV_S30_LB_bRPfrac_11_perc.pkl" # rank 0 - 7
#input_file = "../data/crosslink_data/M_Raabe_A_Wulf_220421_290421_Expl3_Ecoli_XL_DEB_S30_LB_bRPfrac_12.pkl" # rank 0
#input_file_perc = "../data/crosslink_data/M_Raabe_A_Wulf_220421_290421_Expl3_Ecoli_XL_DEB_S30_LB_bRPfrac_12_perc.pkl" # rank 0 - 7
#input_file = "../data/crosslink_data/MRaabe_LW_091221_171221_Expl2_XL_Ecoli_NM_S30_bRP_rep1_11.pkl" # rank 0
#input_file_perc = "../data/crosslink_data/MRaabe_LW_091221_171221_Expl2_XL_Ecoli_NM_S30_bRP_rep1_11_perc.pkl" # rank 0 - 7


In [15]:
# load the pickled PSM table named by `input_file`
# NOTE(review): unpickling executes code embedded in the file -- only load
# pickles from a trusted source.
original_df = pd.read_pickle(input_file)

# selection criterion (column name, required value) applied further down when
# the training candidates are picked; nothing is filtered in this cell
filter_col, filter_val = 'PSMId', 1

In [16]:
# columns used as classifier input
features = [
    'Score',
    'peplen',
    'NuXL:isXL',
    'NuXL:modds',
    'NuXL:pl_modds',
    'NuXL:mass_error_p',
    'NuXL:tag_XLed',
    'NuXL:tag_unshifted',
    'NuXL:tag_shifted',
    'missed_cleavages',
    'NuXL:ladder_score',
    'variable_modifications',
]

# keep only the feature columns plus the bookkeeping columns, then order the
# rows from highest to lowest Score
# (DataFrame.filter silently skips requested columns that are not present)
original_df = (
    original_df
    .filter(items=np.concatenate([features, ['ScanNr', 'rank', 'Label', 'PSMId']]))
    .sort_values('Score', ascending=False)
)


In [17]:
# Candidate pool for training: rows that satisfy the configured filter
# (original_df[filter_col] == filter_val), split by the NuXL:isXL flag.
# The head/tail slices below rely on original_df being ordered by descending
# Score, so "top" means best-scoring and "bottom" worst-scoring rows.
is_candidate = original_df[filter_col] == filter_val
pep_candidates = original_df.loc[(original_df['NuXL:isXL'] == 0) & is_candidate]
XL_candidates = original_df.loc[(original_df['NuXL:isXL'] == 1) & is_candidate]

# determine minority class
# The class sizes are measured on the candidate pool itself (not on all rows):
# the slices are taken from the pool, so sizing them from a larger population
# could make the top and bottom slices overlap, which would relabel rows and
# duplicate them in train_idx.
minority_class = min(len(pep_candidates), len(XL_candidates))
class_threshold = 500
if minority_class > class_threshold:
    print("Truncating to " + str(class_threshold) + "\n")
    minority_class = class_threshold

# number of rows taken from each end of each class
half = minority_class // 2
if half == 0:
    # a slice of the form [-0:] would select *every* row, so fail loudly instead
    raise ValueError(
        "Not enough rows with " + str(filter_col) + " == " + str(filter_val)
        + " in the smaller NuXL:isXL class to build a training set"
    )

# define training data (peptides with top and bottom scores of each class with PSMId = 1)
# half <= len(pool) // 2 for both pools, so top and bottom never overlap
pep_top = pep_candidates.iloc[:half]
pep_bottom = pep_candidates.iloc[len(pep_candidates) - half:]
XL_top = XL_candidates.iloc[:half]
XL_bottom = XL_candidates.iloc[len(XL_candidates) - half:]
train_idx = np.concatenate([pep_top.index, pep_bottom.index, XL_top.index, XL_bottom.index])

# set train labels for training: 1 for the top-scoring rows, 0 for the
# bottom-scoring rows; rows outside the training set keep NaN in 'train_label'
original_df.loc[pep_top.index, 'train_label'] = 1
original_df.loc[pep_bottom.index, 'train_label'] = 0
original_df.loc[XL_top.index, 'train_label'] = 1
original_df.loc[XL_bottom.index, 'train_label'] = 0

Truncating to 500



In [None]:
# define function for setting new rank column
def set_new_rank(x):
    """Rank the rows of ``x`` by descending ``Score``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``Score`` column (e.g. one group of a ``groupby``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` ordered from highest to lowest ``Score`` with an added
        ``new_PSMId`` column holding the 1-based position in that order.

    Notes
    -----
    ``x`` itself is left untouched: pandas does not support functions that
    mutate the object handed to ``groupby(...).apply``.
    """
    # sort_values without inplace returns a new frame, so adding the column
    # below does not write back into the caller's data
    ranked = x.sort_values("Score", ascending=False)
    ranked["new_PSMId"] = range(1, len(ranked) + 1)
    return ranked

# initialise the k-fold cross validator
no_split = 5
kf = KFold(n_splits=no_split, shuffle=True, random_state=1)

# candidate values for the SVM regularisation parameter C (powers of two)
param_grid = {
    'C': np.power(float(2), [-5, -1, 1, 5, 7, 11, 15])
}
# create the pipeline: min-max scaling followed by a grid search over C for a
# linear SVM; refit=True retrains the best setting on the full training set
pipe = make_pipeline(
    MinMaxScaler(),
    GridSearchCV(
        # random_state fixes the shuffling SVC uses for its probability
        # estimates, so repeated runs give the same scores
        estimator=SVC(kernel='linear', probability=True, random_state=1),
        param_grid=param_grid,
        n_jobs=-1,
        scoring="accuracy",
        cv=kf,
        refit=True))
pipe.fit(original_df.loc[train_idx, features], original_df.loc[train_idx, 'train_label'])

# keep the original score before it is replaced by the SVM score
original_df['Score_old'] = original_df['Score']
original_df.sort_values(['Score'], ascending=[False], inplace=True)

# predict_proba returns one column per class, shape (n_samples, 2) here, so it
# cannot be assigned to a single DataFrame column as a whole. The columns
# follow the sorted class labels, i.e. column 0 belongs to train_label 0;
# for this two-class model 1 - P(label 0) is the probability of train_label 1.
class_probs = pipe.predict_proba(original_df.loc[:, features])
original_df['Score'] = 1.0 - class_probs[:, 0]

# rerank PSMs: within each ScanNr the row with the highest new Score gets
# new_PSMId 1, the next one 2, and so on
original_df = original_df.sort_values(['ScanNr', 'Score'], ascending=[True, False])
# cumcount numbers the rows of each group in their current order (0-based);
# to_numpy() assigns by position so the result does not depend on index labels
original_df['new_PSMId'] = original_df.groupby('ScanNr').cumcount().to_numpy() + 1

# order by new score (ties broken by old score, then PSMId) *before* taking the
# subset, otherwise the ordering never reaches the frame handed to calcQ
# NOTE(review): assumes calcQ wants its input sorted by descending score -- confirm
original_df.sort_values(['Score', 'Score_old', 'PSMId'], ascending=[False, False, True], inplace=True)

# keep only the best-ranked PSM of every scan (new_PSMId == 1)
original_df_filtered_new = original_df.loc[original_df['new_PSMId'] == 1, :]

# compute q-values
q_vals_SVM = calcQ(original_df_filtered_new)
target_ID = get_target_id(q_vals_SVM)

# write next to the input file: "<name>.pkl" -> "<name>_SVM.pkl"
# (dot escaped and pattern anchored so only the file extension is replaced)
output_file = re.sub(r'\.pkl$', '_SVM.pkl', input_file)
target_ID.to_pickle(output_file)