In [81]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from q_value_calc_crosslinks import calcQ
import matplotlib.pyplot as plt
from functions import get_target_id
import re

In [82]:
# Key of the `dataset` entry that the rest of the notebook processes.
i = 2

# Available datasets, keyed by a small integer id. Each entry stores the
# data sub-directory ('type'), the file stem ('name') and a 'comparison' tag.
dataset = {
    1: dict(
        type='crosslink_data',
        name='AChernev_080219_HeLa_RNA_UV',
        comparison='opti_',
    ),
    2: dict(
        type='crosslink_data',
        name='M_Raabe_A_Wulf_220421_270421_Expl3_Ecoli_XL_UV_S30_LB_bRPfrac_11',
        comparison='perc',
    ),
    3: dict(
        type='crosslink_data',
        name='M_Raabe_A_Wulf_220421_290421_Expl3_Ecoli_XL_DEB_S30_LB_bRPfrac_12',
        comparison='perc',
    ),
    4: dict(
        type='crosslink_data',
        name='MRaabe_LW_091221_171221_Expl2_XL_Ecoli_NM_S30_bRP_rep1_11',
        comparison='perc',
    ),
}

In [83]:
# Path of the pickled input for the selected dataset: ../data/<type>/<name>.pkl
selected = dataset[i]
input_file = f"../data/{selected['type']}/{selected['name']}.pkl"


In [84]:
# load the pickled DataFrame for the selected dataset
original_df = pd.read_pickle(input_file)

# keep only the rows whose `filter_col` equals `filter_val` (PSMId == 1)
filter_col, filter_val = 'PSMId', 1
original_df_filtered = original_df[original_df[filter_col].eq(filter_val)]

In [85]:
# columns passed to the classifier as input features
features = [
    'Score',
    'peplen',
    'NuXL:isXL',
    'NuXL:modds',
    'NuXL:pl_modds',
    'NuXL:mass_error_p',
    'NuXL:tag_XLed',
    'NuXL:tag_unshifted',
    'NuXL:tag_shifted',
    'missed_cleavages',
    'NuXL:ladder_score',
    'variable_modifications',
]

# keep the feature columns plus the bookkeeping columns, then order the rows
# from highest to lowest score
original_df = (
    original_df
    .filter(features + ['ScanNr', 'rank', 'Label', 'PSMId'])
    .sort_values('Score', ascending=False)
)


In [86]:
# Candidate training rows: one pool per class (NuXL:isXL == 0 / == 1),
# restricted to rows where `filter_col` equals `filter_val` (PSMId == 1).
# The masks are built once and reused for every slice below.
is_selected_psm = original_df[filter_col] == filter_val
pep_candidates = original_df.loc[(original_df['NuXL:isXL'] == 0) & is_selected_psm]
XL_candidates = original_df.loc[(original_df['NuXL:isXL'] == 1) & is_selected_psm]

# determine minority class
# The size is measured on the same pools the slices are taken from, so the
# top and bottom halves of a class can never overlap (an overlap would put
# the same row into the training set twice and overwrite its label).
minority_class = min(len(pep_candidates), len(XL_candidates))
class_threshold = 500
if minority_class > class_threshold:
    print("Truncating to " + str(class_threshold) + "\n")
    minority_class = class_threshold

# define training data (peptides with top and bottom scores of each class with PSMId = 1)
# This relies on original_df being ordered by descending 'Score', so the first
# rows of a pool are the best-scoring and the last rows the worst-scoring.
# head()/tail() are used instead of [:n] / [-n:] because [-0:] would select
# the *whole* pool when n is 0.
half_class = minority_class // 2
pep_top = pep_candidates.head(half_class)
pep_bottom = pep_candidates.tail(half_class)
XL_top = XL_candidates.head(half_class)
XL_bottom = XL_candidates.tail(half_class)
train_idx = np.concatenate([pep_top.index, pep_bottom.index, XL_top.index, XL_bottom.index])

# set train labels for training: 1 for the top-scoring rows, 0 for the bottom-scoring rows
original_df.loc[pep_top.index.append(XL_top.index), 'train_label'] = 1
original_df.loc[pep_bottom.index.append(XL_bottom.index), 'train_label'] = 0

Truncating to 500



In [87]:
# define function for setting new rank column
def set_new_rank(x):
    """Rank the rows of ``x`` by descending ``Score``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``Score`` column (here: the rows of one ``ScanNr`` group).

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``x`` ordered from highest to lowest
        ``Score`` and an added ``new_PSMId`` column numbering them 1..len(x).
        The original index labels are kept.

    Notes
    -----
    The input is left untouched. Sorting or adding a column in place on the
    group object handed in by ``groupby(...).apply`` writes to a slice of the
    parent frame and triggers pandas' SettingWithCopyWarning.
    """
    ranked = x.sort_values("Score", ascending=False)
    return ranked.assign(new_PSMId=range(1, len(ranked) + 1))

# initialise the k-fold cross validator
no_split = 5
kf = KFold(n_splits=no_split, shuffle=True, random_state=1)

# candidate values for the SVM regularisation parameter C (powers of two)
param_grid = {
        'C': np.power(float(2), [-5,-1,1,5,7,11,15])
    }
# create the pipeline: min-max scaling followed by a grid search over C for a
# linear SVM; refit=True retrains the best model on all training rows
pipe = make_pipeline(MinMaxScaler(),
                        GridSearchCV(
                            estimator=SVC(kernel='linear', probability=True),
                            param_grid=param_grid,
                            n_jobs=-1,
                            scoring="accuracy",
                            cv=kf,
                            refit=True))

# 'Score' is both an input feature and the column overwritten below. If this
# cell is re-run, restore the raw score first so the model is not trained on
# its own previous output and 'Score_old' keeps the raw values.
if 'Score_old' in original_df.columns:
    original_df['Score'] = original_df['Score_old']

pipe.fit(original_df.loc[train_idx, features], original_df.loc[train_idx, 'train_label'])
original_df['Score_old'] = original_df['Score']
original_df = original_df.sort_values('Score', ascending=False)

# predict_proba returns one column per class, ordered like pipe.classes_.
# A single column has to be picked before writing to 'Score'; assigning the
# whole 2-D array to one column depends on pandas-version-specific behaviour.
# New score = 1 - P(train_label == 0), i.e. high for rows resembling the
# top-scoring training examples.
class_probabilities = pipe.predict_proba(original_df.loc[:, features])
negative_col = list(pipe.classes_).index(0)
original_df['Score'] = 1.0 - class_probabilities[:, negative_col]

# rerank PSMs within every scan according to the new score
original_df = original_df.groupby("ScanNr").apply(set_new_rank)
# groupby.apply prepends the group key as an index level; drop it again
original_df.index = original_df.index.droplevel('ScanNr')

# escape the dot and anchor at the end so only the file extension is replaced
output_file = re.sub(r'\.pkl$', '_SVM.pkl', input_file)
original_df.to_pickle(output_file)
# keep only the best-scoring PSM of every scan (new_PSMId == 1); copy so that
# downstream helpers working in place do not write to a slice of original_df
original_df_filtered_new = original_df.loc[original_df['new_PSMId'] == 1, :].copy()
# compute q-values
q_vals_SVM = calcQ(original_df_filtered_new)
target_ID = get_target_id(q_vals_SVM)
output_file = re.sub(r'\.pkl$', '_SVM_filtered.pkl', input_file)
target_ID.to_pickle(output_file)

  .apply(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(scoreColName, ascending=ascending, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[labelColName].replace(to_replace=-1, value=0, inplace=True)
