In [384]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from q_value_calc_crosslinks import calcQ
import matplotlib.pyplot as plt
from functions import get_target_id, get_datasets, rerank, get_top_indices, get_last_top_indices, get_nth_top_scan
import re
import os

In [385]:
# Position/key of the entry in `dataset` that every later cell processes (used as dataset[i]).
i = 2
# Number of train -> rescore passes performed by the training loop.
n_iters = 5
# When True and the dataset type is 'crosslink_data', training candidates must be the
# top hit of both their peptide group and their scan group.
pep_filter = True
# Strategy used to pick the negative (label 0) training examples; keep exactly one line active.
#decoy = "second" 
decoy = "bottom" 
#decoy = "last" 
#decoy = "bottom+last"
#decoy = "bottom+second" 
# Upper bound on the per-class candidate count; half of it is taken as targets and half as bottom decoys.
class_threshold = 500
# Dataset descriptors; later cells read the 'file', 'type', 'name' and 'group' fields of dataset[i].
dataset = get_datasets()

In [386]:
# Name of the pickled input for the selected dataset entry, and its location
# under ../data/<type>/<name>/.
input_filename = "{file}.pkl".format_map(dataset[i])
input_file = "../data/{type}/{name}/".format_map(dataset[i]) + input_filename

In [387]:
# read original dataframes
# NOTE: unpickling can execute arbitrary code, so input_file must come from a trusted source.
original_df = pd.read_pickle(input_file)

In [388]:
# Select the columns fed to the SVM, depending on the type of the chosen dataset.
if dataset[i]['type'] == 'crosslink_data':
    features = ['Score', 'peplen', 'NuXL:isXL', 'NuXL:modds', 'NuXL:pl_modds',
                'NuXL:mass_error_p', 'NuXL:tag_XLed', 'NuXL:tag_unshifted',
                'NuXL:tag_shifted', 'missed_cleavages', 'NuXL:ladder_score',
                'variable_modifications']
elif dataset[i]['type'] == 'top_down_data':
    features = ['Score', 'NumMass', 'MatchingFragments', 'Coverage(%)', 'TagCount',
                'ModCount', 'PrecursorQscore']
else:
    # Without this guard `features` would be undefined on a fresh kernel, or silently
    # reuse the list left over from a previous run with another dataset.
    raise ValueError(
        f"Unsupported dataset type {dataset[i]['type']!r}; "
        "expected 'crosslink_data' or 'top_down_data'"
    )

# Keep only the feature columns plus the bookkeeping columns used later in the notebook
# (DataFrame.filter silently ignores names that are not present), then order the rows
# by descending score.
bookkeeping_columns = ['ScanNr', 'rank', 'Label', 'PSMId', 'Peptide',
                       'class-specific_q-val', 'cum_target_id']
original_df = original_df.filter(features + bookkeeping_columns)
original_df = original_df.sort_values('Score', ascending=False)


In [None]:
 # initialise the k-fold cross validator
no_split = 5
kf = KFold(n_splits=no_split, shuffle=True, random_state=1)

# candidate values for the SVM regularisation parameter C (powers of two)
param_grid = {
    'svc__C': np.power(float(2), [-5, -1, 1, 5, 7, 11, 15]),
    #'svc__class_weight': [{1:0.3,0:0.7},{1:0.4,0:0.6},{1:0.5,0:0.5}]
}

# create the pipeline (min-max scaling followed by a linear SVM) and wrap it in a
# cross-validated grid search that refits the best model on all training rows
pipe = make_pipeline(MinMaxScaler(), SVC(kernel='linear'))
gs = GridSearchCV(pipe,
                  param_grid=param_grid,
                  n_jobs=-1,
                  scoring="accuracy",
                  cv=kf,
                  refit=True,
                  verbose=2)

# rows with original_df[filter_col] == filter_val are the ones q-values are computed on
filter_col = 'PSMId'
filter_val = 1

# Fail early on a typo in `decoy`; otherwise `class_bottom` would be undefined (or stale
# from a previous run) inside the loop.
valid_decoy_strategies = ("bottom", "last", "second", "bottom+last", "bottom+second")
if decoy not in valid_decoy_strategies:
    raise ValueError(f"Unknown decoy strategy {decoy!r}; expected one of {valid_decoy_strategies}")


def _select_decoys(strategy, df, class_sorted, class_top, n_bottom):
    """Return the index labels used as negative (label 0) training examples for one class.

    Parameters
    ----------
    strategy : one of `valid_decoy_strategies`; components joined by "+" are concatenated
        in the order they are written.
    df : the full PSM frame (must contain 'ScanNr' and 'Score').
    class_sorted : the class's candidate rows, sorted by descending 'Score'.
    class_top : index labels already chosen as positive examples for this class.
    n_bottom : number of lowest-scoring candidates taken by the "bottom" component.

    Returns
    -------
    A pandas Index for a single-component strategy, a NumPy array for a combined one.
    """
    parts = []
    for component in strategy.split("+"):
        if component == "bottom":
            # .tail(n) is empty for n == 0, unlike the slice [-0:], which selects every row
            parts.append(class_sorted.tail(n_bottom).index)
            continue
        # "last" / "second" draw their decoys from the scans that produced the targets
        top_scans = df.loc[class_top, 'ScanNr']
        same_scan_rows = df[df['ScanNr'].isin(top_scans)]
        if component == "last":
            # presumably the lowest-ranked row of each scan -- confirm against get_nth_top_scan
            picked = get_nth_top_scan(same_scan_rows, n=-1, group_col="ScanNr", score_col="Score").index
            # a scan's last row may be its top row; never use a target as its own decoy
            picked = picked.difference(class_top)
        elif component == "second":
            # presumably the second-ranked row of each scan -- confirm against get_nth_top_scan
            picked = get_nth_top_scan(same_scan_rows, n=1, group_col="ScanNr", score_col="Score").index
        else:
            raise ValueError(f"Unknown decoy component {component!r}")
        parts.append(picked)
    if len(parts) == 1:
        return parts[0]
    return np.concatenate([np.asarray(part) for part in parts])


train_idx_list_bottom = []
train_idx_list_top = []
class_weights = []  # despite the name, this collects gs.best_params_ of every iteration
# keep a copy of the pre-SVM values so they can be compared with each iteration later
original_df['Score_no_svm'] = original_df['Score']
original_df["PSMId_no_svm"] = original_df['PSMId']
original_df["class-specific_q-val_no_svm"] = original_df['class-specific_q-val']
original_df["cum_target_id_no_svm"] = original_df['cum_target_id']
for iteration in range(1, n_iters + 1):

    # sort dataframe according to Score column
    original_df = original_df.sort_values('Score', ascending=False)
    if dataset[i]['type'] == 'crosslink_data':
        if pep_filter:
            # candidates must be the best row of both their peptide and their scan
            peptides_top_indices = get_nth_top_scan(original_df, n=0, group_col="Peptide", score_col="Score").index
            psm_top_indices = get_nth_top_scan(original_df, n=0, group_col="ScanNr", score_col="Score").index
            top_indices = peptides_top_indices.intersection(psm_top_indices)
        else:
            top_indices = get_nth_top_scan(original_df, n=0, group_col="ScanNr", score_col="Score").index
    elif dataset[i]['type'] == 'top_down_data':
        top_indices = pd.Index(get_top_indices(original_df, "ScanNr", "Score"))
    else:
        raise ValueError(f"Unsupported dataset type {dataset[i]['type']!r}")

    # determine minority class (the smallest number of candidates over all classes)
    classes = []
    for c in np.unique(original_df[dataset[i]['group']]):
        group_indices = original_df.loc[(original_df[dataset[i]['group']] == c), :].index
        filtered_indices = top_indices.intersection(group_indices)
        classes.append(len(original_df.loc[filtered_indices, :]))
    minority_class = min(classes)

    if (minority_class > class_threshold):
        print("Truncating to " + str(class_threshold) + "\n")
        minority_class = class_threshold

    # half of the budget goes to targets, half to "bottom" decoys
    n_per_side = int(minority_class / 2)
    if n_per_side < 1:
        raise ValueError(
            f"Smallest class has only {minority_class} candidate row(s); "
            "at least 2 are needed to pick training targets and decoys"
        )

    # define training data (top and bottom scores of each class with PSMId = 1)
    train_parts = []
    bottom_parts = []
    top_parts = []
    original_df['train_label'] = np.nan  # np.NaN was removed in NumPy 2.0
    original_df = original_df.sort_values('Score', ascending=False)

    for c in np.unique(original_df[dataset[i]['group']]):
        group_indices = original_df.loc[(original_df[dataset[i]['group']] == c), :].index
        filtered_indices = top_indices.intersection(group_indices)
        class_sorted = original_df.loc[filtered_indices, :].sort_values('Score', ascending=False)
        # define targets as PSMs with top scores
        class_top = class_sorted.head(n_per_side).index
        # define decoys with the configured method
        class_bottom = _select_decoys(decoy, original_df, class_sorted, class_top, n_per_side)
        original_df.loc[class_top, 'train_label'] = 1
        original_df.loc[class_bottom, 'train_label'] = 0

        # Row order feeds the shuffled KFold split, so keep the per-class
        # [targets, decoys] interleaving.
        train_parts.extend([np.asarray(class_top), np.asarray(class_bottom)])
        bottom_parts.append(np.asarray(class_bottom))
        top_parts.append(np.asarray(class_top))

    # Concatenating once (rather than growing from an empty list) keeps the index dtype
    # instead of silently coercing integer labels to float64.
    train_idx = np.concatenate(train_parts)
    train_idx_list_bottom_iter = np.concatenate(bottom_parts)
    train_idx_list_top_iter = np.concatenate(top_parts)

    train_idx_list_bottom.append(train_idx_list_bottom_iter)
    train_idx_list_top.append(train_idx_list_top_iter)

    # fit SVM
    gs.fit(original_df.loc[train_idx, features], original_df.loc[train_idx, 'train_label'])
    class_weights.append(gs.best_params_)

    # compute new score (signed distance to the SVM decision boundary) for every row
    original_df['Score'] = gs.decision_function(original_df.loc[:, features])
    original_df[f'Score_{iteration}'] = original_df['Score']

    # rerank PSMs
    if dataset[i]['type'] != 'top_down_data':
        original_df = rerank(original_df, "ScanNr", "Score", "PSMId")
    original_df[f"PSMId_{iteration}"] = original_df['PSMId']

    # index labels of the rows with PSMId == 1 (presumably the best row per scan after
    # reranking -- confirm against rerank()); note this is an Index, not a boolean mask
    mask = original_df[original_df[filter_col] == filter_val].index

    # compute q-values
    original_df.loc[mask, :] = calcQ(original_df.loc[mask, :], classColName=dataset[i]['group'])
    original_df.loc[mask, f"class-specific_q-val_{iteration}"] = original_df.loc[mask, 'class-specific_q-val']

    # compute target IDs
    original_df.loc[mask, :] = get_target_id(original_df.loc[mask, :], isXLColName=dataset[i]['group'])
    original_df.loc[mask, f"cum_target_id_{iteration}"] = original_df.loc[mask, 'cum_target_id']

Truncating to 500

Fitting 5 folds for each of 7 candidates, totalling 35 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)


Truncating to 500

Fitting 5 folds for each of 7 candidates, totalling 35 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)


Truncating to 500

Fitting 5 folds for each of 7 candidates, totalling 35 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)


Truncating to 500

Fitting 5 folds for each of 7 candidates, totalling 35 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)


Truncating to 500

Fitting 5 folds for each of 7 candidates, totalling 35 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)


In [382]:
# Render the rescored frame, including the per-iteration Score_/PSMId_/q-value/target-id
# columns added by the training loop.
display(original_df)

Unnamed: 0,Score,peplen,NuXL:isXL,NuXL:modds,NuXL:pl_modds,NuXL:mass_error_p,NuXL:tag_XLed,NuXL:tag_unshifted,NuXL:tag_shifted,missed_cleavages,...,class-specific_q-val_3,cum_target_id_3,Score_4,PSMId_4,class-specific_q-val_4,cum_target_id_4,Score_5,PSMId_5,class-specific_q-val_5,cum_target_id_5
0,-7.883586,7,0,7.697114,0.000000,0.974126,0,1,0,0,...,0.472727,1155.0,-4.492309,1,0.550273,1283.0,-7.883586,1,0.617959,1225.0
2,-16.017823,7,1,10.708378,2.146140,0.995740,3,2,1,0,...,0.372253,3049.0,-6.677674,1,0.361931,2879.0,-16.017823,1,0.310727,2433.0
4,-17.428850,8,1,6.421797,2.126318,0.995740,0,2,1,0,...,,,-8.029241,2,,,-17.428850,2,,
1,-21.265463,18,1,10.708378,2.208735,0.995740,3,2,1,0,...,,,-8.427393,3,,,-21.265463,3,,
3,-22.676758,19,1,6.421797,2.187323,0.995740,0,2,1,0,...,,,-9.779098,4,,,-22.676758,4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87046,-32.537697,36,1,1.920601,0.989611,0.327290,0,1,0,0,...,0.628677,5133.0,-12.059326,1,0.627332,4932.0,-32.537697,1,0.627997,4672.0
87045,-43.854320,46,1,1.940972,0.989611,0.059752,0,1,0,2,...,,,-14.946619,2,,,-43.854320,2,,
87047,-29.985017,14,1,6.584142,1.194499,0.087589,0,1,0,2,...,0.625538,5114.0,-10.655624,1,0.623008,4894.0,-29.985017,1,0.626612,4652.0
87048,-30.205372,31,1,2.624601,1.157999,0.881045,0,1,0,1,...,0.628844,5138.0,-12.458483,1,0.627582,4938.0,-30.205372,1,0.627202,4654.0


In [383]:
# Save the class column, the label and the per-iteration results.
# Output location: ../data/<type>/<name>/<class_threshold>/<pep_unique|default>/[decoy_<decoy>/]
directory = f"../data/{dataset[i]['type']}/{dataset[i]['name']}/{class_threshold}/"
if pep_filter and dataset[i]['type'] == 'crosslink_data':
    directory += "pep_unique/"
else:
    directory += "default/"
if decoy != 'bottom':
    directory += f"decoy_{decoy}/"
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(directory, exist_ok=True)

# Replace only the literal ".pkl" extension at the end of the name; the previous pattern
# '.pkl' matched any character followed by "pkl" anywhere in the file name.
output_file = re.sub(r'\.pkl$', "_svm.pkl", input_filename)
path = directory + output_file

# 'NuXL:isXL' only exists for crosslink data; fall back to the dataset's group column
# otherwise so the class information is still saved instead of raising a KeyError.
class_column = 'NuXL:isXL' if 'NuXL:isXL' in original_df.columns else dataset[i]['group']
output_columns = [class_column, 'Label']
# one block of four result columns for the pre-SVM state and for each of the n_iters passes
for suffix in ['no_svm'] + [str(iteration) for iteration in range(1, n_iters + 1)]:
    output_columns += [f'Score_{suffix}', f'PSMId_{suffix}',
                       f'class-specific_q-val_{suffix}', f'cum_target_id_{suffix}']
# drop repeats (e.g. if the group column is 'Label') while keeping the column order
output_columns = list(dict.fromkeys(output_columns))

original_df.loc[:, output_columns].to_pickle(path)