In [9]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from q_value_calc_crosslinks import calcQ
import matplotlib.pyplot as plt
from functions import get_target_id, get_datasets, rerank
import re

In [10]:
i = 2
filter_PSMId = True
n_iters = 5
dataset = get_datasets()

In [11]:
input_file = f"../data/{dataset[i]['type']}/{dataset[i]['file']}.pkl" 


In [12]:
# read original dataframes
original_df = pd.read_pickle(input_file)

In [13]:
if dataset[i]['type'] == 'crosslink_data':

    features = ['Score','peplen', 'NuXL:isXL', 'NuXL:modds', 'NuXL:pl_modds', 
                    'NuXL:mass_error_p', 'NuXL:tag_XLed', 'NuXL:tag_unshifted' ,
                    'NuXL:tag_shifted', 'missed_cleavages', 'NuXL:ladder_score',
                    'variable_modifications']
elif dataset[i]['type'] == 'top_down_data':
    features = ['Score', 'NumMass', 'MatchingFragments', 'Coverage(%)', 'TagCount', 'ModCount', 'PrecursorQscore']

# filter data and sort according to descending score
original_df = original_df.filter(np.concatenate([features,['ScanNr', 'rank', 'Label', 'PSMId']]))
original_df.sort_values('Score',ascending=False, inplace=True)


In [14]:
# determine minority class 
classes = []
for c in np.unique(original_df[dataset[i]['group']]):
    classes.append(len(original_df.loc[original_df[dataset[i]['group']] == c]))
minority_class = min(classes)
class_threshold = 500
if (minority_class > class_threshold):
    print("Truncating to " + str(class_threshold) + "\n") 
    minority_class = class_threshold



Truncating to 500



In [None]:
 # initialise the k-fold cross validator
no_split = 5
kf = KFold(n_splits=no_split, shuffle=True, random_state=1)

param_grid = {
        'C': np.power(float(2), [-5,-1,1,5,7,11,15])
    }
# create the pipeline
pipe = make_pipeline(MinMaxScaler(), 
                        GridSearchCV(
                            estimator=SVC(kernel='linear', 
                                          probability=True
                                          ), 
                            param_grid=param_grid, 
                            n_jobs=-1,
                            scoring="accuracy",
                            cv=kf, 
                            refit=True))
# filter dataframes
filter_col = 'PSMId'
filter_val = 1
for iteration in range(1,n_iters + 1):

   
    original_df.sort_values('Score',ascending=False, inplace=True)
    # define training data (peptides with top and bottom scores of each class with PSMId = 1)
    if filter_PSMId == True:
        filtered = (original_df[filter_col] == filter_val)
    else:
        filtered = True
    train_idx = []
    original_df['train_label'] = np.NaN
    for c in np.unique(original_df[dataset[i]['group']]):
        class_top = original_df.loc[(original_df[dataset[i]['group']] == c) &  filtered][:int(minority_class/2)]
        class_bottom = original_df.loc[(original_df[dataset[i]['group']] == c) &  filtered][-int(minority_class/2):]
        
        original_df.loc[class_top.index,'train_label'] = 1
        original_df.loc[class_bottom.index,'train_label'] = 0
        
        train_idx = np.concatenate([train_idx, class_top.index, class_bottom.index])
     

    # fit SVM
    pipe.fit(original_df.loc[train_idx, features], original_df.loc[train_idx, 'train_label'])
    original_df[f'Score_{iteration - 1}'] = original_df['Score']
    original_df.sort_values(['Score'], ascending=[False], inplace=True)
    original_df['Score'] = 1.0 - pipe.predict_proba(original_df.loc[:,features])

    if dataset[i]['type'] == 'crosslink_data':
        # rerank PSMs
        original_df[f"PSMId_{iteration - 1}"] = original_df['PSMId']
        original_df = rerank(original_df, "ScanNr", "Score", "PSMId")

    # save entire dataframe
    output_file = re.sub('.pkl', f"_SVM_iter_{iteration}.pkl", input_file)
    original_df.to_pickle(output_file)
    original_df_filtered_new = original_df
    if dataset[i]['type'] == 'crosslink_data':
        # filter for rank = 0
        original_df_filtered_new = original_df.loc[original_df[filter_col] == filter_val,:]
    
    # compute q-values
    q_vals_SVM = calcQ(original_df_filtered_new, classColName=dataset[i]['group'])
    target_ID = get_target_id(q_vals_SVM)
    output_file = re.sub('.pkl', f"_SVM_filtered_iter_{iteration}.pkl", input_file)
    target_ID.to_pickle(output_file) # save filtered dataframe

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(scoreColName, ascending=ascending, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[labelColName].replace(to_replace=-1, value=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[labelColName].replace(to_replace=-1, value=0, inplace=True)
A value is

In [None]:
    #if 'NuXL:isXL' in original_df.columns:
    #    pep_top = original_df.loc[(original_df['NuXL:isXL'] == 0) & (original_df[filter_col] == filter_val) ][:int(minority_class/2)]
    #    pep_bottom = original_df.loc[(original_df['NuXL:isXL'] == 0) & (original_df[filter_col] == filter_val) ][-int(minority_class/2):]
    #    XL_top = original_df.loc[(original_df['NuXL:isXL'] == 1) & (original_df[filter_col] == filter_val)][:int(minority_class/2)]
    #    XL_bottom = original_df.loc[(original_df['NuXL:isXL'] == 1) & (original_df[filter_col] == filter_val)][-int(minority_class/2):]
    #    train_idx = np.concatenate([pep_top.index, pep_bottom.index, XL_top.index, XL_bottom.index])
    #    train_idx_list.append(train_idx)
    #    
        # set train labels for training
    #    original_df['train_label'] = np.NaN
    #    original_df.loc[pep_top.index,'train_label'] = 1
    #    original_df.loc[pep_bottom.index,'train_label'] = 0
    #    original_df.loc[XL_top.index,'train_label'] = 1
    #    original_df.loc[XL_bottom.index,'train_label'] = 0
    #else: 
    #    top =  original_df[:int(minority_class/2)]
    #    bottom = original_df[-int(minority_class/2):]
    #    train_idx = np.concatenate([top.index, bottom.index])#

    #    original_df['train_label'] = np.NaN
    #    original_df.loc[top.index,'train_label'] = 1
    #    original_df.loc[bottom.index,'train_label'] = 0
        
   