In [None]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Code to collate all comparison scores for TCR clustering benchmarking across different labelled reference data sets, spike-ins of irrelevant TCRs, and clustering methods.

In [1]:
import os
import glob
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns

import matplotlib as mpl
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt


### Collect ismart, tcrdist and tcrvalid scoring

In [2]:
base_path = "./Spikein_scores/"

# One sub-folder of score CSVs per entry; every CSV in a folder is stacked together.
folder=['ismartreference_gliph','ismartreference_tcrvalid','tcrdistreference_gliph','tcrdistreference_tcrvalid','tcrvalidreference_gliph','tcrvalidreference_tcrvalid','ismartreference_gliph_HQ','tcrdistreference_gliph_HQ','tcrvalidreference_gliph_HQ']

per_approach_scores = []
for approach in folder:
    # Sorted so the row order (and the column order taken from the first file)
    # does not depend on the filesystem's listing order.
    files = sorted(glob.glob(base_path+approach+"/*.csv"))
    if not files:
        # pd.concat on an empty list fails with an uninformative message;
        # name the folder that is missing its inputs instead.
        raise FileNotFoundError("No score CSVs found in " + base_path + approach)
    temp_df = pd.concat([pd.read_csv(filename, index_col=None) for filename in files])
    if 'eps' in temp_df.columns:
        # Min-max scale eps to [0, 1] using the min/max over this folder's rows.
        eps_min = temp_df['eps'].min()
        eps_max = temp_df['eps'].max()
        temp_df['epsilon_scaled'] = (temp_df['eps'] - eps_min) / (eps_max - eps_min)
    per_approach_scores.append(temp_df)

# DataFrame.append was removed in pandas 2.0; concatenate all folders in one call.
master_score = pd.concat(per_approach_scores)

master_score = master_score.rename(columns={"well_clustered_total": "c-CSI","well_clustered": "c-precision"})
master_score['chain']='TRB'
# Remove the two leading columns by position (the new 'chain' column is appended
# at the end, so positions 0 and 1 are unaffected by it).
master_score = master_score.drop(columns=master_score.columns[[0,1]])

### Compute mean and standard deviation of spike-in repeats for each fold

In [3]:
# Columns that together identify one parameter setting; rows sharing all of
# them are treated as repeats and summarised.
score_group_keys = ['spike_x','minsize','eps','label_reference','method','epsilon_scaled','chain']

# One grouped pass with named aggregations yields the mean and standard
# deviation of both scores, already aligned row-for-row. Rows with a missing
# value in any key column are excluded by groupby's default dropna behaviour.
master_score_meta = (
    master_score
    .groupby(score_group_keys)
    .agg(
        mean_CSI=('c-CSI', 'mean'),
        mean_precision=('c-precision', 'mean'),
        std_precision=('c-precision', 'std'),
        std_CSI=('c-CSI', 'std'),
    )
    .reset_index()
)


### Collate clustcr outputs and average over spike-in repeats. clustcr has no eps parameter to scan.

In [4]:
clustcr_Output_scored = pd.read_csv('./Spikein_scores/clustcr_scored_df.csv', index_col=None)
clustcr_Output_scored['chain']='TRB'
# Drop this frame's own two leading columns by position. Looking the labels up
# on a different DataFrame would make the result depend on that frame's
# current column layout rather than on the clustcr CSV.
clustcr_Output_scored = clustcr_Output_scored.drop(columns=clustcr_Output_scored.columns[[0,1]])

# There is no eps column in the grouping: repeats are identified by spike-in
# level, minimum cluster size, label reference and method only.
clustcr_group_keys = ['spike_x','minsize','label_reference','method']

# Single grouped pass producing mean and standard deviation of both scores.
clustcr_Output_means = (
    clustcr_Output_scored
    .groupby(clustcr_group_keys)
    .agg(
        mean_CSI=('c-CSI', 'mean'),
        mean_precision=('c-precision', 'mean'),
        std_CSI=('c-CSI', 'std'),
        std_precision=('c-precision', 'std'),
    )
    .reset_index()
)
clustcr_Output_means['chain']='TRB'
                                           

### Finally, collate formatted scoring for both the GLIPH webtool and deeptcr, with and without double-assignment corrections.

In [5]:
# --- GLIPH webtool scores ---------------------------------------------------
GLIPH_Output_scored = pd.read_csv('./Spikein_scores/Gliph_webtool_scored_df_withHQ_score.csv', index_col=None)
# Positional indices of the CSV columns that are discarded.
gliph_discarded_positions = [0,1,2,3,4,5,6,12]
GLIPH_Output_scored = (
    GLIPH_Output_scored
    .drop(columns=GLIPH_Output_scored.columns[gliph_discarded_positions])
    # Rename the score columns to the mean_* names.
    .rename(columns={"c-CSI": "mean_CSI", "c-precision": "mean_precision"})
    .assign(chain='TRB')
)

# --- deeptcr scores ---------------------------------------------------------
deeptcr_Output_scored = pd.read_csv('./Spikein_scores/deeptcr_scored_df.csv', index_col=None)
# spike_x and feature are set to 0 for every deeptcr row before the leading
# column is removed, so the positional drop sees the same layout as the CSV.
deeptcr_Output_scored = deeptcr_Output_scored.assign(spike_x=0, feature=0)
deeptcr_Output_scored = (
    deeptcr_Output_scored
    .drop(columns=deeptcr_Output_scored.columns[[0]])
    .rename(columns={"well_clustered_total": "mean_CSI", "well_clustered": "mean_precision"})
    .assign(label_reference='tcrvalid')
)

### Save both the full spike-in master score dataframe and the averaged one for downstream plotting
- Some eps values were dropped from the scans because they caused the scan to be non-monotonic, which complicated visualizations

In [6]:
# Stack the per-method summary tables. DataFrame.append was removed in
# pandas 2.0; a flat concat gives the same row and column order as the
# nested appends it replaces.
master_score_meta_f = pd.concat(
    [master_score_meta, clustcr_Output_means, GLIPH_Output_scored, deeptcr_Output_scored]
)

# Rows are removed with boolean masks rather than .drop(<index labels>): each
# stacked table keeps its own 0..n-1 index, so labels repeat across tables and
# dropping by label would also delete unrelated rows that share a label.

# tcrvalid scored against gliph_HQ, minsize 3, no spike-in: exclude eps 3.16 and 5.43.
exclude_tcrvalid_hq = (
    master_score_meta_f['eps'].isin([3.16,5.43])
    & (master_score_meta_f['minsize']==3)
    & (master_score_meta_f['spike_x']==0)
    & (master_score_meta_f['label_reference']=='gliph_HQ')
    & (master_score_meta_f['method']=='tcrvalid')
)
master_score_meta_f = master_score_meta_f[~exclude_tcrvalid_hq]

# tcrdist scored against gliph, minsize 3, no spike-in: exclude eps 16 and 34.
exclude_tcrdist_gliph = (
    master_score_meta_f['eps'].isin([16,34])
    & (master_score_meta_f['minsize']==3)
    & (master_score_meta_f['spike_x']==0)
    & (master_score_meta_f['label_reference']=='gliph')
    & (master_score_meta_f['method']=='tcrdist')
)
master_score_meta_f = master_score_meta_f[~exclude_tcrdist_gliph]

master_score_meta_f = master_score_meta_f.rename(columns={"mean_CSI":"c-CSI","mean_precision":"c-precision"})

master_score_meta_f.to_csv('master_score_meta_f.csv', sep =',')


In [27]:
# Reload the collated mean-score table from disk. No index_col is passed, so an
# index column stored in the CSV comes back as an ordinary data column.
# NOTE(review): presumably this is the file written by the collation cell, but
# that cell writes to the working directory while this reads from
# '../comparitor_tooling/' — confirm the two paths refer to the same file.
master_score_meta_f = pd.read_csv('../comparitor_tooling/master_score_meta_f.csv')

In [28]:
transformer_df = pd.read_csv('../results_data/comparison_scores_221012.csv')

# Keep tcr-bert / ESM rows with mean_features set and sources == 'both',
# excluding the ESM + CDR3 combination.
keep_rows = (
    transformer_df['method'].isin(['tcr-bert','ESM'])
    & (transformer_df['mean_features'] == True)
    & (transformer_df['sources'] == 'both')
    & ~(
        (transformer_df['method'] == 'ESM')
        & (transformer_df['feature'] == 'CDR3')
    )
)
# Explicit copy so the column assignments below write to an independent frame
# (assigning onto a filtered slice raises SettingWithCopyWarning).
transformer_df = transformer_df[keep_rows].copy()

transformer_df['label_reference'] = 'tcrvalid'
transformer_df['spike_x'] = 0
transformer_df['chain'] = transformer_df['chains']

# Largest epsilon per method among the retained rows.
scaling_map = {
    m: transformer_df.loc[transformer_df['method'] == m, 'epsilon'].max()
    for m in ['tcr-bert','ESM']
}
# Vectorised per-method scaling: each row's epsilon divided by its method's maximum.
transformer_df['epsilon_scaled'] = (
    transformer_df['epsilon'] / transformer_df['method'].map(scaling_map)
)
transformer_df = transformer_df.rename(columns={"epsilon":"eps"})

# Note: this rebinds master_score to the collated table plus the transformer rows.
master_score = pd.concat((master_score_meta_f, transformer_df))


In [29]:
# Load the 1HotvsPC score table and rename its columns to the names used in master_score.
concatenated_seq_encoding = (
    pd.read_csv('../comparitor_tooling/1HotvsPC_score.csv')
    .rename(columns={
        "epsilon": "eps",
        'reference': 'label_reference',
        'Method': 'method',
    })
)
# Append these rows beneath the existing master_score rows.
master_score = pd.concat([master_score, concatenated_seq_encoding])

In [31]:
# Write the final combined score table; to_csv separates fields with commas by default.
master_score.to_csv('master_score.csv')
