In [None]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
import sys
# note: git clone clusTCR into lib using setup_tool.sh
sys.path.append('lib/clusTCR')

In [8]:
from clustcr import Clustering

import pandas as pd
import numpy as np
import time
import os
import sys
from os.path import dirname as up
from pathlib import Path

sys.path.append('../..')
from tcrvalid.data_subsetting import *
from tcrvalid.metrics import clustering_scoring

sys.path.append('../distance_based_tools')
from tcrclustering.cluster.prep_gliph_data import *

from matplotlib import pyplot as plt

In [4]:
# Antigen-labelled reference TCRs, read from the tcrvalid data directory.
labelled_data_path = '../../tcrvalid/data/antigen_reference_tcrs.csv'
df = pd.read_csv(labelled_data_path)

# Preset selections looked up in the lookup tables brought in by the star-imports above.
sources, chains, feature = sources_['both'], chains_['TRB'], features_['CDR3']

# Labelled subset used as the un-spiked baseline; spike_score() reads this
# module-level name directly when called with multiple == 0.
subset_df = make_subset(
    df,
    sources=sources,
    chains=chains,
    feature=feature,
    min_size=2,
    max_len=28,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df.junction_aa.map(lambda x: x[1:-1])


In [5]:
def clean_multiassigned(df_cl):
    """Give every sequence a single cluster and unassign singleton clusters.

    Parameters
    ----------
    df_cl : pandas.DataFrame
        Cluster assignments with a 'junction_aa' and a 'cluster' column; a
        sequence may appear on several rows, once per cluster it was put in.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'junction_aa' (sorted by sequence) with columns
        'junction_aa', 'all_clusters' (list of every cluster the sequence was
        assigned to), 'cluster' (the retained cluster, or -1.0 when that
        cluster ends up holding a single sequence) and 'occupancy' (number of
        distinct sequences retained in that cluster).
    """
    # Size of each cluster in the raw assignment table (rows with a non-null sequence).
    cluster_size_map = df_cl.groupby('cluster')['junction_aa'].count().to_dict()

    def choose_biggest_cluster(x):
        # max() returns the first maximal element, so ties go to the cluster
        # that is listed first for the sequence.
        return max(x, key=cluster_size_map.__getitem__)

    # Select the 'cluster' column before applying so the grouping column is not
    # part of the applied object (DataFrameGroupBy.apply on grouping columns is
    # deprecated in recent pandas).
    clone_to_cluster_df = (
        df_cl.groupby('junction_aa')['cluster']
        .apply(list)
        .rename('all_clusters')
        .to_frame()
    )
    clone_to_cluster_df['cluster'] = clone_to_cluster_df['all_clusters'].map(
        lambda x: x[0] if len(x) == 1 else choose_biggest_cluster(x)
    )
    clone_to_cluster_df = clone_to_cluster_df.reset_index()

    # remove remaining singletons (assign to unclustered)
    df_occ = (
        clone_to_cluster_df.groupby('cluster')['junction_aa']
        .count()
        .rename('occupancy')
        .reset_index()
    )
    clone_to_cluster_df = clone_to_cluster_df.merge(df_occ, how='left', on='cluster')
    clone_to_cluster_df.loc[clone_to_cluster_df.occupancy == 1, 'cluster'] = -1.0

    return clone_to_cluster_df

In [6]:
def _cluster_and_score(df_seqs, seq_col):
    """Cluster ``df_seqs[seq_col]`` with ClusTCR and score the clustering twice.

    The cluster table is de-duplicated with ``clean_multiassigned``, merged
    back onto ``df_seqs`` (outer join on ``seq_col``), and rows without a
    cluster are labelled -1.0. Scoring uses the 'peptide' column of
    ``df_seqs`` as the epitope label.

    Returns
    -------
    tuple
        ``(score_min2, score_min3)``: the first element returned by
        ``clustering_scoring`` for the clustering as-is, and for a copy in
        which clusters covering exactly two rows are relabelled -1.
    """
    cltcr = Clustering(n_cpus=8)
    outs = cltcr.fit(df_seqs[seq_col].values)

    df_cl = clean_multiassigned(outs.clusters_df)
    df_cl = df_cl.rename(columns={'junction_aa': seq_col})
    df_cl = df_seqs.merge(df_cl, on=seq_col, how='outer')
    # Assign the filled column back explicitly: `df_cl.cluster.fillna(..., inplace=True)`
    # is a chained in-place update that does not modify df_cl under pandas
    # Copy-on-Write.
    df_cl['cluster'] = df_cl['cluster'].fillna(-1.0)
    df_cl['freq_count'] = df_cl.groupby('cluster')['cluster'].transform('count')

    # Variant for the minimum-cluster-size-3 score: dissolve two-row clusters.
    df_cl3 = df_cl.copy()
    df_cl3.loc[df_cl3.freq_count == 2, 'cluster'] = -1

    score_min2 = clustering_scoring(
        df_cl,
        df_cl,
        epitope_col='peptide',
        cluster_col='cluster'
    )[0]
    score_min3 = clustering_scoring(
        df_cl3,
        df_cl3,
        epitope_col='peptide',
        cluster_col='cluster'
    )[0]
    return score_min2, score_min3


def spike_score(cluster_method,reference,multiple,minsize=2,spikein_hash=True):
    """Score ClusTCR clusterings of labelled TCRs with spiked-in sequences.

    Parameters
    ----------
    cluster_method : str
        Label written to the 'method' column and used in the output path; it
        does not select the clustering algorithm (ClusTCR is always used).
    reference : str
        'tcrvalid' or 'gliph'; selects the labelled reference set that the
        spike-in TCRs are concatenated with and is written to the
        'label_reference' column.
    multiple : int
        Spike-in multiple (1x, 2x, ...). 0 scores the un-spiked baseline.
    minsize : int, optional
        Currently unused; scores are always produced for minimum cluster
        sizes 2 and 3. Kept for call compatibility.
    spikein_hash : bool, optional
        If true, every spiked-in row gets its own row hash as 'peptide' label;
        otherwise the spiked-in rows get NaN.

    Returns
    -------
    pandas.DataFrame
        One row per (fold, minsize) holding the scores plus the 'method',
        'minsize', 'spike_x', 'label_reference' and 'fold' columns.

    Raises
    ------
    ValueError
        If ``reference`` is not recognised and ``multiple`` is non-zero.

    Notes
    -----
    Prints the output directory and file name, creates the directory if
    needed, and writes the returned frame to
    ``../Spikein_scores/<cluster_method>reference_<reference>/Spikein<multiple>.csv``.
    """
    if reference=='tcrvalid':
        df_ref = get_data('TRB', False)
    elif reference=='gliph':
        df_ref = prep_gliph_data('TRB','ref')
    elif multiple!=0:
        # Previously this surfaced later as an UnboundLocalError on df_ref.
        raise ValueError(
            f"unknown reference {reference!r}; expected 'tcrvalid' or 'gliph'"
        )

    if multiple==0:
        # Baseline: clusters the module-level `subset_df` on 'junction_aa'.
        # NOTE(review): `reference` only labels the output in this branch; the
        # clustered data is the same for every reference - confirm intended.
        score_min2, score_min3 = _cluster_and_score(subset_df.copy(), 'junction_aa')

        rows = []
        for min_size, scores in ((2, score_min2), (3, score_min3)):
            row = pd.DataFrame(scores, index=[cluster_method])
            row['method'] = cluster_method
            # reset_index() keeps the method label in an 'index' column as well.
            row = row.reset_index()
            row['minsize'] = min_size
            row['spike_x'] = multiple
            row['label_reference'] = reference
            row['fold'] = 0
            rows.append(row)
        df_score = pd.concat(rows)

    else:
        spike_dir = up(up(os.getcwd())) + '/tcrvalid/data/Spike_splits/'
        base_name = 'S' + str(multiple)

        rows = []
        for fold in range(1, 11):
            import_name = spike_dir + base_name + '_' + str(fold) + '.csv'
            df_merge = prep_gliph_data('TRB', import_name)
            if spikein_hash:
                # Unique pseudo-label per spiked-in row.
                df_merge['peptide'] = pd.util.hash_pandas_object(df_merge).astype(str)
            else:
                df_merge['peptide'] = np.nan
            df_merge['spike_fold'] = fold

            # Merge the antigen labelled reference with the spike in TCRs
            # (only the columns both frames share are kept).
            df_combined = pd.concat([df_ref, df_merge], join='inner', ignore_index=True)
            score_min2, score_min3 = _cluster_and_score(df_combined, 'cdr3_TRB')

            row2 = pd.DataFrame(score_min2, index=[fold])
            row2['fold'] = fold
            row2['method'] = cluster_method
            row2['minsize'] = 2
            row2['spike_x'] = multiple
            row2['label_reference'] = reference

            row3 = pd.DataFrame(score_min3, index=[fold])
            row3['fold'] = fold
            row3['method'] = cluster_method
            # Only the minsize=3 row is reset_index()-ed, so the saved table has
            # an 'index' column that is NaN for minsize=2 rows. Kept as-is to
            # leave the saved CSV layout unchanged.
            row3 = row3.reset_index()
            row3['minsize'] = 3
            row3['spike_x'] = multiple
            row3['label_reference'] = reference

            rows.extend([row2, row3])

        # Single concat instead of growing the frame inside the loop.
        df_score = pd.concat(rows, ignore_index=True)

    directory_name = '../Spikein_scores/'+cluster_method+'reference_'+reference
    print(directory_name)
    Path(directory_name).mkdir(parents=True, exist_ok=True)
    save_name = directory_name+'/Spikein'+str(multiple)+'.csv'
    print(save_name)
    df_score.to_csv(save_name)
    return df_score


In [11]:
# Spike-in multiples to score; 0 is the un-spiked baseline.
multiple_range = [0, 1, 2, 3, 4, 5]

cluster_tool = 'clustcr'
reference = ['tcrvalid', 'gliph']

# Collect the per-run score tables and concatenate once at the end rather than
# re-copying a growing DataFrame on every iteration.
score_frames = []
for ref in reference:
    for multiples in multiple_range:
        test = spike_score(cluster_method=cluster_tool, minsize=2, reference=ref, multiple=multiples)
        score_frames.append(test)
        print(str(multiples))
test_df = pd.concat(score_frames, ignore_index=True)

# Rename the two score columns to the names used for reporting.
master_score = test_df.rename(columns={"well_clustered_total": "c-CSI", "well_clustered": "c-precision"})

master_score.to_csv('../Spikein_scores/clustcr_scored_df.csv')

Clustering 2909 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 0.421s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein0.csv
0
Clustering 6275 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 0.519s
Clustering 6276 TCRs using two-step approach.
Total time to run ClusTCR: 0.634s
Clustering 6276 TCRs using two-step approach.
Total time to run ClusTCR: 0.528s
Clustering 6275 TCRs using two-step approach.
Total time to run ClusTCR: 0.521s
Clustering 6275 TCRs using two-step approach.
Total time to run ClusTCR: 0.625s
Clustering 6274 TCRs using two-step approach.
Total time to run ClusTCR: 0.546s
Clustering 6274 TCRs using two-step approach.
Total time to run ClusTCR: 0.521s
Clustering 6274 TCRs using two-step approach.
Total time to run ClusTCR: 0.552s
Clustering 6275 TCRs using two-step approach.
Total time to run ClusTCR: 0.669s
Clustering 6276 TCRs using two-step approach.
Total time to run ClusTCR: 0.521s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein1.csv
1
Clustering 9539 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 0.686s
Clustering 9540 TCRs using two-step approach.
Total time to run ClusTCR: 0.962s
Clustering 9539 TCRs using two-step approach.
Total time to run ClusTCR: 0.760s
Clustering 9538 TCRs using two-step approach.
Total time to run ClusTCR: 0.677s
Clustering 9537 TCRs using two-step approach.
Total time to run ClusTCR: 0.754s
Clustering 9538 TCRs using two-step approach.
Total time to run ClusTCR: 0.651s
Clustering 9537 TCRs using two-step approach.
Total time to run ClusTCR: 0.760s
Clustering 9536 TCRs using two-step approach.
Total time to run ClusTCR: 0.660s
Clustering 9538 TCRs using two-step approach.
Total time to run ClusTCR: 0.779s
Clustering 9534 TCRs using two-step approach.
Total time to run ClusTCR: 0.651s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein2.csv
2
Clustering 12800 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 0.876s
Clustering 12801 TCRs using two-step approach.
Total time to run ClusTCR: 0.855s
Clustering 12803 TCRs using two-step approach.
Total time to run ClusTCR: 0.837s
Clustering 12802 TCRs using two-step approach.
Total time to run ClusTCR: 0.772s
Clustering 12803 TCRs using two-step approach.
Total time to run ClusTCR: 0.872s
Clustering 12802 TCRs using two-step approach.
Total time to run ClusTCR: 0.875s
Clustering 12804 TCRs using two-step approach.
Total time to run ClusTCR: 0.737s
Clustering 12798 TCRs using two-step approach.
Total time to run ClusTCR: 0.906s
Clustering 12802 TCRs using two-step approach.
Total time to run ClusTCR: 0.856s
Clustering 12802 TCRs using two-step approach.
Total time to run ClusTCR: 0.739s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein3.csv
3
Clustering 16064 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 0.957s
Clustering 16061 TCRs using two-step approach.
Total time to run ClusTCR: 0.940s
Clustering 16064 TCRs using two-step approach.
Total time to run ClusTCR: 1.010s
Clustering 16064 TCRs using two-step approach.
Total time to run ClusTCR: 0.955s
Clustering 16066 TCRs using two-step approach.
Total time to run ClusTCR: 0.914s
Clustering 16064 TCRs using two-step approach.
Total time to run ClusTCR: 0.996s
Clustering 16065 TCRs using two-step approach.
Total time to run ClusTCR: 0.828s
Clustering 16065 TCRs using two-step approach.
Total time to run ClusTCR: 0.917s
Clustering 16062 TCRs using two-step approach.
Total time to run ClusTCR: 0.892s
Clustering 16061 TCRs using two-step approach.
Total time to run ClusTCR: 0.931s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein4.csv
4
Clustering 19328 TCRs using two-step approach.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre_feature'] = df['cdr2_no_gaps'] + '-' + df.junction_aa.map(lambda x: x[1:-1])


Total time to run ClusTCR: 1.043s
Clustering 19326 TCRs using two-step approach.
Total time to run ClusTCR: 1.086s
Clustering 19328 TCRs using two-step approach.
Total time to run ClusTCR: 1.052s
Clustering 19330 TCRs using two-step approach.
Total time to run ClusTCR: 1.133s
Clustering 19330 TCRs using two-step approach.
Total time to run ClusTCR: 1.025s
Clustering 19329 TCRs using two-step approach.
Total time to run ClusTCR: 1.062s
Clustering 19330 TCRs using two-step approach.
Total time to run ClusTCR: 1.049s
Clustering 19330 TCRs using two-step approach.
Total time to run ClusTCR: 1.105s
Clustering 19327 TCRs using two-step approach.
Total time to run ClusTCR: 1.184s
Clustering 19325 TCRs using two-step approach.
Total time to run ClusTCR: 1.073s
../Spikein_scores/clustcrreference_tcrvalid
../Spikein_scores/clustcrreference_tcrvalid/Spikein5.csv
5
Clustering 2909 TCRs using two-step approach.
Total time to run ClusTCR: 0.428s
../Spikein_scores/clustcrreference_gliph
../Spikein_sc