In [None]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Import cluster scoring from the GLIPH webtool and process it


In [32]:
import os 
import sys
sys.path.append('distance_based_tools')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None
from tcrclustering.cluster.helper import (
    tcrdist_parallel_simple_run,
    tcrdist_simple_run,
    ismart_simple_run
)
from tcrvalid.cluster_loop import *
from tcrvalid.data_subsetting import make_subset,sources_,chains_,features_
from tcrvalid.metrics import get_cluster_purity_df,clustering_scoring
from tcrvalid.defaults import *
from tcrclustering.cluster.prep_gliph_data import *
from tcrvalid.load_models import *
from tcrvalid.physio_embedding import SeqArrayDictConverter

## Import output from GLIPH webtool and process for plotting

- Add cluster size column for downstream filtering
- Remove duplicate TCRs that appear in multiple clusters, keeping only the first instance (i.e. the one in the largest cluster).
- Filter out singleton clusters
- Merge with the original reference data as the clustering output only returns clustered TCRs
- Add -1 labels to unclustered TCRs from reference in order to compute CSI and precision

In [33]:
def prep_GLIPH_web_output(GLIPH_input, GLIPH_output, HQ=False,
                          input_dir='../tcrvalid/data/Prep_data_GLIPH/',
                          output_dir='./Spikein_scores/GLIPH_webtool_output/GLIPH_output/Ref48_V1/'):
    """Score one GLIPH webtool clustering output against its input reference.

    The GLIPH output only lists clustered TCRs, so it is left-merged onto the
    input reference and every TCR without a cluster is labelled ``-1``. Four
    variants of the labelled table are scored with ``clustering_scoring``:
    duplicates kept / duplicates dropped, each with a minimum cluster size of
    2 and of 3.

    Parameters
    ----------
    GLIPH_input : str
        ``'gliph_ref'`` (loaded via ``prep_gliph_data('TRB', 'ref')``),
        ``'tcrvalid_ref'`` (loaded via ``get_data('TRB', False)``), or the name
        of a file inside ``input_dir``.
    GLIPH_output : str
        Name of the GLIPH webtool result CSV inside ``output_dir``.
    HQ : bool, default False
        When true, the input file is read as tab-separated instead of
        comma-separated. Ignored for the two named references.
    input_dir, output_dir : str
        Directories holding the input references and the GLIPH outputs.
        The defaults are the locations this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Four rows (one per variant, in the order doublecount/min2,
        doublecount/min3, trimmed/min2, trimmed/min3) holding the scores plus
        the ``minsize``, ``method``, ``Doublecount`` and ``reference`` columns.
    """
    merge_keys = ['cdr3_TRB', 'v_gene_TRB', 'j_gene_TRB']

    # --- load the reference that was submitted to GLIPH -------------------
    if GLIPH_input == 'gliph_ref':
        Glip_input_raw = prep_gliph_data('TRB', 'ref')
    elif GLIPH_input == 'tcrvalid_ref':
        Glip_input_raw = get_data('TRB', False)
    else:
        input_path = os.path.join(input_dir, str(GLIPH_input))
        Glip_input_raw = pd.read_csv(input_path, sep='\t' if HQ else ',')

    # --- load the GLIPH result and normalise its column names -------------
    output_path = os.path.join(output_dir, str(GLIPH_output))
    gliph_output_raw = pd.read_csv(output_path)
    # NOTE: ' number_unique_cdr3' really does start with a space in the file.
    gliph_output = gliph_output_raw[
        ['index', 'V', 'J', 'TcRb', 'TcRa', ' number_unique_cdr3']
    ].rename(columns={
        'index': 'GLIPH_cluster',
        'V': 'v_gene_TRB',
        'J': 'j_gene_TRB',
        'TcRa': 'peptide',
        'TcRb': 'cdr3_TRB',
    })

    # "No double count" variant: keep only the first occurrence of each TCR.
    # NOTE(review): this keeps the instance in the largest cluster only if the
    # GLIPH output is ordered by decreasing cluster size -- confirm.
    gliph_output_trim = gliph_output.drop_duplicates(subset=merge_keys).reset_index(drop=True)
    gliph_output_trim['freq_count'] = (
        gliph_output_trim.groupby('GLIPH_cluster')['GLIPH_cluster'].transform('count')
    )
    # De-duplication can leave clusters with a single member; drop those.
    gliph_output_trim = gliph_output_trim[gliph_output_trim.freq_count != 1]

    # Cluster sizes for the "double count" variant are computed on the full output.
    gliph_output['freq_count'] = (
        gliph_output.groupby('GLIPH_cluster')['GLIPH_cluster'].transform('count')
    )

    # --- attach cluster labels to the full reference ----------------------
    df_cl = Glip_input_raw.merge(gliph_output, how='left', on=merge_keys)
    df_cl_trim = Glip_input_raw.merge(gliph_output_trim, how='left', on=merge_keys)

    # TCRs absent from the GLIPH output are unclustered -> label -1.
    df_cl['GLIPH_cluster'] = df_cl['GLIPH_cluster'].fillna(-1)
    df_cl_trim['GLIPH_cluster'] = df_cl_trim['GLIPH_cluster'].fillna(-1)

    # Minimum-size-3 variants: clusters of exactly two members become unclustered.
    # NOTE(review): only freq_count == 2 is relabelled; df_cl is not filtered
    # for freq_count == 1 beforehand -- confirm that is intended.
    df_cl_min3 = df_cl.copy()
    df_cl_min3.loc[df_cl_min3.freq_count == 2, 'GLIPH_cluster'] = -1
    df_cl_trim_min3 = df_cl_trim.copy()
    df_cl_trim_min3.loc[df_cl_trim_min3.freq_count == 2, 'GLIPH_cluster'] = -1

    # --- score every variant ---------------------------------------------
    # Rows are collected in a list and concatenated once: DataFrame.append was
    # removed in pandas 2.0.
    score_rows = []
    for df in (df_cl, df_cl_min3, df_cl_trim, df_cl_trim_min3):
        # 'peptide_x' is the reference-side epitope column after the merge
        # (assumes the input reference also has a 'peptide' column -- TODO confirm).
        score = clustering_scoring(
            df,
            df,
            epitope_col='peptide_x',
            cluster_col='GLIPH_cluster',
        )[0]
        score_rows.append(pd.DataFrame.from_dict(score, orient='index').T)
    score_df_f = pd.concat(score_rows)

    score_df_f = score_df_f.rename(
        columns={"well_clustered_total": "c-CSI", "well_clustered": "c-precision"}
    )
    # Annotations follow the order in which the variants were scored above.
    score_df_f['minsize'] = [2, 3, 2, 3]
    score_df_f['method'] = 'GLIPH'
    score_df_f['Doublecount'] = [True, True, False, False]
    score_df_f['reference'] = str(GLIPH_input)
    return score_df_f


In [34]:
# Score the un-spiked GLIPH reference (spike level 0).
Gliph_ref_df = prep_GLIPH_web_output('gliph_ref', 'Gliph_ref_Gliphclustered_48RefV1.csv')
Gliph_ref_df['spike_x'] = 0
Gliph_ref_df['label_reference'] = 'gliph'

# Spike level 1 on the GLIPH reference (its input file name differs from levels 2-5).
spike1_df = prep_GLIPH_web_output('df_spike1_ref_hash.csv', 'GliphRefSpike1_GliphClustered_48RefV1.csv')
spike1_df['spike_x'] = 1
spike1_df['label_reference'] = 'gliph'

# Score the un-spiked TCRvalid reference (spike level 0).
tcrvalid_ref_df = prep_GLIPH_web_output('tcrvalid_ref', 'tcrvalid_reference_GliphClustered_48RefV1.csv')
tcrvalid_ref_df['spike_x'] = 0
tcrvalid_ref_df['label_reference'] = 'tcrvalid'

# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0);
# the row order is unchanged.
Glip_tool_df = pd.concat([spike1_df, Gliph_ref_df, tcrvalid_ref_df])

In [35]:
# Score spike levels 2-5 on the GLIPH reference.
# Frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
gliph_spike_frames = []
for n in range(2, 6):
    gliph_input_name = 'df_spike' + str(n) + '_hash.csv'
    gliph_output_name = 'GliphRefSpike' + str(n) + '_GliphClustered_48RefV1.csv'

    spike_df = prep_GLIPH_web_output(gliph_input_name, gliph_output_name)
    spike_df['spike_x'] = n
    spike_df['label_reference'] = 'gliph'
    gliph_spike_frames.append(spike_df)

spike_df_f = pd.concat(gliph_spike_frames)

    

In [36]:
# Score spike levels 1-5 on the TCRvalid reference.
# Frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
tcrvalid_spike_frames = []
for n in range(1, 6):
    gliph_input_name = 'df_ref_tcrvalid_spike_' + str(n) + '_hash.csv'
    gliph_output_name = 'tcrvalidRefSpike' + str(n) + '_GliphClustered.csv'

    spike_df = prep_GLIPH_web_output(gliph_input_name, gliph_output_name)
    spike_df['spike_x'] = n
    spike_df['label_reference'] = 'tcrvalid'
    tcrvalid_spike_frames.append(spike_df)

spike_df_tcrvalid_ref = pd.concat(tcrvalid_spike_frames)

In [37]:
# Score spike levels 1-5 on the high-quality (HQ) GLIPH reference.
# Frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0.
hq_spike_frames = []
for n in range(1, 6):
    gliph_input_name = 'df_spike_HQ' + str(n) + '_hash.csv'
    gliph_output_name = 'GliphRefHQSpike' + str(n) + '_GliphClustered_48RefV1.csv'

    spike_df = prep_GLIPH_web_output(gliph_input_name, gliph_output_name)
    spike_df['spike_x'] = n
    spike_df['label_reference'] = 'gliph_HQ'
    hq_spike_frames.append(spike_df)

spike_dfHQ_f = pd.concat(hq_spike_frames)

In [38]:
# NOTE: because of a formatting quirk in how this input file was written, it
# has to be read with sep='\t'; passing HQ=True makes prep_GLIPH_web_output do so.
Gliph_refHQf_df = prep_GLIPH_web_output(
    'GLIPH_RefHQ.csv',
    'GliphRefHQ_GliphClustered_48RefV1.csv',
    HQ=True,
).assign(
    spike_x=0,                    # un-spiked reference
    label_reference='gliph_HQ',
)

Unnamed: 0,n_clusters,n_epitopes_captured,percent_clustered,percent_clustered_in_largest,percent_labeled_clustered,mean_purity,c-CSI,c-precision,minsize,method,Doublecount,reference,spike_x,label_reference
0,191.0,12.0,50.931174,2.510121,33.333333,93.356272,23.212321,69.636964,2,GLIPH,True,GLIPH_RefHQ.csv,0,gliph_HQ
0,78.0,8.0,32.631579,2.510121,18.811881,89.989164,10.451045,55.555556,3,GLIPH,True,GLIPH_RefHQ.csv,0,gliph_HQ
0,84.0,12.0,28.822882,3.410341,28.822882,90.113501,18.70187,64.885496,2,GLIPH,False,GLIPH_RefHQ.csv,0,gliph_HQ
0,28.0,8.0,16.50165,3.410341,16.50165,89.983359,8.80088,53.333333,3,GLIPH,False,GLIPH_RefHQ.csv,0,gliph_HQ


### Concatenate all score DataFrames and save them as a meta table of GLIPH outputs

In [39]:
# Every per-reference and per-spike-level score table, in the order in which
# they should appear in the saved meta table.
all_score_frames = [
    Gliph_ref_df,
    spike1_df,
    spike_df_f,
    Gliph_refHQf_df,
    spike_dfHQ_f,
    tcrvalid_ref_df,
    spike_df_tcrvalid_ref,
]
Glip_tool_df = pd.concat(all_score_frames)

# Persist the combined scores (the index is written out as well).
Glip_tool_df.to_csv('./Spikein_scores/Gliph_webtool_scored_df_withHQ_score.csv')

In [None]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.