### Notebook to prepare data in GLIPH clustering input format, with spike-ins
- Can be shared if needed. Its output files are also used when scoring the GLIPH clustering output.

In [30]:
import os 
import sys
# Extend the import search path before the tcrclustering imports below.
# NOTE(review): presumably this directory holds the tcrclustering package and
# the leading '...' is a placeholder for the local checkout path - confirm.
sys.path.append('.../tcrvalid/comparitor_tooling/distance_based_tools')
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
from tcrclustering.cluster.helper import (
    tcrdist_parallel_simple_run,
    tcrdist_simple_run,
    ismart_simple_run
)
# Wildcard import: supplies prep_gliph_data, which Generate_spike_ins calls.
from tcrclustering.cluster.prep_gliph_data import *


In [31]:
from tcrvalid.data_subsetting import make_subset,sources_,chains_,features_
from tcrvalid.metrics import get_cluster_purity_df,clustering_scoring
from tcrvalid.defaults import *

In [4]:
def get_rename_dict(chain_type):
    """Return the column-rename mapping expected by the wrapper package.

    Parameters
    ----------
    chain_type : str
        'TRB' or 'TRA' for single-chain data (un-suffixed source columns),
        or 'TR' for paired data (source columns suffixed with the chain).

    Returns
    -------
    dict
        Maps source column names to 'cdr3_<chain>', 'v_gene_<chain>' and
        'j_gene_<chain>' column names.

    Raises
    ------
    ValueError
        If chain_type is not one of 'TRB', 'TRA' or 'TR'.
    """
    # (source column stem, target column stem); the chain name is appended.
    stems = (
        ('junction_aa', 'cdr3'),
        ('meta_v_call', 'v_gene'),
        ('meta_j_call', 'j_gene'),
    )

    if chain_type in ('TRB', 'TRA'):
        # Single chain: source columns carry no chain suffix.
        return {src: dst + '_' + chain_type for src, dst in stems}

    if chain_type == 'TR':
        # Paired chains: both source and target carry the chain suffix.
        # NOTE: a 'peptide_TRA' -> 'peptide' rename was disabled (commented
        # out) in the original mapping and is still not applied here.
        paired = {}
        for chain in ('TRA', 'TRB'):
            for src, dst in stems:
                paired[src + '_' + chain] = dst + '_' + chain
        return paired

    raise ValueError('unknown chain type')

def get_data(chain_type,noflu):
    """Load the labelled dataset and return the subset for one chain type.

    The subset is built with make_subset using the 'both' source selection
    and the 'CDR23' feature entry (this implementation assumes that feature
    setting), then its columns are renamed with get_rename_dict.

    When a single chain is requested ('TRB' or 'TRA'), constant placeholder
    V gene, J gene and CDR3 columns are added for the other chain so the
    downstream package always receives the same set of columns. They are
    dummy values only and are not used if not requested.

    Parameters
    ----------
    chain_type : str
        Key into chains_ and argument to get_rename_dict.
    noflu
        Passed through unchanged to make_subset.

    Returns
    -------
    pandas.DataFrame
        The renamed subset, with placeholder columns where applicable.
    """
    # Dummy other-chain columns, keyed by the chain that WAS requested.
    placeholder_columns = {
        'TRB': (
            ('v_gene_TRA', 'TRAV1-1'),
            ('j_gene_TRA', 'TRAJ1'),
            ('cdr3_TRA', 'CAAAAAAAF'),
        ),
        'TRA': (
            ('v_gene_TRB', 'TRBV19'),
            ('j_gene_TRB', 'TRBJ2-1'),
            ('cdr3_TRB', 'CAAAAAAAF'),
        ),
    }

    labelled = pd.read_csv(labelled_data_path)
    subset = make_subset(
        labelled,
        sources=sources_['both'],
        chains=chains_[chain_type],
        feature=features_['CDR23'],
        noflu=noflu
    ).rename(columns=get_rename_dict(chain_type))

    n_rows = len(subset)
    # Paired ('TR') data has no entry here, so nothing is added for it.
    for column, value in placeholder_columns.get(chain_type, ()):
        subset[column] = [value] * n_rows

    return subset



In [40]:
# Choose the reference used to create the spike-in datasets:
# 'gliph', 'gliphHQ' or 'tcrvalid'.
_SPIKE_REFERENCES = ('gliph', 'gliphHQ', 'tcrvalid')

# Column order of the header-less, tab-separated GLIPH input files.
_GLIPH_INPUT_COLUMNS = ["cdr3_TRB", 'v_gene_TRB', 'j_gene_TRB', "peptide", 'subject', 'count']


def _load_spike_reference(reference):
    """Return the TRB reference table for one of the supported reference names."""
    if reference == 'gliph':
        return prep_gliph_data('TRB', 'ref')
    if reference == 'tcrvalid':
        return get_data('TRB', False)

    # 'gliphHQ': keep only GLIPH reference rows that match a VDJdb TRB record
    # with a Score above zero.
    df_ref_orig = prep_gliph_data('TRB', 'ref')
    vdjdb = pd.read_csv('./VDJDB_TCRs_w_quality.tsv', sep='\t')
    # Keep the part of the V/J names before the first '*'.
    vdjdb['v_gene_TRB'] = vdjdb['V'].str.split('*').str[0]
    vdjdb['j_gene_TRB'] = vdjdb['J'].str.split('*').str[0]
    vdjdb = vdjdb.rename(columns={'CDR3': 'cdr3_TRB', 'Epitope': 'peptide'})
    merged = df_ref_orig.merge(
        vdjdb,
        on=['v_gene_TRB', 'cdr3_TRB', 'j_gene_TRB', 'peptide'],
        how='left',
    )
    # Rows without a VDJdb match have missing Gene/Score and are dropped here.
    filtered = merged.loc[(merged["Gene"] == 'TRB') & (merged["Score"] > 0)]
    filtered = filtered[[
        'sequence_id', 'cdr3_TRB', 'v_call', 'j_call', 'peptide', 'Score',
        'source', 'v_gene_TRB', 'j_gene_TRB', 'new_v_call', 'cdr1_no_gaps',
        'cdr2_no_gaps', 'cdr1_cdr2_no_gaps', 'clono_id', 'v_gene_TRA',
        'j_gene_TRA', 'cdr3_TRA',
    ]]
    # Descending sort puts the highest Score first within each clono_id, so
    # drop_duplicates (keep first) retains the best-scored row per clono_id.
    filtered = filtered.sort_values(by=['clono_id', 'Score'], ascending=False)
    return filtered.drop_duplicates('clono_id')


def _combine_with_reference(df_ref, df_spike):
    """Stack reference and spike rows on their shared columns and add GLIPH fields."""
    combined = pd.concat([df_ref, df_spike], join='inner', ignore_index=True)
    combined['subject'] = 'NA'
    combined['count'] = 1
    return combined


def _write_gliph_input(combined, file_name):
    """Write the GLIPH input columns as a tab-separated file without header or index."""
    combined[_GLIPH_INPUT_COLUMNS].to_csv(file_name, sep='\t', header=False, index=False)


def Generate_spike_ins(reference):
    """Write GLIPH input files combining a reference set with each spike-in split.

    For each spike split n = 1..5 (read from '../Spike_splits/S<n>_1.csv' via
    prep_gliph_data) three files are written to the current directory:

    * 'df_ref_<reference>_spike_<n>_4GLIP_NA1.csv'   - GLIPH input with the
      spike rows' peptide column as returned by prep_gliph_data.
    * 'df_ref_<reference>_spike_<n>_4GLIP_hash1.csv' - GLIPH input where each
      spike row's peptide is replaced by a per-row hash string
      (presumably to give every spike row its own label - TODO confirm).
    * 'df_spike_<reference>_spike_<n>_hash1.csv'     - the full combined table
      of the hashed variant, comma-separated, including the index.

    Parameters
    ----------
    reference : str
        'gliph', 'gliphHQ' or 'tcrvalid'.

    Raises
    ------
    ValueError
        If reference is not one of the supported names. Checked before any
        data is loaded, instead of failing later on an unbound variable.
    """
    if reference not in _SPIKE_REFERENCES:
        raise ValueError(
            'unknown reference: %r (expected one of %s)'
            % (reference, ', '.join(_SPIKE_REFERENCES))
        )

    df_ref = _load_spike_reference(reference)

    for n in range(1, 6):
        path = '../Spike_splits/S' + str(n) + '_1.csv'
        df_merge = prep_gliph_data('TRB', path)
        df_merge['spike_fold'] = n

        # Variant 1: spike rows keep the peptide column from prep_gliph_data.
        spike_score = _combine_with_reference(df_ref, df_merge)
        save_name_NA = 'df_ref_' + str(reference) + '_spike_' + str(n) + '_4GLIP_NA1.csv'
        _write_gliph_input(spike_score, save_name_NA)

        # Variant 2: spike rows get a per-row hash string as their peptide.
        df_merge['peptide'] = pd.util.hash_pandas_object(df_merge).astype(str)
        spike_score = _combine_with_reference(df_ref, df_merge)
        save_name_hash = 'df_ref_' + str(reference) + '_spike_' + str(n) + '_4GLIP_hash1.csv'
        _write_gliph_input(spike_score, save_name_hash)

        # Full combined table of the hashed variant.
        save_name_full = 'df_spike_' + str(reference) + '_spike_' + str(n) + '_hash1.csv'
        spike_score.to_csv(save_name_full, sep=',')


In [41]:
# Write the spike-in files for splits 1-5 using the 'tcrvalid' reference,
# i.e. the labelled dataset returned by get_data('TRB', False).
Generate_spike_ins('tcrvalid')