In [None]:
# Copyright 2023 Regeneron Pharmaceuticals Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
from sklearn.cluster import DBSCAN

In [3]:
from tcrvalid.metrics import clustering_scoring

In [34]:
# Which DeepTCR feature export to analyse: chain and minimum-size setting.
pre = 'deeptcr_features'
base = 'deeptcr_datafeats'
chain = 'TRB'
minsize = 2

# Export directory is "<base>_<chain>_<minsize>" underneath `pre`.
name = os.path.join(pre, '_'.join([base, chain, str(minsize)]))

# Feature matrix and integer labels saved alongside each other.
features = np.load(name + '/features.npy')
labels = np.load(name + '/labels.npy')

# Input table as it was before being reformatted for DeepTCR.
df = pd.read_csv(name + '/input_df_predeeptcrreformat.csv')

# Mapping from label index (string keys, as stored in JSON) to class name.
with open(name + '/labelnames.json', 'r') as label_file:
    idx_to_class = json.load(label_file)

In [35]:
# idx_to_class was loaded from JSON, whose object keys are always strings,
# so every label is stringified before the lookup.
epitopes = [idx_to_class[str(label_idx)] for label_idx in labels]

In [20]:
# Single exploratory DBSCAN run on the loaded features.
dbscan_model = DBSCAN(eps=0.3, min_samples=3)
clustering = dbscan_model.fit(features)

In [None]:
# One row per input sequence; the row position is used as both the
# clonotype id and the sequence id.
n_sequences = len(epitopes)
df_cl = pd.DataFrame(
    {
        'clono_id': np.arange(n_sequences),
        'sequence_id': np.arange(n_sequences),
        'epitope': epitopes,
        'cluster': clustering.labels_,
    }
)

In [14]:
# Display the per-sequence table of epitope labels and cluster assignments.
df_cl

Unnamed: 0,clono_id,sequence_id,epitope,cluster
0,0,0,AAFKRSCLK,-1
1,1,1,AAFKRSCLK,-1
2,2,2,AAFKRSCLK,-1
3,3,3,AAFKRSCLK,-1
4,4,4,ALDPHSGHFV,-1
...,...,...,...,...
2907,2907,2907,YLQPRTFLL,-1
2908,2908,2908,YLQPRTFLL,-1
2909,2909,2909,YLQPRTFLL,-1
2910,2910,2910,YLQPRTFLL,-1


In [8]:
# Score the cluster assignment against the epitope labels. The same frame is
# supplied for both positional arguments, and only the first element of the
# returned pair is kept.
scores, _ = clustering_scoring(
    df_cl, df_cl, cluster_col='cluster', epitope_col='epitope'
)

In [9]:
# Display the clustering scores.
scores

{'n_clusters': 1,
 'n_epitopes_captured': 1,
 'percent_clustered': 0.07351139426611125,
 'percent_clustered_in_largest': 0.07351139426611125,
 'percent_labeled_clustered': 0.07351139426611125,
 'mean_purity': 100.0,
 'well_clustered_total': 0.07351139426611125,
 'well_clustered': 100.0}

In [28]:
def get_scores(eps, min_samples=2):
    """Cluster the currently loaded features with DBSCAN and score the result.

    NOTE: this function reads the notebook-level variables ``features`` and
    ``epitopes``; they must be set (and belong to the same dataset) before
    it is called.

    Parameters
    ----------
    eps : float
        DBSCAN neighbourhood radius.
    min_samples : int, optional
        DBSCAN ``min_samples`` setting. Defaults to 2, the value that was
        previously hard-coded, so existing ``get_scores(eps)`` calls behave
        exactly as before.

    Returns
    -------
    The first element of the pair returned by ``clustering_scoring`` -- the
    score mapping whose keys include ``'well_clustered'`` and
    ``'well_clustered_total'``.
    """
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(features)

    # Row position serves as both the clonotype id and the sequence id.
    n_sequences = len(epitopes)
    df_cl = pd.DataFrame({
        'clono_id': np.arange(n_sequences),
        'sequence_id': np.arange(n_sequences),
        'epitope': epitopes,
        'cluster': clustering.labels_
    })

    # The same frame is passed for both positional arguments; the second
    # return value is not needed here.
    scores, _ = clustering_scoring(
        df_cl,
        df_cl,
        epitope_col='epitope',
        cluster_col='cluster',
    )
    return scores

In [57]:
# Show the current working directory; the relative data and output paths used
# in this notebook resolve against it.
os.getcwd()

'/data/home/allen.leary/repos/tcrvalid/comparitor_tooling/deeptcr_vae'

In [60]:
# Sweep DBSCAN eps over every DeepTCR feature export (chain x minimum size)
# and gather the clustering scores into a single table written to CSV.
chain_ = ['TRB','TRA','TR']
minsizes_=[2,3]
pre = 'deeptcr_features'
base = 'deeptcr_datafeats'

# One score frame per (chain, minsize), concatenated once after the loop.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and growing a frame inside a loop copies all earlier rows on every pass.
score_frames = []
for chainz in chain_:
    for minsizez in minsizes_:
        chain = chainz
        minsize = minsizez

        name = os.path.join(
            pre,
            base+'_'+chain+'_'+str(minsize)
        )
        # get_scores() reads the notebook-level `features` and `epitopes`,
        # so both must be rebound here before it is called.
        features = np.load(name+'/features.npy')
        labels = np.load(name+'/labels.npy')
        df = pd.read_csv(name+'/input_df_predeeptcrreformat.csv')
        with open(name+'/labelnames.json','r') as fp:
            idx_to_class = json.load(fp)
        epitopes = [idx_to_class[str(label_idx)] for label_idx in labels]

        # The 'TR' export is scanned over a wider eps grid than the
        # single-chain exports.
        # NOTE(review): presumably because its features live on a larger
        # distance scale -- confirm.
        if chain!='TR':
            epses = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.2]
        else:
            epses = [0.5,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0]

        all_scores = []
        for eps in epses:
            print(eps)
            all_scores.append( get_scores(eps) )

        df_deep_tcr=pd.DataFrame()
        df_deep_tcr['eps']=epses
        df_deep_tcr['well_clustered']=[score['well_clustered'] for score in all_scores]
        df_deep_tcr['well_clustered_total']=[score['well_clustered_total'] for score in all_scores]
        df_deep_tcr['method']='deeptcr'
        df_deep_tcr['chain']=str(chainz)
        df_deep_tcr['minsize']=str(minsizez)
        df_deep_tcr['feature']='CDR3'
        # Min-max rescale eps to [0, 1] within this grid.
        eps_min = df_deep_tcr['eps'].min()
        eps_max = df_deep_tcr['eps'].max()
        df_deep_tcr['epsilon_scaled'] = (df_deep_tcr['eps'] - eps_min) / (eps_max - eps_min)

        score_frames.append(df_deep_tcr)

# Each frame keeps its own 0..n-1 index, as repeated append did, so the index
# column written to the CSV is unchanged.
df_deep_tcr_final = pd.concat(score_frames)
df_deep_tcr_final.to_csv('../Spikein_scores/deeptcr_scored_df.csv', sep =',')
        
