In [1]:
import requests
import pandas as pd
from pathlib import Path
import json
from tqdm import tqdm
import numpy as np
import plotly.express as px
from sklearn.cluster import DBSCAN
import shutil


In [40]:
cancers_patient_barcode = ["0"+str(i) for i in range(1, 10)]
def filter_patient(x):
    patient_id = x[:12]
    patient_sample = x[13:15]
    if patient_sample in cancers_patient_barcode:
        return patient_id
    else:
        return None


for file in Path("/rsrch4/home/mol_cgenesis/nkdang/CDR3/gdc/portal").glob("**/*.txt"):
    df = pd.read_csv(file, sep='\t')
    df['patient_id'] = df['patient'].apply(filter_patient)
    df = df.rename(columns={'sample': 'sample_id'})
    
    out = Path("/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/CDR3/gdc/portal").joinpath(file.parent.stem)
    out.mkdir(parents=True, exist_ok=True)
    out_na = Path("/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/CDR3/gdc_healthy/portal").joinpath(file.parent.stem)
    out_na.mkdir(parents=True, exist_ok=True)
    
    df_nan = df[df['patient_id'].isna()]
    df_nan = df_nan.drop("patient_id",axis=1)
    df = df.dropna()
    df.to_csv(out.joinpath(file.name), index=False)
    if df_nan.empty:
        out_na.rmdir()
    else:    
        df_nan.to_csv(out_na.joinpath(file.name), index=False)

In [23]:
for f in Path("/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results").glob("**/*.tsv"):
    if "temp" in str(f) or "data_" in str(f):
        continue
    f_new = Path(str(f).replace("/rsrch4/scratch/mol_cgenesis/","/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/"))
    f_new.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(f, f_new)

In [5]:
a = pd.read_csv("/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal/SKCM/heavy/clustering_exact/clusters/4/IGHV3-15_IGHJ4_CATGPLDYW.tsv", sep='\t')
# a['ratio'] = a['#count']/a['frequency']
a[['sample_id','sequence_id','#count','frequency','patient_id','v_call','j_call','junction_aa']]

Unnamed: 0,sample_id,sequence_id,#count,frequency,patient_id,v_call,j_call,junction_aa
0,be9a0e19-9105-40a9-9a5e-fd7d577a8c92,assemble3896,5,2.5e-05,TCGA-ER-A42L,IGHV3-15,IGHJ4,CATGPLDYW
1,b7ee8f06-f81f-4447-b598-f5c43578fb8d,assemble36194,1,4e-06,TCGA-D9-A3Z3,IGHV3-15,IGHJ4,CATGPLDYW
2,bf1c6c9e-408c-40aa-876d-2222e2665549,assemble1491,106,0.000237,TCGA-D3-A2JB,IGHV3-15,IGHJ4,CATGPLDYW
3,bf1c6c9e-408c-40aa-876d-2222e2665549,assemble1491,6,1.5e-05,TCGA-D3-A2JB,IGHV3-15,IGHJ4,CATGPLDYW
4,bf1c6c9e-408c-40aa-876d-2222e2665549,assemble17338,8,1.8e-05,TCGA-D3-A2JB,IGHV3-15,IGHJ4,CATGPLDYW
5,f1202d0e-0b03-4ba9-b4f2-4d3c72bf1c9b,assemble7066,4,2.4e-05,TCGA-FS-A1Z4,IGHV3-15,IGHJ4,CATGPLDYW


In [2]:
cancer = "LUAD"
def customized_correlation(ls1: np.array, ls2: np.array) -> float:
    """
    Input: list1 and list2
    Output: correlation value after grouping nearby points 
    """
    points = np.array([[x, y] for x, y in zip(ls1, ls2)])
    clustering = DBSCAN(eps=0.001, min_samples=1).fit(points)
    labels = clustering.labels_
    ls1_new = []
    ls2_new = []
    for g in set(labels):
        indices = np.where(labels == g)
        group = points[indices]
        x, y = np.mean(group, axis=0)
        ls1_new.append(x)
        ls2_new.append(y)
    if len(ls1_new)< 2:
        return -1
    return pd.Series(ls1_new).corr(pd.Series(ls2_new))

In [3]:
light_chains = pd.read_feather(f"/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal/{cancer}/light/data_clean.feather")
heavy_chains = pd.read_feather(f"/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal/{cancer}/heavy/data_clean.feather")
patients_light_count = pd.DataFrame(light_chains.value_counts("patient_id"),columns=['total_expression'])
patients_heavy_count = pd.DataFrame(heavy_chains.value_counts("patient_id"),columns=['total_expression'])

light_groups = pd.DataFrame(light_chains.value_counts(['v_call','j_call','junction_aa','patient_id']),columns=['light_chain_expression'])
light_groups = light_groups.reset_index()

In [5]:
"""
A cluster of heavy chain IG
"""
heavy_cluster = pd.read_csv("/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal/LUAD/heavy/clustering_group/clusters/16/IGHV3-30_IGHJ6_CARDSYGMDVW.tsv", sep='\t')
heavy_expression = pd.DataFrame(heavy_cluster.value_counts(['patient_id']).sort_index(),columns=['cluster_expression'])
heavy_expression = heavy_expression.merge(patients_heavy_count, how='left',left_on='patient_id', right_on='patient_id')
heavy_expression['percent'] = heavy_expression['cluster_expression']/heavy_expression['total_expression']
patients = heavy_expression.index.tolist()

F_Ht = heavy_expression['percent'].values

### FREQUENCY LIGHT CHAIN MATRIX
groups = light_groups[light_groups['patient_id'].isin(patients)]
groups = groups.sort_values(by=["patient_id"])
groups = groups.merge(patients_light_count, how='left',left_on='patient_id', right_on='patient_id')
groups['percent'] = groups['light_chain_expression'] / groups['total_expression']

F_L = pd.crosstab(index=[groups["v_call"],groups["j_call"],groups['junction_aa']], columns=groups['patient_id'], values=groups['percent'], aggfunc='sum').fillna(0)
F_L

Unnamed: 0_level_0,Unnamed: 1_level_0,patient_id,TCGA-49-6743,TCGA-50-6590,TCGA-55-6968,TCGA-55-6969,TCGA-55-6985,TCGA-55-6987,TCGA-55-7570,TCGA-55-8505,TCGA-55-8621,TCGA-55-A4DF,TCGA-64-5779,TCGA-78-7150,TCGA-86-8585,TCGA-91-6831,TCGA-91-A4BC,TCGA-93-A4JQ
v_call,j_call,junction_aa,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
IGKV1-12,IGKJ1,CEQASSFPPWTF,0.0,0.000000,0.000000,0.0,0.0,0.000010,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV1-12,IGKJ1,CFQHNSYPWTF,0.0,0.000000,0.000000,0.0,0.0,0.000005,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV1-12,IGKJ1,CHEYVRFPTF,0.0,0.000015,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV1-12,IGKJ1,CHQAKSFPRTF,0.0,0.000000,0.000033,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGKV1-12,IGKJ1,CHQANSFPATF,0.0,0.000000,0.000000,0.0,0.0,0.000005,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IGLV9-49,IGLJ3,CGTDHGTGSSFVWVF,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000068,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGLV9-49,IGLJ3,RGADHGSGSNFVWVF,0.0,0.000000,0.002247,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGLV9-49,IGLJ3,RGADHGTGSNFVWVF,0.0,0.000000,0.000099,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IGLV9-49,IGLJ3,SGADHGSGSNFVWVF,0.0,0.000000,0.001123,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# remove low signal light chain
F_L_high_expression = F_L[F_L.sum(axis=1)>F_Ht.sum()/5]

# additional filters? (half number of values must be non-zero number)
F_L_high_expression = F_L_high_expression[(F_L_high_expression < 0.00001).astype(int).sum(axis=1) < len(F_L_high_expression.columns)/2]

F_L_high_expression

Unnamed: 0_level_0,Unnamed: 1_level_0,patient_id,TCGA-49-6743,TCGA-50-6590,TCGA-55-6968,TCGA-55-6969,TCGA-55-6985,TCGA-55-6987,TCGA-55-7570,TCGA-55-8505,TCGA-55-8621,TCGA-55-A4DF,TCGA-64-5779,TCGA-78-7150,TCGA-86-8585,TCGA-91-6831,TCGA-91-A4BC,TCGA-93-A4JQ
v_call,j_call,junction_aa,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
IGKV1-12,IGKJ3,CQQANSFPFTF,0.000000,0.005337,0.000000,0.000019,0.000081,0.000041,0.000000,0.000000,0.000407,0.000000,0.000015,0.000391,0.000751,0.001046,0.000012,0.000054
IGKV1-12,IGKJ4,CQQANSFPLTF,0.000343,0.000045,0.000231,0.000290,0.001863,0.000041,0.000085,0.000948,0.000587,0.000219,0.000798,0.001758,0.002669,0.000017,0.000159,0.001630
IGKV1-16,IGKJ4,CQQYNSYPLTF,0.000038,0.000075,0.000132,0.000377,0.000486,0.000015,0.000000,0.000211,0.000158,0.000548,0.000318,0.003517,0.000250,0.000017,0.000061,0.000489
IGKV1-17,IGKJ1,CLQHNSYPRTF,0.000190,0.002452,0.000991,0.000000,0.000486,0.000000,0.000000,0.000000,0.000316,0.000219,0.000007,0.000000,0.000000,0.000034,0.000085,0.000543
IGKV1-17,IGKJ4,CLQHNSYPLTF,0.000114,0.000075,0.000066,0.000029,0.000324,0.000026,0.000043,0.000000,0.000226,0.000000,0.000030,0.000000,0.000083,0.000000,0.004627,0.000163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IGLV3-1,IGLJ2,CQAWDSTTVVF,0.000000,0.000912,0.000396,0.000377,0.000000,0.000041,0.000085,0.001053,0.000361,0.000000,0.000007,0.003908,0.000000,0.000000,0.000147,0.001032
IGLV3-21,IGLJ2,CQVWDSSSDHVVF,0.003846,0.000299,0.001685,0.000145,0.000486,0.000092,0.000171,0.000632,0.000090,0.000000,0.000148,0.000000,0.001835,0.000291,0.001636,0.001250
IGLV3-25,IGLJ2,CQSADSSGTYVVF,0.001637,0.000194,0.001685,0.000396,0.000162,0.000020,0.000000,0.000316,0.000000,0.000000,0.000104,0.000000,0.000167,0.000103,0.002808,0.001250
IGLV4-69,IGLJ3,CQTWGTGIRVF,0.000000,0.000269,0.000033,0.000077,0.003159,0.000138,0.000128,0.000000,0.000226,0.000657,0.000067,0.000000,0.006840,0.000034,0.000000,0.000435


In [16]:
F_L_high_expression["correlation"] = np.array([row.reset_index(drop=True).corr(pd.Series(F_Ht)) for _, row in F_L_high_expression.iterrows()])
F_L_high_expression["fined_correlation"] = np.array([customized_correlation(row, F_Ht) for _, row in F_L_high_expression.iterrows()])

F_L_high_expression_high_correlation = F_L_high_expression[F_L_high_expression['fined_correlation']>0]
F_L_high_expression_high_correlation

Unnamed: 0_level_0,Unnamed: 1_level_0,patient_id,TCGA-49-6743,TCGA-50-6590,TCGA-55-6968,TCGA-55-6969,TCGA-55-6985,TCGA-55-6987,TCGA-55-7570,TCGA-55-8505,TCGA-55-8621,TCGA-55-A4DF,TCGA-64-5779,TCGA-78-7150,TCGA-86-8585,TCGA-91-6831,TCGA-91-A4BC,TCGA-93-A4JQ,correlation,fined_correlation
v_call,j_call,junction_aa,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
IGKV1-12,IGKJ3,CQQANSFPFTF,0.0,0.005337,0.0,1.9e-05,8.1e-05,4.1e-05,0.0,0.0,0.000407,0.0,1.5e-05,0.000391,0.000751,0.001046,1.2e-05,5.4e-05,0.413593,0.140244
IGKV1-17,IGKJ1,CLQHNSYPRTF,0.00019,0.002452,0.000991,0.0,0.000486,0.0,0.0,0.0,0.000316,0.000219,7e-06,0.0,0.0,3.4e-05,8.5e-05,0.000543,0.407232,0.172615
IGKV1-33,IGKJ2,CQQYDNLPYTF,0.005483,9e-05,0.00043,0.000367,0.000486,0.000251,0.0,0.001264,0.000474,0.0,0.000104,0.0,0.000417,6.9e-05,0.000269,0.000217,0.690088,0.727116
IGKV1-39,IGKJ1,CQQSYSTPRTF,0.002323,0.000777,0.000198,0.000271,0.001215,0.000102,0.000256,0.000105,0.000723,0.000986,8.9e-05,0.000586,0.000334,0.0,0.000488,0.00038,0.683952,0.870098
IGKV1-39,IGKJ1,CQQSYSTPWTF,0.001104,0.002467,0.000562,0.00029,0.000243,6.7e-05,0.000427,0.000211,0.001829,0.00011,0.000185,0.0,0.002753,0.000137,0.000232,0.000869,0.515369,0.143986
IGKV1-39,IGKJ4,CQQSYSTPLTF,0.000457,0.000314,0.00043,0.000135,0.000486,0.000256,0.001196,0.0,0.000158,0.0,0.000473,0.000782,0.000334,0.000274,0.000452,0.000435,0.053513,0.334727
IGKV1-5,IGKJ1,CQQYNSYWTF,0.000533,0.000105,0.000892,0.000184,0.000648,6.1e-05,0.000427,0.000632,0.001016,0.000438,9.6e-05,0.0,0.000334,1.7e-05,0.00011,0.0,0.018272,0.270937
IGKV2-28,IGKJ2,CMQALQTPRTF,0.000495,0.000882,0.0,0.000164,0.0,0.000681,0.0,0.000632,0.001039,0.001314,0.000222,0.0,0.0,0.00072,0.00022,0.000869,0.169904,0.322345
IGKV2-28,IGKJ4,CMQALQTPPTF,0.000495,0.003483,0.0,8.7e-05,0.0,0.000205,0.0,0.0,0.0,0.000329,0.0,0.001563,0.0,3.4e-05,4.9e-05,0.000217,0.512813,0.223798
IGKV3-11,IGKJ4,CQQRSNWPLTF,0.000533,0.000224,0.000991,0.000348,0.0,5e-06,0.000128,0.000421,0.000248,0.000329,6.7e-05,0.000782,0.001084,0.000171,0.000452,0.000869,0.317388,0.19089


In [20]:
score = pd.DataFrame(np.power(F_L_high_expression_high_correlation.drop(['correlation',"fined_correlation"],axis=1)-F_Ht, 2).sum(axis=1), columns=['score'])
cov_match = []
for idx, row in F_L_high_expression_high_correlation.iterrows():
    cov_match.append(row['fined_correlation'])
score['correlation'] = cov_match
score['distance'] = np.power(np.power(score['score'], 2) + np.power(score['correlation']-1,2), 1/2)
score = score.sort_values('distance')
score

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score,correlation,distance
v_call,j_call,junction_aa,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
IGKV1-39,IGKJ1,CQQSYSTPRTF,4.7e-05,0.870098,0.129902
IGLV1-51,IGLJ2,CGTWDSSLSAVVF,5.3e-05,0.786158,0.213842
IGKV4-1,IGKJ2,CQQYYRTPYTF,4.6e-05,0.73462,0.26538
IGKV1-33,IGKJ2,CQQYDNLPYTF,3.9e-05,0.727116,0.272884
IGKV4-1,IGKJ4,CQQYYSSPLTF,6.9e-05,0.709476,0.290524
IGKV4-1,IGKJ2,CQQYYSTPYTF,3.5e-05,0.697579,0.302421
IGKV3-20,IGKJ1,CQQYGSSPRTF,2.6e-05,0.67796,0.32204
IGKV4-1,IGKJ2,CQQYYTTPYTF,5.1e-05,0.609936,0.390064
IGLV3-21,IGLJ2,CQVWDSSSDHVVF,3.8e-05,0.557559,0.442441
IGKV1-39,IGKJ4,CQQSYSTPLTF,6.9e-05,0.334727,0.665273


In [18]:
score_viz = score.reset_index()
score_viz

Unnamed: 0,v_call,j_call,junction_aa,score,correlation,distance
0,IGKV1-39,IGKJ1,CQQSYSTPRTF,4.7e-05,0.870098,0.129902
1,IGLV1-51,IGLJ2,CGTWDSSLSAVVF,5.3e-05,0.786158,0.213842
2,IGKV4-1,IGKJ2,CQQYYRTPYTF,4.6e-05,0.73462,0.26538
3,IGKV1-33,IGKJ2,CQQYDNLPYTF,3.9e-05,0.727116,0.272884
4,IGKV4-1,IGKJ4,CQQYYSSPLTF,6.9e-05,0.709476,0.290524
5,IGKV4-1,IGKJ2,CQQYYSTPYTF,3.5e-05,0.697579,0.302421
6,IGKV3-20,IGKJ1,CQQYGSSPRTF,2.6e-05,0.67796,0.32204
7,IGKV4-1,IGKJ2,CQQYYTTPYTF,5.1e-05,0.609936,0.390064
8,IGLV3-21,IGLJ2,CQVWDSSSDHVVF,3.8e-05,0.557559,0.442441
9,IGKV1-39,IGKJ4,CQQSYSTPLTF,6.9e-05,0.334727,0.665273


In [35]:
from sklearn.cluster import DBSCAN
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import random

row = ('IGKV1-39','IGKJ1','CQQSYSTPRTF')

ls1 = F_L_high_expression.loc[row].reset_index(drop=True)
ls2 = pd.Series(F_Ht)
[print(i,j) for i,j in zip(ls1, ls2)]

fig = make_subplots(rows=1, cols=2, subplot_titles=(f"Original data - Cov: {ls1.corr(ls2)}", f"Clean data - Cov: {customized_correlation(ls1, ls2)}"))

points = np.array([[x,y] for x,y in zip(ls2, ls1)])
clustering = DBSCAN(eps=0.001, min_samples=1).fit(points)
labels = clustering.labels_.astype(str)
# [print(i,j) for i,j in zip(points, labels)]
colors = random.sample(['black', 'blue', 'green', 'orange', 'purple', 'red'], len(set(labels)))

ls1_new = []
ls2_new = []
for idx, g in enumerate(set(labels)):
    indices = np.where(labels == g)
    x, y = np.mean(points[indices], axis=0)
    fig.add_trace(go.Scatter(x=[x], y=[y], mode="markers", marker_color=f'{colors[idx]}'), row=1, col=2
    )

for idx, g in enumerate(set(labels)):
    indices = np.where(labels == g)
    fig.add_trace(
        go.Scatter(x=ls2.iloc[indices], y=ls1.iloc[indices], mode="markers", marker_color=f'{colors[idx]}'), row=1, col=1
    )

print(colors)
# [print(i,j) for i,j in zip(ls1_new, ls2_new)]
m = np.max(points)
fig.update_yaxes(range=[-0.0005, m+0.0005])
fig.update_xaxes(range=[-0.0005, m+0.0005])
fig.update_layout(title_text="_".join(row), height=1000, width=2000, showlegend=False)
fig.show()

0.0023226592544644555 0.006292134831460674
0.0007773724810141721 0.004469511546238161
0.00019824880224681975 0.0009637006103437199
0.00027077994294279775 0.00012806556957162068
0.001215066828675577 0.0006035003017501509
0.00010236514671484653 5.992688919518188e-05
0.0002563554795983764 0.0013831258644536654
0.00010532968190436065 0.0018450184501845018
0.0007226902143228167 0.0002300966405890474
0.0009856532690833424 0.0019230769230769232
8.871605686699245e-05 0.0002875215641173088
0.0005861664712778429 0.0016835016835016834
0.000333667000333667 0.002779708130646282
0.0 0.0004141644232760406
0.0004883349000744711 9.972078181092939e-05
0.0003803727653100038 0.0023228803716608595
['blue', 'red', 'purple']


In [40]:
fig = px.scatter(score, x="score", y="correlation", width=800, height=800, hover_data=[score.index.values], title="LUAD - Heavy chain: IGHV3-30_IGHJ6_CARDSYGMDVW")
fig.update_yaxes(range = [-0.01,1.01])
fig.update_traces(hovertemplate = 'Light chain: %{customdata[0]}<br>Score: %{x}<br>Correlation: %{y}')
# fig.update_yaxes(range=[-0.05, 1.05])
# fig.update_xaxes(range=[-0.05, 1.05])
fig.show()
# fig.write_html("testing.html")

In [None]:
light_path = Path(f"/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal/{cancer}/light/clustering_group")
for f in light_path.glob("**/*.tsv"):
    if int(f.parent.name)<10:
        continue
    # heavy = pd.read_csv(str(f).replace("light","heavy"), sep='\t', usecols=['v_call','j_call','junction_aa'])
    temp = pd.read_csv(f)
    print(f.stem)
    print(temp)
    break

In [58]:
cancers, heavys, lights, scores = [],[],[], []
for folder in Path("/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal").glob("*"):
    cancer = folder.name
    for lightchain in folder.joinpath("light").joinpath("clustering_group").glob("**/*.tsv"):
        temp = pd.read_csv(lightchain)
        for _, row in temp.iterrows():
            cancers.append(cancer)
            heavys.append(lightchain.stem)
            lights.append(row['v_call']+"_"+row['j_call']+"_"+row['junction_aa'])
            scores.append(row['score'])
potential_ig = pd.DataFrame({"cancer":cancers,"heavy":heavys,"lights":lights,"score":scores})
potential_ig

Unnamed: 0,cancer,heavy,lights,score
0,UCEC,IGHV1-18_IGHJ5_CARVVSVGASNWFDPW,IGKV1-39_IGKJ1_CQQSYTTPWTF,0.006430
1,UCEC,IGHV1-18_IGHJ5_CARVVSVGASNWFDPW,IGLV3-10_IGLJ3_CYSTDSSGNHWVF,0.010014
2,STAD,IGHV1-24_IGHJ4_CATDYDYW,IGKV1-16_IGKJ4_CQQYNSYPLTF,0.000001
3,STAD,IGHV1-24_IGHJ4_CATDYDYW,IGKV3-15_IGKJ1_CQQYNTWPRTF,0.000002
4,STAD,IGHV1-24_IGHJ4_CATDYDYW,IGKV1-39_IGKJ2_CQQSYSTPRTF,0.000002
...,...,...,...,...
1015,KIRC,IGHV1-69-2_IGHJ4_CAVPDHYHVLTGYYAPPRYW,IGKV2-28_IGKJ2_CMQTLQAAWTF,0.001289
1016,KIRC,IGHV1-69-2_IGHJ4_CAVPDHYHVLTGYYAPPRYW,IGKV2-28_IGKJ2_CMQALQAAWTF,0.001786
1017,KIRC,IGHV1-69-2_IGHJ4_CAVPDHYHVLTGYYAPPRYW,IGKV2-28_IGKJ2_CMQALQPAWTF,0.001616
1018,KIRC,IGHV1-69-2_IGHJ4_CAVPDHYHVLTGYYAPPRYW,IGKV3-20_IGKJ4_CQQFSISPPTF,0.001909


In [59]:
potential_ig['cancer'].value_counts()

STAD    461
LUSC    432
LUAD    110
KIRC      6
BRCA      5
TGCT      4
UCEC      2
Name: cancer, dtype: int64

In [72]:
cancers, heavys, lights, scores, patients = [],[],[],[], []
for folder in Path("/rsrch4/scratch/mol_cgenesis/nkdang/CDR3/results/portal").glob("*"):
    cancer = folder.name
    for lightchain in folder.joinpath("light").joinpath("clustering_group").glob("**/*.tsv"):
        temp_light = pd.read_csv(lightchain)
        patient = pd.read_csv(str(lightchain).replace("light","heavy"), sep='\t')['patient_id'].nunique()
        for _, row in temp.iterrows():
            cancers.append(cancer)
            patients.append(patient)
            heavys.append(lightchain.stem)
            lights.append(row['v_call']+"_"+row['j_call']+"_"+row['junction_aa'])
            scores.append(row['score'])
potential_ig_5 = pd.DataFrame({"cancer":cancers,"heavy":heavys,"lights":lights,"score":scores,"patient":patients})
potential_ig_5['cancer'].value_counts()

LUSC    295
STAD    225
LUAD    190
BRCA    150
KIRC     75
TGCT     50
HNSC     45
UCEC     30
COAD     30
THCA     30
OV       25
ESCA     20
CESC     15
SKCM     10
PAAD     10
LIHC      5
LAML      5
READ      5
Name: cancer, dtype: int64

In [73]:
potential_ig_5[potential_ig_5['cancer']=='UCEC']

Unnamed: 0,cancer,heavy,lights,score,patient
0,UCEC,IGHV3-30_IGHJ6_CARDSYGMDVW,IGKV1-33_IGKJ2_CQQYDNLPYTF,0.000293,7
1,UCEC,IGHV3-30_IGHJ6_CARDSYGMDVW,IGLV2-18_IGLJ3_CSSYTSSSTWVF,0.000396,7
2,UCEC,IGHV3-30_IGHJ6_CARDSYGMDVW,IGKV1-5_IGKJ1_CQQYNSYPWTF,0.000262,7
3,UCEC,IGHV3-30_IGHJ6_CARDSYGMDVW,IGLV3-1_IGLJ2_CQAWDSSTVIF,0.000197,7
4,UCEC,IGHV3-30_IGHJ6_CARDSYGMDVW,IGKV4-1_IGKJ2_CQQYYSTPYTF,0.000296,7
5,UCEC,IGHV4-39_IGHJ5_CARHHGSGFSWFDPW,IGKV1-33_IGKJ2_CQQYDNLPYTF,0.000293,7
6,UCEC,IGHV4-39_IGHJ5_CARHHGSGFSWFDPW,IGLV2-18_IGLJ3_CSSYTSSSTWVF,0.000396,7
7,UCEC,IGHV4-39_IGHJ5_CARHHGSGFSWFDPW,IGKV1-5_IGKJ1_CQQYNSYPWTF,0.000262,7
8,UCEC,IGHV4-39_IGHJ5_CARHHGSGFSWFDPW,IGLV3-1_IGLJ2_CQAWDSSTVIF,0.000197,7
9,UCEC,IGHV4-39_IGHJ5_CARHHGSGFSWFDPW,IGKV4-1_IGKJ2_CQQYYSTPYTF,0.000296,7
