In [1]:
import sys
sys.path.insert(1, '/Users/labadmin/Projects/immusign')
import shutil
import os.path

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
np.float = float
np.bool = bool
np.int = int
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
import time
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import utils
import shap
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# Read Filereport

In [4]:
filereport = pd.read_csv("intega/filereport_read_run_PRJEB55475_tsv.txt",delim_whitespace=True)
filereport["filenames"] = filereport["sra_ftp"].apply(lambda x: "_".join(x.split("/")[-1].split(".")[0].split("_")[0:2]))
filereport["stripped_patient_id"] = filereport["filenames"].apply(utils.get_stripped_pat_no)

In [7]:
for file in tqdm(filereport.filenames):
    file_name = file + ".clones.txt"
    dest_path = "intega/Intega_final/"
    src_path = "intega/Intega/"
    if not os.path.exists(dest_path + file_name):
        shutil.copy2( src_path + file_name, dest_path)

  0%|          | 0/146 [00:00<?, ?it/s]

# Load Data read from R-Script

In [5]:
import rpy2.robjects as robjects
robjects.r['load']("intega/final_intega_40000_reads_with_out_of_frame.RData")
twb = robjects.r['twb']

In [6]:
df = utils.convert_rtwb_to_pdtwb(twb)

In [7]:
df["clones.txt.name"] = df["sample"].apply(lambda x: x + ".clones.txt")

# Add metric info

In [8]:
stats = pd.read_csv("intega/output/stats.csv", sep=";")
stats.rename(columns={stats.columns[0]:"sample"}, inplace=True)

In [9]:
df = df.merge(stats.iloc[:-2], on = "sample")

In [22]:
len(df["clones.txt.name"].unique())

132

# Get meta info

## get disposition data

In [10]:
xl_file = pd.ExcelFile("intega/disposition_2022-07-01.xlsx")

dfs = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}
disposition = dfs["disposition"]

In [12]:
disposition.columns

Index(['Pat-No.', 'Age [y]', 'Gender (M/F)', 'Random No.',
       'Treatment as Randomized', 'Treatment as Treated',
       'Prior Surgery (Yes/No)', 'HER2 Status Local (IHC 2+/3+)',
       'HER2 Status Central', 'Tumor Proportion Score', 'Immune Cell Score',
       'Combined Positive Score', 'Prior Drug Therapy',
       'Therapy Duration, Any Component [mo]',
       'Therapy Duration, All Scheduled Components [mo]',
       'Therapy Duration, Nivolumab [mo]', 'Reason for End of Treatment',
       'Overall Best Response (CR/PR)', 'Time to Best Response [mo]',
       'Response at 1st Tumor Assessment (PD/SD/PR/CR/NA)',
       'Sum of Target Lesion Diameters [mm] at Screening',
       '% Change of Diameters (target lesions) at 1st Tumor Assessment',
       'Best % Change of Diameters (Taget Lesions) during Tumor Assessments',
       'PFS [mo]', 'Censoring (Yes/No)', 'No of AEs (Grade 3-5)', 'No of SAEs',
       'OS [mo]', 'Death (Yes/No)'],
      dtype='object')

## get results data

In [149]:
xl_file = pd.ExcelFile("intega/Ergebnisse_CDC_Her2_Aug2021_TRB.Immun_Metriken_INTEGA.xlsx")

dfs = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}
Ergebnisse_CDC_Her2 = dfs["Ergebnisse_CDC_Her2_Aug2021_INT"]
Ergebnisse_CDC_Her2.columns = Ergebnisse_CDC_Her2.iloc[0]
Ergebnisse_CDC_Her2 = Ergebnisse_CDC_Her2.iloc[1:].copy()


new_colums = [ 'Zentrum',              'Pat-ID',
                'Patient-ID',               'Probe',                   np.nan,
             'Probenabnahme',       'Datum_Analyse',                'CTC1',
                    'HER2-0',              'HER2-1',              'HER2-2',
                    'HER2-3',                 'CXC',             'PD-L1-0',
                   'PD-L1-1',             'PD-L1-2',             'PD-L1-3',
                'Kommentare1',                   "Kommentare2",                   "Kommentare3",
                         np.nan,                  'ID',           'all.names',
                 'Clonality',               'Group',           'Diversity',
                     'Group',            'Richness',               'Group']
Ergebnisse_CDC_Her2.columns = new_colums

In [150]:
Ergebnisse_CDC_Her2

Unnamed: 0,Zentrum,Pat-ID,Patient-ID,Probe,NaN,Probenabnahme,Datum_Analyse,CTC1,HER2-0,HER2-1,...,Kommentare3,NaN.1,ID,all.names,Clonality,Group,Diversity,Group.1,Richness,Group.2
1,1,1,0001-001,Baseline,1.0,2018-03-08 00:00:00,2018-03-13 00:00:00,1,,,...,,BL29,0001-001,Svenja-TRB-0001-001-BL-INTEGA_S100,0.094284,BL,9.551617,BL,1495,BL
2,1,1,0001-001,1Tag2Zyk,2.0,2018-04-05 00:00:00,2018-04-09 00:00:00,0,,,...,,C24,0001-001,Svenja-TRB-0001-001-C2-INTEGA_S101,0.08677,C2,9.816343,C2,1721,C2
3,1,1,0001-001,1Tag4-5,3.0,2018-06-04 00:00:00,2018-06-06 00:00:00,0,,,...,,,,,,,,,,
4,1,1,0001-001,ProEnd,4.0,2018-11-29 00:00:00,2018-11-30 00:00:00,1,1,,...,,,,,,,,,,
5,1,2,0001-002,Baseline,1.0,2019-02-22 00:00:00,2019-02-26 00:00:00,5,5,,...,,BL23,0001-002,Svenja-TRB-0001-002-BL-INTEGA_S79,0.082909,BL,9.154001,BL,1011,BL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,38,5,0038-005,1Tag4-5,3.0,2020-04-01 00:00:00,2020-04-06 00:00:00,142,130,12,...,,,0038-005,,0.124895,C2,8.294035,C2,713,C2
227,39,1,0039-001,Baseline,1.0,2019-11-21 00:00:00,2019-11-22 00:00:00,0,,,...,,,0039-001,NC-hs-TRB-0039-001-Baseline-PB-gDNA_S66,0.053723,BL,10.474131,BL,2148,BL
228,,,,,,,,,,,...,,,,,,,,,,
229,39,1,0039-001,ProEnd,4.0,2020-01-28 00:00:00,2020-01-30 00:00:00,0,,,...,,,,,,,,,,


## get probe review

In [266]:
xl_file = pd.ExcelFile("intega/Probenübersicht Sequenzierungsstatus Intega-Studie alle Zeitpunkte_LP.xlsx")

proben_df = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}
proben_df = proben_df["Probe_vorhanden"]

In [267]:
proben_df.keys()

Index(['Status', 'Patient', 'Tumorproben', 'Tumor', 'BL', 'C2', 'C5', 'EOT/PD',
       'Notizen', 'Cycle_EOT', 'Datum BL', 'Datum C2_sample',
       'Datum C5_sample', 'Datum EOT_sample', 'Datum_EOT_real',
       'week C2 sample', 'week C5 sample', 'week EOT sample',
       'week_to_EOT_C2_sample', 'week_to_EOT_C5_sample',
       'week_to_EOT_EOT_sample', 'Unnamed: 21', 'ja: Probe vorhanden'],
      dtype='object')

# Merge meta info

## merge dispistion 

In [23]:
df["stripped_patient_id"] = df["sample"].apply(utils.get_stripped_pat_no)

In [24]:
disposition["stripped_patient_id"] = disposition["Pat-No."].apply(utils.get_stripped_pat_no)

In [28]:
print("Patients not in disposition: ", set(df["stripped_patient_id"].unique()).difference(set(disposition["stripped_patient_id"].unique())))
print("Patients not in twb data: ", set(disposition["stripped_patient_id"].unique()).difference(set(df["stripped_patient_id"].unique())))

Patients not in disposition:  {'38-1'}
Patients not in twb data:  {'15-4', '22-3', '16-7', '1-3', '32-4', '6-1', '35-5', '2-4'}


In [33]:
df = df.merge(disposition, on = "stripped_patient_id", how = "left")

## merge results

In [66]:
Ergebnisse_CDC_Her2.columns

Index([      'Zentrum',        'Pat-ID',    'Patient-ID',         'Probe',
                   nan, 'Probenabnahme', 'Datum_Analyse',          'CTC1',
              'HER2-0',        'HER2-1',        'HER2-2',        'HER2-3',
                 'CXC',       'PD-L1-0',       'PD-L1-1',       'PD-L1-2',
             'PD-L1-3',   'Kommentare1',   'Kommentare2',   'Kommentare3',
                   nan,            'ID',     'all.names',     'Clonality',
               'Group',     'Diversity',         'Group',      'Richness',
               'Group'],
      dtype='object')

In [155]:
str(None)

'None'

In [156]:
Ergebnisse_CDC_Her2["stripped_patient_id"] = Ergebnisse_CDC_Her2["all.names"].apply(utils.get_stripped_pat_no)
Ergebnisse_CDC_Her2["Probe_cleaned"] = Ergebnisse_CDC_Her2["Probe"].apply(lambda x: "BL" if x in ["Baseline", "BL"] 
                                                                          else ( "C2" if "2Zyk" in str(x) else x))

In [157]:
Ergebnisse_CDC_Her2.loc[Ergebnisse_CDC_Her2["stripped_patient_id"].isnull(), "stripped_patient_id"] = Ergebnisse_CDC_Her2.loc[Ergebnisse_CDC_Her2["stripped_patient_id"].isnull(), "Patient-ID"].apply(utils.get_stripped_pat_no)

In [158]:
print("Patients not in results: ", set(df["stripped_patient_id"].unique()).difference(set(Ergebnisse_CDC_Her2["stripped_patient_id"].unique())))
print("Patients not in twb data: ", set(Ergebnisse_CDC_Her2["stripped_patient_id"].unique()).difference(set(df["stripped_patient_id"].unique())))

Patients not in results:  set()
Patients not in twb data:  {'3-19', '22-3', '9-1', '16-7', '1-3', '32-4', '1-6', '6-1', '15-8', None, '2-4'}


In [159]:
Ergebnisse_per_probe_patient = Ergebnisse_CDC_Her2.groupby(["Probe_cleaned", "stripped_patient_id"]).apply(len).reset_index()

In [160]:
Ergebnisse_per_probe_patient[Ergebnisse_per_probe_patient[0] > 1] 

Unnamed: 0,Probe_cleaned,stripped_patient_id,0
6,1Tag4-5,16-3,2
20,1Tag4-5,38-1,2
32,BL,12-1,2
54,BL,2-11,2
77,BL,27-2,2
103,BL,6-2,2
108,BL,7-6,2
126,C2,16-3,2
131,C2,2-10,2
140,C2,2-5,2


In [161]:
double_probe_patient = Ergebnisse_per_probe_patient[Ergebnisse_per_probe_patient[0] > 1][["Probe_cleaned", "stripped_patient_id"]]

### mehrere ergebnisse für gleiche patienten --> filereport_read_run_PRJEB55475_tsv

In [163]:
filereport = pd.read_csv("intega/filereport_read_run_PRJEB55475_tsv.txt",delim_whitespace=True)

In [165]:
filereport.head()

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp
0,ERR10747912,SAMEA112286929,ERX10200346,PRJEB55475,9606,Homo,sapiens,ftp.sra.ebi.ac.uk/vol1/fastq/ERR107/012/ERR107...,ftp.sra.ebi.ac.uk/vol1/run/ERR107/ERR10747912/...
1,ERR10747914,SAMEA112286931,ERX10200348,PRJEB55475,9606,Homo,sapiens,ftp.sra.ebi.ac.uk/vol1/fastq/ERR107/014/ERR107...,ftp.sra.ebi.ac.uk/vol1/run/ERR107/ERR10747914/...
2,ERR10747915,SAMEA112286932,ERX10200349,PRJEB55475,9606,Homo,sapiens,ftp.sra.ebi.ac.uk/vol1/fastq/ERR107/015/ERR107...,ftp.sra.ebi.ac.uk/vol1/run/ERR107/ERR10747915/...
3,ERR10747916,SAMEA112286933,ERX10200350,PRJEB55475,9606,Homo,sapiens,ftp.sra.ebi.ac.uk/vol1/fastq/ERR107/016/ERR107...,ftp.sra.ebi.ac.uk/vol1/run/ERR107/ERR10747916/...
4,ERR10747919,SAMEA112286936,ERX10200353,PRJEB55475,9606,Homo,sapiens,ftp.sra.ebi.ac.uk/vol1/fastq/ERR107/019/ERR107...,ftp.sra.ebi.ac.uk/vol1/run/ERR107/ERR10747919/...


In [166]:
filereport["filenames"] = filereport["sra_ftp"].apply(lambda x: x.split("/")[-1].split(".")[0])

In [167]:
filereport["stripped_patient_id"] = filereport["filenames"].apply(utils.get_stripped_pat_no)

In [170]:
for i, row in double_probe_patient.iterrows(): 
    id = row.stripped_patient_id
    probe = row.Probe_cleaned
    print(id, probe)
    if probe == "Baseline":
        probe = ["Baseline", "BL"]
    else:
        probe = [probe]
    
    filename_id = filereport[filereport["stripped_patient_id"] == id]["filenames"]
    print(list(filename_id))
    for file in filename_id:
        for p in probe:
            if p in file:
                print("Match: ",file)
    print("\n")

16-3 1Tag4-5
['NC-hs-TRB-0016-003-2-Zyklus-PB-gDNA_S32_L001_R2_001', 'Dona-hs-TRB-016-003-PB-Baseline-gDNA_S124_L001_R2_001']


38-1 1Tag4-5
['NC-hs-TRB-0038-001-Baseline-PB-gDNA_S11_L001_R2_001', 'NC-hs-TRB-0038-001-2-Zyklus-PB-gDNA_S12_L001_R2_001']


12-1 BL
['NC-hs-TRB-0012-001-Baseline-PB-gDNA_S23_L001_R2_001', 'Svenja-TRB-0012-001-C2-INTEGA_S119_L001_R2_001']


2-11 BL
['NC-hs-TRB-0002-011-Baseline-PB-gDNA_S16_L001_R2_001', 'Svenja-TRB-0002-011-C2-INTEGA_S153_L001_R2_001']


27-2 BL
['Svenja-TRB-0027-002-C2-INTEGA_S141_L001_R2_001', 'Svenja-TRB-0027-002-BL-INTEGA_S140_L001_R2_001']
Match:  Svenja-TRB-0027-002-BL-INTEGA_S140_L001_R2_001


6-2 BL
['NC-hs-TRB-0006-002-Baseline-PB-gDNA_S18_L001_R2_001']


7-6 BL
['NC-hs-TRB-0007-006-Baseline-PB-gDNA_S22_L001_R2_001']


16-3 C2
['NC-hs-TRB-0016-003-2-Zyklus-PB-gDNA_S32_L001_R2_001', 'Dona-hs-TRB-016-003-PB-Baseline-gDNA_S124_L001_R2_001']


2-10 C2
['Svenja-TRB-0002-010-BL-INTEGA_S152_L001_R2_001', 'NC-hs-TRB-0002-010-2-Zyklus-PB-gDNA

In [231]:
idx_to_drop = [143, 217,183]
# overwrite 109 with 110 
idx_to_overwrite = {109:110, 48:49, 85:86, 102:103, 140:141, 46:47, 31:32, 42:43, 73:74, 93:94, 96:97, 99:100}

### Overwrite rows with results found in file report

In [228]:
for i in idx_to_overwrite.keys():
    j = idx_to_overwrite[i]
    Ergebnisse_CDC_Her2.loc[i,'CTC1': 'Probe_cleaned'] = Ergebnisse_CDC_Her2.loc[j,'CTC1': 'Probe_cleaned']
    Ergebnisse_CDC_Her2.drop(j,inplace=True)

### Drop rows with double entry 

In [233]:
Ergebnisse_CDC_Her2.loc[idx_to_drop]

Unnamed: 0,Zentrum,Pat-ID,Patient-ID,Probe,NaN,Probenabnahme,Datum_Analyse,CTC1,HER2-0,HER2-1,...,ID,all.names,Clonality,Group,Diversity,Group.1,Richness,Group.2,stripped_patient_id,Probe_cleaned
143,16,3,0016-003,1Tag4-5,3.0,2019-08-07 00:00:00,2019-08-09 00:00:00,5,5.0,0.0,...,,,,,,,,,16-3,1Tag4-5
217,38,1,0038-001,1Tag4-5,3.0,2019-04-16 00:00:00,2019-04-18 00:00:00,0,,,...,,,,,,,,,38-1,1Tag4-5
183,27,2,0027-002,Baseline,1.0,2019-03-13 00:00:00,2019-03-15 00:00:00,10,0.0,1.0,...,,,,,,,,,27-2,BL


In [234]:
Ergebnisse_CDC_Her2.drop(idx_to_drop, inplace=True)

In [252]:
Ergebnisse_CDC_Her2["clones.txt.name"] = Ergebnisse_CDC_Her2["all.names"].apply(lambda x: x + ".clones.txt" if not pd.isnull(x) else x)

In [282]:
df_results = df.merge(Ergebnisse_CDC_Her2, on = ["clones.txt.name", "stripped_patient_id"], how = "left")

In [283]:
len(df_results)

249933

In [278]:
df_results[["Clonality_x","Clonality_y"]]

Unnamed: 0,Clonality_x,Clonality_y
0,0211129499744678,0.225941
1,0211129499744678,0.225941
2,0211129499744678,0.225941
3,0211129499744678,0.225941
4,0211129499744678,0.225941
...,...,...
249928,024876120545219,0.250719
249929,024876120545219,0.250719
249930,024876120545219,0.250719
249931,024876120545219,0.250719


## merge proben

In [268]:
proben_df

Unnamed: 0,Status,Patient,Tumorproben,Tumor,BL,C2,C5,EOT/PD,Notizen,Cycle_EOT,...,Datum EOT_sample,Datum_EOT_real,week C2 sample,week C5 sample,week EOT sample,week_to_EOT_C2_sample,week_to_EOT_C5_sample,week_to_EOT_EOT_sample,Unnamed: 21,ja: Probe vorhanden
0,,0001-001,1 Paraffinblock,seq,seq,seq,seq,seq,,C16,...,2018-11-29 00:00:00,2018-11-29 00:00:00,4.0,13.0,38.0,-34.0,-26.0,0.0,,NA: Probe nicht vorhanden
1,,0001-002,1 Paraffinblock,seq,seq,seq,seq,seq,,C19,...,2019-11-28 00:00:00,2019-11-28 00:00:00,3.0,9.0,40.0,-38.0,-32.0,0.0,,seq: Probe vorhanden und sequenziert mit GeneP...
2,,0001-003,1 Paraffinblock,seq,?,cf,Plasma,seq,"BL, C2 und C5 als 0038-001 beschriftet, Patien...",C16,...,2019-10-15 00:00:00,2019-10-15 00:00:00,3.0,9.0,35.0,-32.0,-26.0,0.0,,"cf: cfDNA isoliert, aber noch nicht mit GenePa..."
3,,0001-004,1 Parafinblock,seq,seq,seq,seq,,,C24,...,,2020-09-15 00:00:00,4.0,13.0,54.0,-51.0,-42.0,,,"g: gDNA isoliert, noch kein Gene Panel erstell..."
4,,0002-001,1 Parafinblock,seq,seq,seq,,,,C14,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,,0038-002,1 Paraffinblock,seq,seq,seq,seq,seq,,C31,...,2020-11-03 00:00:00,2020-11-03 00:00:00,3.0,12.0,64.0,-61.0,-52.0,0.0,,
94,,0038-003,1 Paraffinblock,seq,seq,cf,,seq,C3 statt C2,C7,...,2020-01-21 00:00:00,2020-01-21 00:00:00,9.0,,22.0,-13.0,,0.0,,
95,,0038-004,1 Paraffinblock,seq,seq,EOT,,seq,,C2,...,2019-12-11 00:00:00,2019-12-11 00:00:00,,,3.0,,,0.0,,
96,,0038-005,1 Paraffinblock,seq,seq,Panel,,,,C3,...,,2020-04-15 00:00:00,3.0,,5.0,-2.0,,,,


In [272]:
df_results.columns

Index([               'Umi.count',           'Umi.proportion',
                     'Read.count',          'Read.proportion',
       'CDR3.nucleotide.sequence', 'CDR3.amino.acid.sequence',
                         'V.gene',                   'J.gene',
                         'D.gene',                    'V.end',
       ...
                             'ID',                'all.names',
                    'Clonality_y',                    'Group',
                      'Diversity',                    'Group',
                       'Richness',                    'Group',
          'stripped_patient_id_y',            'Probe_cleaned'],
      dtype='object', length=104)

In [269]:
proben_df["stripped_patient_id"] = proben_df["Patient"].apply(utils.get_stripped_pat_no)

In [279]:
print("Patients not in disposition: ", set(df_results["stripped_patient_id"].unique()).difference(set(proben_df["stripped_patient_id"].unique())))
print("Patients not in twb data: ", set(proben_df["stripped_patient_id"].unique()).difference(set(df_results["stripped_patient_id"].unique())))

Patients not in disposition:  set()
Patients not in twb data:  {'24-1', '15-4', '32-2', '22-3', '9-1', '16-7', '16-4', '1-3', '32-4', '5-5', '6-1', '15-8', '35-4', '33-2', '35-5', '22-5', '2-4'}


In [285]:
df_results = df_results.merge(proben_df, on = ["stripped_patient_id"], how = "left")

In [290]:
df_results.head()

Unnamed: 0,Umi.count,Umi.proportion,Read.count,Read.proportion,CDR3.nucleotide.sequence,CDR3.amino.acid.sequence,V.gene,J.gene,D.gene,V.end,...,Datum EOT_sample,Datum_EOT_real,week C2 sample,week C5 sample,week EOT sample,week_to_EOT_C2_sample,week_to_EOT_C5_sample,week_to_EOT_EOT_sample,Unnamed: 21,ja: Probe vorhanden
0,8221.0,0.208125,8221.0,0.208125,TGCAGCGTTGAAGGTGGGACGCGCAATGAGCAGTTCTTC,CSVEGGTRNEQFF,TRBV29-1,TRBJ2-1,"TRBD1, TRBD2",195,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."
1,930.0,0.023522,930.0,0.023522,TGTGCCAAAAATATTGGAGGTGACACCGGGGAGCTGTTTTTT,CAKNIGGDTGELFF,TRBV30,TRBJ2-2,TRBD2,177,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."
2,707.0,0.017899,707.0,0.017899,TGCAGCGACCCGACGGTATGCCGGACGACGCCACCGGGGAGCTGTT...,CSDPTVCR~DATGELFF,TRBV29-1,TRBJ2-2,,189,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."
3,357.0,0.009036,357.0,0.009036,TGTGCCAGCAGTGAGGGGAGCGGGAGTGGCACAGATACGCAGTATTTT,CASSEGSGSGTDTQYF,TRBV6-1,TRBJ2-3,TRBD2,190,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."
4,320.0,0.008078,320.0,0.008078,TGTGCCAGCAGTGAAGGGGCCGGGGCCAACGTCCTGACTTTC,CASSEGAGANVLTF,TRBV6-1,TRBJ2-6,TRBD2,192,...,,2018-10-10 00:00:00,3.0,,31.0,-28.0,,,,"Panel: PanelUPCR bereits erstellt, jedoch noch..."


In [291]:
#df_results.to_pickle("intega/intega_40000_reads_with_out_of_frame_merged_dispo_results_proben.pkl")

# Load raw file info

In [293]:
df_raw = utils.read_clones_txt(np.unique(df["clones.txt.name"].values), "intega/Intega_final")

  0%|          | 0/132 [00:00<?, ?it/s]

In [295]:
df_results.columns

Index([               'Umi.count',           'Umi.proportion',
                     'Read.count',          'Read.proportion',
       'CDR3.nucleotide.sequence', 'CDR3.amino.acid.sequence',
                         'V.gene',                   'J.gene',
                         'D.gene',                    'V.end',
       ...
               'Datum EOT_sample',           'Datum_EOT_real',
                 'week C2 sample',           'week C5 sample',
                'week EOT sample',    'week_to_EOT_C2_sample',
          'week_to_EOT_C5_sample',   'week_to_EOT_EOT_sample',
                    'Unnamed: 21',      'ja: Probe vorhanden'],
      dtype='object', length=126)

In [296]:
df_raw.columns

Index(['cloneId', 'cloneCount', 'cloneFraction', 'nSeqCDR3', 'aaSeqCDR3',
       'bestVGene', 'bestDGene', 'bestJGene', 'vBestIdentityPercent',
       'lengthOfCDR3', 'targetSequences', 'allVGenes', 'allDGenes',
       'allJGenes', 'allVHits', 'allDHits', 'allJHits', 'allVAlignments',
       'allDAlignments', 'allJAlignments', 'clones.txt.name'],
      dtype='object')

## Add correct clone Id before adding info from raw files

In [302]:
for file in tqdm(df_results["clones.txt.name"].unique()):
    df_sub_file = df_results[df_results["clones.txt.name"] == file]
    df_raw_sub_file = df_raw[df_raw["clones.txt.name"] == file]
    for i, index_row in enumerate(df_sub_file.iterrows()):
        index, row = index_row[0], index_row[1]  
        if row["CDR3.nucleotide.sequence"] == df_raw_sub_file.iloc[i]["nSeqCDR3"]:
            df_results.loc[index,"cloneId"] = df_raw_sub_file.iloc[i]["cloneId"]

  0%|          | 0/132 [00:00<?, ?it/s]

In [303]:
df_results2 = df_results.merge(df_raw, left_on=["clones.txt.name", "cloneId", "CDR3.nucleotide.sequence"], right_on =["clones.txt.name", "cloneId", "nSeqCDR3"], how = "left")

In [305]:
#df_results2.to_pickle("intega/intega_40000_reads_with_out_of_frame_merged_dispo_results_proben_raw_data.pkl")

In [307]:
list(df_results2.columns)

['Umi.count',
 'Umi.proportion',
 'Read.count',
 'Read.proportion',
 'CDR3.nucleotide.sequence',
 'CDR3.amino.acid.sequence',
 'V.gene',
 'J.gene',
 'D.gene',
 'V.end',
 'J.start',
 'D5.end',
 'D3.end',
 'VD.insertions',
 'DJ.insertions',
 'Total.insertions',
 'sample',
 'cloneId',
 'clones.txt.name',
 '#Nucleotide clones',
 '#Aminoacid clonotypes',
 '%Aminoacid clonotypes',
 '#In-frames',
 '%In-frames',
 '#Out-of-frames',
 '%Out-of-frames',
 'Sum.reads',
 'Min.reads',
 '1st Qu.reads',
 'Median.reads',
 'Mean.reads',
 '3rd Qu.reads',
 'Max.reads',
 'Sum.UMIs',
 'Min.UMIs',
 '1st Qu.UMIs',
 'Median.UMIs',
 'Mean.UMIs',
 '3rd Qu.UMIs',
 'Max.UMIs',
 'Clonality_x',
 'twb.shannon',
 'twb.simp',
 'stripped_patient_id',
 'Pat-No.',
 'Age [y]',
 'Gender (M/F)',
 'Random No.',
 'Treatment as Randomized',
 'Treatment as Treated',
 'Prior Surgery (Yes/No)',
 'HER2 Status Local (IHC 2+/3+)',
 'HER2 Status Central',
 'Tumor Proportion Score',
 'Immune Cell Score',
 'Combined Positive Score',
 'Pri

# Get rid of redundant information

In [308]:
df_results2.drop(columns = ['Umi.count', 'Umi.proportion', 'Sum.UMIs', 'Min.UMIs', '1st Qu.UMIs',
       'Median.UMIs', 'Mean.UMIs', '3rd Qu.UMIs', 'Max.UMIs', 'nSeqCDR3'],inplace=True)

In [311]:
list(df_results2.columns)

['Read.count',
 'Read.proportion',
 'CDR3.nucleotide.sequence',
 'CDR3.amino.acid.sequence',
 'V.gene',
 'J.gene',
 'D.gene',
 'V.end',
 'J.start',
 'D5.end',
 'D3.end',
 'VD.insertions',
 'DJ.insertions',
 'Total.insertions',
 'sample',
 'cloneId',
 'clones.txt.name',
 '#Nucleotide clones',
 '#Aminoacid clonotypes',
 '%Aminoacid clonotypes',
 '#In-frames',
 '%In-frames',
 '#Out-of-frames',
 '%Out-of-frames',
 'Sum.reads',
 'Min.reads',
 '1st Qu.reads',
 'Median.reads',
 'Mean.reads',
 '3rd Qu.reads',
 'Max.reads',
 'Clonality_x',
 'twb.shannon',
 'twb.simp',
 'stripped_patient_id',
 'Pat-No.',
 'Age [y]',
 'Gender (M/F)',
 'Random No.',
 'Treatment as Randomized',
 'Treatment as Treated',
 'Prior Surgery (Yes/No)',
 'HER2 Status Local (IHC 2+/3+)',
 'HER2 Status Central',
 'Tumor Proportion Score',
 'Immune Cell Score',
 'Combined Positive Score',
 'Prior Drug Therapy',
 'Therapy Duration, Any Component [mo]',
 'Therapy Duration, All Scheduled Components [mo]',
 'Therapy Duration, Niv

In [315]:
df_results2[['Pat-No.','Pat-ID','Patient-ID', 'Probe', 'ID', 'all.names', 'Group',
 'Probe_cleaned', 'Patient']]

Unnamed: 0,Pat-No.,Pat-ID,Patient-ID,Probe,ID,all.names,Group,Group.1,Group.2,Probe_cleaned,Patient
0,0002-001,1,0002-001,Baseline,002-001,Dona-hs-TRB-002-001-PB-Baseline-gDNA_S111,BL,BL,BL,BL,0002-001
1,0002-001,1,0002-001,Baseline,002-001,Dona-hs-TRB-002-001-PB-Baseline-gDNA_S111,BL,BL,BL,BL,0002-001
2,0002-001,1,0002-001,Baseline,002-001,Dona-hs-TRB-002-001-PB-Baseline-gDNA_S111,BL,BL,BL,BL,0002-001
3,0002-001,1,0002-001,Baseline,002-001,Dona-hs-TRB-002-001-PB-Baseline-gDNA_S111,BL,BL,BL,BL,0002-001
4,0002-001,1,0002-001,Baseline,002-001,Dona-hs-TRB-002-001-PB-Baseline-gDNA_S111,BL,BL,BL,BL,0002-001
...,...,...,...,...,...,...,...,...,...,...,...
249928,0035-001,1,0035-001,Baseline,0035-001,Svenja-TRB-0035-001-BL-INTEGA_S145,BL,BL,BL,BL,0035-001
249929,0035-001,1,0035-001,Baseline,0035-001,Svenja-TRB-0035-001-BL-INTEGA_S145,BL,BL,BL,BL,0035-001
249930,0035-001,1,0035-001,Baseline,0035-001,Svenja-TRB-0035-001-BL-INTEGA_S145,BL,BL,BL,BL,0035-001
249931,0035-001,1,0035-001,Baseline,0035-001,Svenja-TRB-0035-001-BL-INTEGA_S145,BL,BL,BL,BL,0035-001


In [316]:
df_results2.drop(columns = ['Pat-No.','Pat-ID','Patient-ID','ID', 'all.names', 'Group','Patient'],inplace=True)

In [317]:
df_results2[['Clonality_x',
 'twb.shannon',
 'twb.simp','Clonality_y','Diversity', 'Richness',]]

Unnamed: 0,Clonality_x,twb.shannon,twb.simp,Clonality_y,Diversity,Richness
0,0211129499744678,822992454789389,220566329934599,0.225941,7.659172,952
1,0211129499744678,822992454789389,220566329934599,0.225941,7.659172,952
2,0211129499744678,822992454789389,220566329934599,0.225941,7.659172,952
3,0211129499744678,822992454789389,220566329934599,0.225941,7.659172,952
4,0211129499744678,822992454789389,220566329934599,0.225941,7.659172,952
...,...,...,...,...,...,...
249928,024876120545219,800753241247974,462592236125888,0.250719,7.448633,983
249929,024876120545219,800753241247974,462592236125888,0.250719,7.448633,983
249930,024876120545219,800753241247974,462592236125888,0.250719,7.448633,983
249931,024876120545219,800753241247974,462592236125888,0.250719,7.448633,983


In [318]:
df_results2.rename(columns={"Clonality_x" : "Clonality", 
                            "Clonality_y" : "Clonality_results",
                           "Diversity" : "Diversity_results", 
                           "Richness" : "Richness_results"}, inplace=True)

In [321]:
#df_results2.to_pickle("intega/intega_40000_reads_with_out_of_frame_merged_dispo_results_proben_raw_data_cleaned.pkl")

# Small Analysis

In [3]:
df = pd.read_pickle("intega/intega_40000_reads_with_out_of_frame_merged_dispo_results_proben_raw_data_cleaned.pkl")

In [6]:
np.asarray(df.columns)

array(['Read.count', 'Read.proportion', 'CDR3.nucleotide.sequence',
       'CDR3.amino.acid.sequence', 'V.gene', 'J.gene', 'D.gene', 'V.end',
       'J.start', 'D5.end', 'D3.end', 'VD.insertions', 'DJ.insertions',
       'Total.insertions', 'sample', 'cloneId', 'clones.txt.name',
       '#Nucleotide clones', '#Aminoacid clonotypes',
       '%Aminoacid clonotypes', '#In-frames', '%In-frames',
       '#Out-of-frames', '%Out-of-frames', 'Sum.reads', 'Min.reads',
       '1st Qu.reads', 'Median.reads', 'Mean.reads', '3rd Qu.reads',
       'Max.reads', 'Clonality', 'twb.shannon', 'twb.simp',
       'stripped_patient_id', 'Age [y]', 'Gender (M/F)', 'Random No.',
       'Treatment as Randomized', 'Treatment as Treated',
       'Prior Surgery (Yes/No)', 'HER2 Status Local (IHC 2+/3+)',
       'HER2 Status Central', 'Tumor Proportion Score',
       'Immune Cell Score', 'Combined Positive Score',
       'Prior Drug Therapy', 'Therapy Duration, Any Component [mo]',
       'Therapy Duration, All Sc

In [9]:
df

Unnamed: 0,Read.count,Read.proportion,CDR3.nucleotide.sequence,CDR3.amino.acid.sequence,V.gene,J.gene,D.gene,V.end,J.start,D5.end,...,targetSequences,allVGenes,allDGenes,allJGenes,allVHits,allDHits,allJHits,allVAlignments,allDAlignments,allJAlignments
0,8221.0,0.208125,TGCAGCGTTGAAGGTGGGACGCGCAATGAGCAGTTCTTC,CSVEGGTRNEQFF,TRBV29-1,TRBJ2-1,"TRBD1, TRBD2",195,205,197,...,TGTACTGGTACCGTCAGCAACCTGGACAGAGCCTGACACTGATCGC...,TRBV29-1,"TRBD2,TRBD1",TRBJ2-1,TRBV29-1*00,"TRBD1*00,TRBD2*00",TRBJ2-1*00,94|289|310|0|195|ST97A|961.0,12|17|36|197|202||25.0;16|21|48|197|202||25.0,26|70|70|205|249||220.0
1,930.0,0.023522,TGTGCCAAAAATATTGGAGGTGACACCGGGGAGCTGTTTTTT,CAKNIGGDTGELFF,TRBV30,TRBJ2-2,TRBD2,177,193,186,...,TACTGGTACCGACAGGCTGCAGGCAGGGGCCTCCAGCTGCTCTTCT...,TRBV30,TRBD2,TRBJ2-2,TRBV30*00,TRBD2*00,TRBJ2-2*00,99|276|304|0|177||885.0,26|31|48|186|191||25.0,23|71|71|193|241||240.0
2,707.0,0.017899,TGCAGCGACCCGACGGTATGCCGGACGACGCCACCGGGGAGCTGTT...,CSDPTVCR~DATGELFF,TRBV29-1,TRBJ2-2,,189,213,-1,...,TGTACTGGTACCGTCAGCAACCTGGACAGAGCCTGACACTGATCGC...,TRBV29-1,,TRBJ2-2,TRBV29-1*00,,TRBJ2-2*00,94|283|310|0|189|ST97A|931.0,,24|71|71|213|260||235.0
3,357.0,0.009036,TGTGCCAGCAGTGAGGGGAGCGGGAGTGGCACAGATACGCAGTATTTT,CASSEGSGSGTDTQYF,TRBV6-1,TRBJ2-3,TRBD2,190,204,194,...,TGTACTGGTATCGACAAGACCCAGGCATGGGACTGAGGCTGATTTA...,TRBV6-1,TRBD2,TRBJ2-3,TRBV6-1*00,TRBD2*00,TRBJ2-3*00,94|284|307|0|190||950.0,22|30|48|194|202||40.0,21|69|69|204|252|SA61CSC67G|212.0
4,320.0,0.008078,TGTGCCAGCAGTGAAGGGGCCGGGGCCAACGTCCTGACTTTC,CASSEGAGANVLTF,TRBV6-1,TRBJ2-6,TRBD2,192,197,192,...,TGTACTGGTATCGACAAGACCCAGGCATGGGACTGAGGCTGATTTA...,TRBV6-1,TRBD2,TRBJ2-6,TRBV6-1*00,TRBD2*00,TRBJ2-6*00,94|286|307|0|192||960.0,29|34|48|192|197||25.0,24|73|73|197|246||245.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249928,3.0,0.000056,TGCAGTGCTGGGGCCTCAGATACGCAGTATTTT,CSAGASDTQYF,TRBV20-1,TRBJ2-3,"TRBD1, TRBD2",194,201,194,...,ATGTTTACTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGAT...,TRBV20-1,"TRBD2,TRBD1",TRBJ2-3,TRBV20-1*00,"TRBD1*00,TRBD2*00",TRBJ2-3*00,96|288|313|0|194|I102AI102C|936.0,19|24|36|194|199||25.0;29|34|48|195|200||25.0,24|69|69|201|246|SC67G|211.0
249929,3.0,0.000056,TGCGCCAGCTCGAAAACTGTTTTTT,CASSK~KLFF,TRBV10-2,TRBJ1-4,,185,188,-1,...,TGTACTGGTATCGACAAGACCTGGGACATGGGCTGAGGCTGATCTA...,TRBV10-2,,TRBJ1-4,TRBV10-2*00,,TRBJ1-4*00,94|279|307|0|185|ST97A|911.0,,30|71|71|188|229||205.0
249930,3.0,0.000056,TGTGCCAGCAGCTTAGTGGTTGAAGGAGAGACCCAGTACTTC,CASSLVVEGETQYF,TRBV7-9,TRBJ2-5,,195,205,-1,...,TTTATTGGTACCGACAGACCCTGGGGCAGGGCCCAGAGTTTCTGAC...,TRBV7-9,,TRBJ2-5,TRBV7-9*00,,TRBJ2-5*00,94|289|310|0|195||975.0,,24|68|68|205|249|SC66G|206.0
249931,3.0,0.000056,TGCAGTGCCCCCCTGTATCAGGGTCCCGAGCAGTACTTC,CSAPLYQGPEQYF,TRBV20-1,TRBJ2-7,TRBD1,193,211,193,...,CTATGTTTTGGTATCGTCAGTTCCCGAAACAGAGTCTCATGCTGAT...,TRBV20-1,TRBD1,TRBJ2-7,TRBV20-1*00,TRBD1*00,TRBJ2-7*00,94|287|313|0|193||965.0,1|9|36|193|201||40.0,26|59|67|211|244||165.0
