# Create clone IDs for each patient.

Create clone IDs across all specimens (i.e. all timepoints, mixing peak and non-peak samples -- and including all replicates of those timepoints) from a patient.

Clone IDs are not unique across patients

In [1]:
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed

In [2]:
from malid import config, helpers, etl, get_v_sequence, io, logger
from malid.datamodels import GeneLocus, healthy_label
from malid.sample_sequences import sample_sequences
from malid.trained_model_wrappers import ConvergentClusterClassifier

In [3]:
n_jobs = 40

# get specimen filepaths from specimen metadata list

## covid samples

In [4]:
covid_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv",
    sep="\t",
)
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak
0,5f21e814e1adeb2edc12613c,Kim_A,11,False
1,5f21e814e1adeb2edc12613d,Kim_A,17,True
2,5f21e815e1adeb2edc12613e,Kim_A,45,False
3,5f21e815e1adeb2edc12613f,Kim_B,10,False
4,5f21e815e1adeb2edc126140,Kim_B,19,True
...,...,...,...,...
91,6028546736266613226-242ac116-0001-012,Montague_8,32,False
92,5977479575117173226-242ac116-0001-012,Montague_8,37,False
93,5994186997898613226-242ac116-0001-012,Montague_8,37,False
94,6148719921208693226-242ac116-0001-012,Montague_9,5,False


In [5]:
covid_specimens.shape

(96, 4)

In [6]:
participant_df = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.participant_metadata.tsv",
    sep="\t",
)
participant_df

Unnamed: 0,participant_label,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,Kim_A,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
1,Kim_B,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
2,Kim_C,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
3,Kim_D,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
4,Kim_E,PRJNA648677,E,M,48.0,Asian,Case,Covid19,Kim
5,Kim_F,PRJNA648677,F,F,40.0,Asian,Case,Covid19,Kim
6,Kim_G,PRJNA648677,G,F,59.0,Asian,Case,Covid19,Kim
7,Montague_1,PRJNA645245,1,F,62.0,,Mild,Covid19,Montague
8,Montague_2,PRJNA645245,2,F,37.0,,Mild,Covid19,Montague
9,Montague_3,PRJNA645245,3,M,47.0,,Moderate,Covid19,Montague


In [7]:
covid_specimens = pd.merge(
    covid_specimens, participant_df, how="left", validate="m:1", on="participant_label"
)
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,5f21e814e1adeb2edc12613c,Kim_A,11,False,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
1,5f21e814e1adeb2edc12613d,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
2,5f21e815e1adeb2edc12613e,Kim_A,45,False,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
3,5f21e815e1adeb2edc12613f,Kim_B,10,False,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
4,5f21e815e1adeb2edc126140,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
...,...,...,...,...,...,...,...,...,...,...,...,...
91,6028546736266613226-242ac116-0001-012,Montague_8,32,False,PRJNA645245,8,M,37.0,,Moderate,Covid19,Montague
92,5977479575117173226-242ac116-0001-012,Montague_8,37,False,PRJNA645245,8,M,37.0,,Moderate,Covid19,Montague
93,5994186997898613226-242ac116-0001-012,Montague_8,37,False,PRJNA645245,8,M,37.0,,Moderate,Covid19,Montague
94,6148719921208693226-242ac116-0001-012,Montague_9,5,False,PRJNA645245,9,F,52.0,,Moderate,Covid19,Montague


In [8]:
covid_specimens.shape

(96, 12)

In [9]:
covid_specimens["disease_subtype"] = (
    covid_specimens["disease"]
    + " - "
    + covid_specimens["study_name"]
    + covid_specimens["is_peak"].replace({True: "", False: " (non-peak)"})
)
covid_specimens["gene_locus"] = GeneLocus.BCR.name
covid_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus
0,5f21e814e1adeb2edc12613c,Kim_A,11,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
1,5f21e814e1adeb2edc12613d,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
2,5f21e815e1adeb2edc12613e,Kim_A,45,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
3,5f21e815e1adeb2edc12613f,Kim_B,10,False,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR
4,5f21e815e1adeb2edc126140,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,6028546736266613226-242ac116-0001-012,Montague_8,32,False,PRJNA645245,8,M,37.0,,Covid19 - Montague (non-peak),Covid19,Montague,BCR
92,5977479575117173226-242ac116-0001-012,Montague_8,37,False,PRJNA645245,8,M,37.0,,Covid19 - Montague (non-peak),Covid19,Montague,BCR
93,5994186997898613226-242ac116-0001-012,Montague_8,37,False,PRJNA645245,8,M,37.0,,Covid19 - Montague (non-peak),Covid19,Montague,BCR
94,6148719921208693226-242ac116-0001-012,Montague_9,5,False,PRJNA645245,9,F,52.0,,Covid19 - Montague (non-peak),Covid19,Montague,BCR


In [10]:
covid_specimens["fname"] = covid_specimens.apply(
    lambda row: config.paths.external_raw_data
    / "covid_external_as_part_tables"
    / f"exported.part_table.{row['specimen_label']}.tsv",
    axis=1,
)
covid_specimens.head()

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,fname
0,5f21e814e1adeb2edc12613c,Kim_A,11,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
1,5f21e814e1adeb2edc12613d,Kim_A,17,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
2,5f21e815e1adeb2edc12613e,Kim_A,45,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
3,5f21e815e1adeb2edc12613f,Kim_B,10,False,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
4,5f21e815e1adeb2edc126140,Kim_B,19,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...


## healthy specimens

In [11]:
healthy_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_bcr.participant_metadata.tsv",
    sep="\t",
)

# process peak samples only
healthy_specimens = healthy_specimens[healthy_specimens["is_peak"] == True]

healthy_specimens["disease_subtype"] = (
    healthy_specimens["disease"]
    + " - "
    + healthy_specimens["study_name"]
    + healthy_specimens["is_peak"].replace({True: "", False: " (non-peak)"})
)

healthy_specimens["gene_locus"] = GeneLocus.BCR.name

healthy_specimens

Unnamed: 0,participant_label,specimen_label,study_name,disease,is_peak,timepoint,age,sex,ethnicity,ethnicity_condensed,disease_subtype,gene_locus
0,D103,D103_1,Briney,Healthy/Background,True,0,25,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR
1,326797,326797_1,Briney,Healthy/Background,True,0,21,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR
2,327059,327059_1,Briney,Healthy/Background,True,0,26,M,African American / Caucasian,,Healthy/Background - Briney,BCR
3,326650,326650_1,Briney,Healthy/Background,True,0,18,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR
4,326737,326737_1,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR
5,326780,326780_1,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR
6,316188,316188_1,Briney,Healthy/Background,True,0,30,F,African American,African,Healthy/Background - Briney,BCR


In [12]:
healthy_specimens["fname"] = healthy_specimens.apply(
    lambda row: config.paths.external_raw_data
    / "briney_healthy_as_part_tables"
    / f"exported.part_table.{row['specimen_label']}.tsv",
    axis=1,
)
healthy_specimens.head()

Unnamed: 0,participant_label,specimen_label,study_name,disease,is_peak,timepoint,age,sex,ethnicity,ethnicity_condensed,disease_subtype,gene_locus,fname
0,D103,D103_1,Briney,Healthy/Background,True,0,25,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
1,326797,326797_1,Briney,Healthy/Background,True,0,21,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
2,327059,327059_1,Briney,Healthy/Background,True,0,26,M,African American / Caucasian,,Healthy/Background - Briney,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
3,326650,326650_1,Briney,Healthy/Background,True,0,18,F,Caucasian,Caucasian,Healthy/Background - Briney,BCR,/users/maximz/code/boyd-immune-repertoire-clas...
4,326737,326737_1,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian,Healthy/Background - Briney,BCR,/users/maximz/code/boyd-immune-repertoire-clas...


## healthy TCR specimens

In [13]:
tcr_healthy_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_tcr_britanova.participant_metadata.tsv",
    sep="\t",
).assign(
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
)
tcr_healthy_specimens["fname"] = tcr_healthy_specimens["specimen_label"].apply(
    lambda specimen_label: config.paths.external_raw_data
    / "chudakov_aging"
    / f"{specimen_label}.txt.gz"
)

tcr_healthy_specimens

Unnamed: 0,specimen_label,sex,age,ethnicity_condensed,participant_label,disease,study_name,disease_subtype,is_peak,gene_locus,fname
0,A3-i101,F,36,Caucasian,p1,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
1,A3-i102,F,43,Caucasian,p2,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
2,A3-i106,F,43,Caucasian,p3,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
3,A3-i107,F,39,Caucasian,p4,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
4,A3-i110,F,34,Caucasian,p5,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
5,A2-i138,F,74,Caucasian,p15,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
6,A2-i139,M,75,Caucasian,p16,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
7,A2-i140,F,73,Caucasian,p17,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
8,A2-i141,M,71,Caucasian,p18,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
9,A4-i101,M,36,Caucasian,p19,Healthy/Background,Britanova,Healthy/Background - Britanova,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...


## Covid TCR specimens

In [14]:
tcr_covid_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid_tcr_shomuradova.participant_metadata.tsv",
    sep="\t",
).assign(
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
)
tcr_covid_specimens["fname"] = tcr_covid_specimens["specimen_label"].apply(
    lambda specimen_label: config.paths.external_raw_data
    / "shomuradova"
    / f"split.{specimen_label}.tsv"
)

tcr_covid_specimens

Unnamed: 0,specimen_label,participant_label,disease,study_name,disease_subtype,age,sex,ethnicity_condensed,is_peak,gene_locus,fname
0,5f07aa8839579433171763b4,p1437,Covid19,Shomuradova,Covid19 - mild,28,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
1,5f07aa8939579433171763b7,p1445,Covid19,Shomuradova,Covid19 - mild,32,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
2,5f07aa8a39579433171763ba,p1473,Covid19,Shomuradova,Covid19 - mild,31,F,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
3,5f07aa8c39579433171763c0,p1489,Covid19,Shomuradova,Covid19 - mild,27,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
4,6047f702136a6d924982945c,p1434,Covid19,Shomuradova,Covid19 - mild,28,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
5,6047f703136a6d924982945f,p1448,Covid19,Shomuradova,Covid19 - moderate/severe,37,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
6,6047f704136a6d9249829462,p1449,Covid19,Shomuradova,Covid19 - mild,34,F,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
7,6047f704136a6d9249829465,p1465,Covid19,Shomuradova,Covid19 - moderate/severe,19,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
8,6047f706136a6d924982946b,p1480,Covid19,Shomuradova,Covid19 - moderate/severe,29,M,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...
9,6047f707136a6d924982946e,p1481,Covid19,Shomuradova,Covid19 - moderate/severe,30,F,Caucasian,True,TCR,/users/maximz/code/boyd-immune-repertoire-clas...


## Adaptive healthy TCR specimens

In [15]:
adaptive_tcr_healthy_specimens = pd.DataFrame(
    {"fname": (config.paths.external_raw_data / "emerson").glob("*.tsv")}
).assign(
    disease=healthy_label,
    study_name="Emerson",
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
    # Flag that these are a different platform than what we expect
    different_platform=True,
)

# extract specimen label from filename
adaptive_tcr_healthy_specimens = adaptive_tcr_healthy_specimens.assign(
    specimen_label=adaptive_tcr_healthy_specimens["fname"].apply(lambda path: path.stem)
)
assert not adaptive_tcr_healthy_specimens["specimen_label"].duplicated().any()
# participants are 1:1 with specimens
adaptive_tcr_healthy_specimens["participant_label"] = adaptive_tcr_healthy_specimens[
    "specimen_label"
]

# TODO: add sex, age, ethnicity

adaptive_tcr_healthy_specimens["disease_subtype"] = (
    adaptive_tcr_healthy_specimens["disease"]
    + " - "
    + adaptive_tcr_healthy_specimens["study_name"]
    + adaptive_tcr_healthy_specimens["is_peak"].replace(
        {True: "", False: " (non-peak)"}
    )
)

adaptive_tcr_healthy_specimens

Unnamed: 0,fname,disease,study_name,is_peak,gene_locus,different_platform,specimen_label,participant_label,disease_subtype
0,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00311,P00311,Healthy/Background - Emerson
1,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,Keck0119_MC1,Keck0119_MC1,Healthy/Background - Emerson
2,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00104,P00104,Healthy/Background - Emerson
3,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00511,P00511,Healthy/Background - Emerson
4,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,Keck0013_MC1,Keck0013_MC1,Healthy/Background - Emerson
...,...,...,...,...,...,...,...,...,...
781,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00331,P00331,Healthy/Background - Emerson
782,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,Keck0086_MC1,Keck0086_MC1,Healthy/Background - Emerson
783,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00183,P00183,Healthy/Background - Emerson
784,/users/maximz/code/boyd-immune-repertoire-clas...,Healthy/Background,Emerson,True,TCR,True,P00121,P00121,Healthy/Background - Emerson


## Adaptive Covid TCR specimens

In [16]:
adaptive_tcr_covid_specimens = pd.read_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.adaptive_covid_tcr.specimens.tsv",
    sep="\t",
).assign(
    study_name="ImmuneCode",
    is_peak=True,
    gene_locus=GeneLocus.TCR.name,
    # Flag that these are a different platform than what we expect
    different_platform=True,
)
adaptive_tcr_covid_specimens["fname"] = adaptive_tcr_covid_specimens[
    "specimen_label"
].apply(
    lambda specimen_label: config.paths.external_raw_data
    / "immunecode"
    / "reps"
    / "ImmuneCODE-Review-002"
    / f"{specimen_label}.tsv"
)

adaptive_tcr_covid_specimens

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,age,sex,ethnicity_condensed,study_name,is_peak,gene_locus,different_platform,fname
0,ImmuneCode-190921,860011232_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,53,F,Hispanic/Latino,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
1,ImmuneCode-026,INCOV026-AC-3_TCRB,Covid19,COVID-19-ISB,33,M,Hispanic/Latino,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
2,ImmuneCode-321977,860011116_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,52,F,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
3,ImmuneCode-087,INCOV087-BL-3_TCRB,Covid19,COVID-19-ISB,56,M,Hispanic/Latino,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
4,ImmuneCode-0000051,BS-EQ-0014-T2-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,55,M,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
...,...,...,...,...,...,...,...,...,...,...,...,...
88,ImmuneCode-ADIRP0001958,ADIRP0001958_TCRB,Covid19,COVID-19-Adaptive,49,M,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
89,ImmuneCode-775827,860011106_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,57,F,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
90,ImmuneCode-0000446,BS-GIGI_10-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,75,M,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...
91,ImmuneCode-0000160,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized - ICU,72,M,Caucasian,ImmuneCode,True,TCR,True,/users/maximz/code/boyd-immune-repertoire-clas...


## Instructions for adding more

If we have external cohorts that are BCR+TCR, we should have one row per locus per specimen.

Set a `specimen_label_by_locus` column that is the globally-unique specimen label tailored to a particular locus, e.g. `$SPECIMENLABEL-IGH` or `$SPECIMENLABEL-TRB` format.

And set a `specimen_label` column that is equivalent across different-loci rows for that specimen.

The row's `gene_locus` column should be set to the locus of that row (must be the name of a valid `GeneLocus` enum value), and the `fname` column should be set to the path to the file containing the data for that locus.

## merge

In [17]:
all_specimens = pd.concat(
    [
        covid_specimens,
        healthy_specimens,
        tcr_healthy_specimens,
        tcr_covid_specimens,
        adaptive_tcr_healthy_specimens,
        adaptive_tcr_covid_specimens,
    ],
    axis=0,
)

# fillna
all_specimens["different_platform"].fillna(False, inplace=True)

all_specimens

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,fname,ethnicity,different_platform
0,5f21e814e1adeb2edc12613c,Kim_A,11.0,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False
1,5f21e814e1adeb2edc12613d,Kim_A,17.0,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False
2,5f21e815e1adeb2edc12613e,Kim_A,45.0,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False
3,5f21e815e1adeb2edc12613f,Kim_B,10.0,False,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False
4,5f21e815e1adeb2edc126140,Kim_B,19.0,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,ADIRP0001958_TCRB,ImmuneCode-ADIRP0001958,,True,,,M,49.0,Caucasian,COVID-19-Adaptive,Covid19,ImmuneCode,TCR,/users/maximz/code/boyd-immune-repertoire-clas...,,True
89,860011106_TCRB,ImmuneCode-775827,,True,,,F,57.0,Caucasian,COVID-19-HUniv12Oct - Hospitalized,Covid19,ImmuneCode,TCR,/users/maximz/code/boyd-immune-repertoire-clas...,,True
90,BS-GIGI_10-replacement_TCRB,ImmuneCode-0000446,,True,,,M,75.0,Caucasian,COVID-19-NIH/NIAID - Hospitalized,Covid19,ImmuneCode,TCR,/users/maximz/code/boyd-immune-repertoire-clas...,,True
91,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,ImmuneCode-0000160,,True,,,M,72.0,Caucasian,COVID-19-NIH/NIAID - Hospitalized - ICU,Covid19,ImmuneCode,TCR,/users/maximz/code/boyd-immune-repertoire-clas...,,True


In [18]:
# Fillna for cohorts that are single-locus
if "specimen_label_by_locus" not in all_specimens:
    # in case we had no BCR+TCR combined cohorts that set this field already
    all_specimens["specimen_label_by_locus"] = all_specimens["specimen_label"]
else:
    all_specimens["specimen_label_by_locus"].fillna(
        all_specimens["specimen_label"], inplace=True
    )

In [19]:
# make sure input fnames exist
assert all_specimens["fname"].apply(os.path.exists).all()

In [20]:
all_specimens.shape

(1038, 17)

In [21]:
# Set age_group column as well, just as in assemble_etl_metadata
all_specimens["age"].describe()

count    252.000000
mean      50.861111
std       17.042265
min        8.000000
25%       37.000000
50%       52.000000
75%       62.000000
max       89.000000
Name: age, dtype: float64

In [22]:
all_specimens["age_group"] = pd.cut(
    all_specimens["age"],
    bins=[0, 20, 30, 40, 50, 60, 70, 80, 100],
    labels=["<20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80+"],
    right=False,
)
all_specimens["age_group"].value_counts()

50-60    62
60-70    41
30-40    37
40-50    37
20-30    33
70-80    29
80+      10
<20       3
Name: age_group, dtype: int64

In [23]:
all_specimens["age_group"].cat.categories

Index(['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'], dtype='object')

In [24]:
all_specimens["age"].isna().value_counts()

True     786
False    252
Name: age, dtype: int64

In [25]:
all_specimens["age_group"].isna().value_counts()

True     786
False    252
Name: age_group, dtype: int64

In [26]:
for age_group, grp in all_specimens.groupby("age_group"):
    print(age_group, grp["age"].min(), grp["age"].max())

<20 8.0 19.0
20-30 20.0 29.0
30-40 30.0 39.0
40-50 40.0 49.0
50-60 50.0 59.0
60-70 60.0 68.0
70-80 70.0 79.0
80+ 80.0 89.0


In [27]:
# Just as in assemble_etl_metadata:
# Null out "age_group" column for extreme ages with small sample size.

# Note that we are not getting rid of these specimens altogether,
# but marking age_group NaN will disable their use for demographics-controlling models

orig_shapes = all_specimens.shape[0], all_specimens["age_group"].isna().sum()
mask = all_specimens["age_group"].isin(["80+"])
all_specimens.loc[mask, "age_group"] = np.nan
new_shapes = all_specimens.shape[0], all_specimens["age_group"].isna().sum()

# sanity checks:
# - we did not drop any specimens
assert orig_shapes[0] == new_shapes[0]
# - but we did null out some age_group entries
assert orig_shapes[1] < new_shapes[1]
# - we nulled out the right amount
assert new_shapes[1] - orig_shapes[1] == mask.sum()

In [28]:
# export for later processing
all_specimens.drop(["fname"], axis=1).to_csv(
    config.paths.metadata_dir / "generated.external_cohorts.all_specimens.tsv",
    sep="\t",
    index=None,
)

In [29]:
# confirm all specimen labels are unique within each locus (may have one BCR and one TCR line per specimen)
# TODO: in the future, allow for replicates of each specimen
assert not all_specimens["specimen_label_by_locus"].duplicated().any()
for locus, grp in all_specimens.groupby("gene_locus"):
    assert not grp["specimen_label"].duplicated().any()

In [30]:
# Which specimens are in multiple loci?
all_specimens[all_specimens["specimen_label"].duplicated(keep=False)]

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,fname,ethnicity,different_platform,specimen_label_by_locus,age_group


In [31]:
all_specimens["study_name"].value_counts()

Emerson        786
ImmuneCode      93
Montague        80
Britanova       39
Shomuradova     17
Kim             16
Briney           7
Name: study_name, dtype: int64

In [32]:
all_specimens["disease"].value_counts()

Healthy/Background    832
Covid19               206
Name: disease, dtype: int64

In [33]:
all_specimens["gene_locus"].value_counts()

TCR    935
BCR    103
Name: gene_locus, dtype: int64

In [34]:
all_specimens["disease_subtype"].value_counts()

Healthy/Background - Emerson                786
Covid19 - Montague (non-peak)                52
Healthy/Background - Britanova               39
COVID-19-NIH/NIAID - Hospitalized            39
Covid19 - Montague                           28
COVID-19-HUniv12Oct - Hospitalized           26
Covid19 - mild                               10
Covid19 - Kim (non-peak)                      9
COVID-19-ISB                                  9
Covid19 - Kim                                 7
Healthy/Background - Briney                   7
Covid19 - moderate/severe                     7
COVID-19-NIH/NIAID - Hospitalized - ICU       7
COVID-19-HUniv12Oct - Hospitalized - ICU      6
COVID-19-Adaptive                             5
COVID-19-HUniv12Oct                           1
Name: disease_subtype, dtype: int64

In [35]:
all_specimens["different_platform"].value_counts()

True     879
False    159
Name: different_platform, dtype: int64

In [36]:
all_specimens.groupby(["different_platform", "disease_subtype"]).size()

different_platform  disease_subtype                         
False               Covid19 - Kim                                 7
                    Covid19 - Kim (non-peak)                      9
                    Covid19 - Montague                           28
                    Covid19 - Montague (non-peak)                52
                    Covid19 - mild                               10
                    Covid19 - moderate/severe                     7
                    Healthy/Background - Briney                   7
                    Healthy/Background - Britanova               39
True                COVID-19-Adaptive                             5
                    COVID-19-HUniv12Oct                           1
                    COVID-19-HUniv12Oct - Hospitalized           26
                    COVID-19-HUniv12Oct - Hospitalized - ICU      6
                    COVID-19-ISB                                  9
                    COVID-19-NIH/NIAID - Hospitalized  

In [37]:
for demographics_column in ["age", "age_group", "sex", "ethnicity_condensed"]:
    print(demographics_column)
    print(all_specimens[demographics_column].value_counts())
    print(all_specimens[demographics_column].isna().value_counts())
    print()

age
37.0    15
55.0    13
51.0    11
62.0    11
48.0     9
        ..
32.0     1
22.0     1
74.0     1
18.0     1
42.0     1
Name: age, Length: 67, dtype: int64
True     786
False    252
Name: age, dtype: int64

age_group
50-60    62
60-70    41
30-40    37
40-50    37
20-30    33
70-80    29
<20       3
80+       0
Name: age_group, dtype: int64
True     796
False    242
Name: age_group, dtype: int64

sex
M    135
F    117
Name: sex, dtype: int64
True     786
False    252
Name: sex, dtype: int64

ethnicity_condensed
Caucasian          137
Asian               18
Hispanic/Latino     12
African              1
Name: ethnicity_condensed, dtype: int64
True     870
False    168
Name: ethnicity_condensed, dtype: int64



In [38]:
# TODO: Separate all above into a separate "compile all metadata" notebook.

# process specimen, drop duplicates, and cluster each group (-> clones)


In [39]:
def process_specimen(
    fname: Path, gene_locus: GeneLocus, study_name: str, specimen_label: str
):
    # defensive cast
    fname = Path(fname)

    # each specimen is one "repertoire_id"
    df = pd.read_csv(
        fname,
        sep="\t",
        # Solve "DtypeWarning: Columns (9,17,25,33) have mixed types. Specify dtype option on import or set low_memory=False."
        dtype={
            "pre_seq_nt_q": "object",
            "pre_seq_nt_v": "object",
            "pre_seq_nt_d": "object",
            "pre_seq_nt_j": "object",
        },
        # special N/A values for Adaptive data
        na_values=["no data", "unknown", "unresolved"],
    )

    # Rename columns
    if "specimen_label" not in df.columns:
        if "sample_name" in df.columns:
            df.rename(
                columns={
                    "sample_name": "specimen_label",
                },
                inplace=True,
            )
        elif "repertoire_id" in df.columns:
            df.rename(
                columns={
                    "repertoire_id": "specimen_label",
                },
                inplace=True,
            )
        else:
            df = df.assign(specimen_label=specimen_label)

    # confirm only one specimen included here
    if not (df["specimen_label"] == specimen_label).all():
        raise ValueError(
            f"Processing specimen {specimen_label}, but specimen_label column in {fname} does not match this."
        )

    # Recognize sample type
    if study_name == "Britanova" and gene_locus == GeneLocus.TCR:
        # Special case for this study:
        # No raw sequences available to run IgBlast ourselves.

        # Rename columns
        df.rename(
            columns={
                "v": "v_gene",
                "j": "j_gene",
                "cdr3nt": "cdr3_seq_nt_q",
                "cdr3aa": "cdr3_seq_aa_q",
                "count": "num_reads",
            },
            inplace=True,
        )

        # Pulling in CDR1 and CDR2 for TCRB data in etl._compute_columns expects v_segment to be set with allele info.
        # etl._compute_columns guarantees that each V gene has a CDR1 and CDR2 available for its dominant *01 allele.
        df["v_segment"] = df["v_gene"] + "*01"
        df["j_segment"] = df["j_gene"] + "*01"

        # Trim CDR3 AA: remove ends
        # and replace field that's entirely space (or empty) with NaN
        # (maybe we should be trimming cdr3-nt too, but that's only used for clustering within patient to set clone IDs, so it doesn't matter)
        df.dropna(subset=["cdr3_seq_aa_q"], inplace=True)
        df["cdr3_seq_aa_q"] = (
            df["cdr3_seq_aa_q"]
            .str.slice(start=1, stop=-1)
            .replace(r"^\s*$", np.nan, regex=True)
        )
        df.dropna(subset=["cdr3_seq_aa_q"], inplace=True)

        # Mark these all as productive TCRB
        df["productive"] = True
        df["extracted_isotype"] = "TCRB"

    else:
        # We have run sequences through our IgBlast.
        # Note that our legacy IgBlast parser uses the location of our Boydlab primers to parse the CDR3 sequence
        # So if you run shorter sequences like Adaptive seuqences, we won't parse IgBlast's CDR3 calls, but we will still get V/J calls.
        is_adaptive = "bio_identity" in df.columns

        if study_name in ["Briney", "Kim", "Montague"]:
            # Special case: iReceptor / VDJserver / Briney studies that went through legacy Postgres-based annotations,
            # i.e. igblast parses already merged in.

            # TODO: update runbook for these to be consistent with new schema where we merge igblast parses on the fly,
            # then eliminate this special case.

            # Igblast parse exported from Postgres database: cast to bool to be compatible
            df["productive"] = df["productive"].replace({"t": True, "f": False})

            # get v_sequence (same way we produce v_sequence in internal pipeline's sort script)
            (
                df["v_sequence"],
                df["d_sequence"],
                df["j_sequence"],
            ) = get_v_sequence.complete_sequences(df)

        else:
            if is_adaptive:
                # Adaptive, reprocessed through our IgBlast
                # See https://www.adaptivebiotech.com/wp-content/uploads/2019/07/MRK-00342_immunoSEQ_TechNote_DataExport_WEB_REV.pdf

                # Our IgBlast gives some different V gene calls, but generally doesn't provide CDR3 calls for these short sequences.
                # That's because our parser looks for the location of our primers.
                # We'll use the V/J gene and productive calls from our IgBlast, while using Adaptive's CDR3 call.

                if "cdr3_rearrangement" in df.columns:
                    df.rename(
                        columns={"cdr3_rearrangement": "cdr3_seq_nt_q"}, inplace=True
                    )
                elif "rearrangement" in df.columns:
                    # this is the entire detected rearrangement, not just cdr3, but will suffice for finding nucleotide uniques
                    df.rename(columns={"rearrangement": "cdr3_seq_nt_q"}, inplace=True)
                else:
                    raise ValueError(
                        f"No nucleotide rearrangement column found for {specimen_label}"
                    )

                # Define whether productive from Adaptive metadata
                df["productive"] = df["frame_type"] == "In"

                # Count copies using templates or seq_reads field, whichever is available
                if not df["templates"].isna().any():
                    df["num_reads"] = df["templates"]
                elif not df["seq_reads"].isna().any():
                    df["num_reads"] = df["seq_reads"]
                else:
                    raise ValueError(
                        f"Could not choose templates/seq_reads column from {specimen_label}"
                    )

                # Also extract Adaptive's V/J gene calls (-> "original_*" columns) and CDR3 AA call.
                # Follow tcrdist3's import_adaptive_file pattern (https://github.com/kmayerb/tcrdist3/blob/55d56fa621ec19b25a31ee1a3e61ef60e2575837/tcrdist/adpt_funcs.py#L24):
                # Don't just parse v_gene and j_gene; parse bio_identity instead.

                # Per https://tcrdist3.readthedocs.io/en/latest/adaptive.html:
                # "Adaptive’s output files can contain gene-level names within the ‘bioidentity’ field like TCRBV15-X, when there is ambiguity about the gene-level assignment."
                # Format example:
                # {'v_gene': 'unresolved',
                # 'v_gene_ties': 'TCRBV12-03/12-04,TCRBV12-04',
                # 'bio_identity': 'CATSAISSNQPQHF+TCRBV12-X+TCRBJ01-05'}
                df[["cdr3_seq_aa_q", "original_v_segment", "original_j_segment"]] = df[
                    "bio_identity"
                ].str.split("+", expand=True)

                # Trim CDR3: remove ends
                # and replace field that's entirely space (or empty) with NaN
                df.dropna(subset=["cdr3_seq_aa_q"], inplace=True)
                df["cdr3_seq_aa_q"] = (
                    df["cdr3_seq_aa_q"]
                    .str.slice(start=1, stop=-1)
                    .replace(r"^\s*$", np.nan, regex=True)
                )

            # Merge in igblast parses to get better V/J gene calls.
            parse_fnames = list(
                (fname.parent / "igblast_splits").glob(
                    f"split.{specimen_label}.*.parsed.tsv"
                )
            )
            if len(parse_fnames) == 0:
                raise ValueError(
                    f"No igblast parse files found for {specimen_label} from {study_name}"
                )
            df_parse = pd.concat(
                [pd.read_csv(fname, sep="\t") for fname in parse_fnames], axis=0
            )

            # extract fasta ID
            df_parse[["specimen_label", "rownum"]] = df_parse["id"].str.split(
                "|", expand=True
            )
            df_parse["rownum"] = df_parse["rownum"].astype(int)
            assert not df_parse["rownum"].duplicated().any()

            # For now we are assuming df and df_parse both have exactly one specimen
            assert (df_parse["specimen_label"] == specimen_label).all()

            if not is_adaptive:
                # get v_sequence (same way we produce v_sequence in internal pipeline's sort script)
                # this will be used to compute v_mut for BCR
                (
                    df_parse["v_sequence"],
                    df_parse["d_sequence"],
                    df_parse["j_sequence"],
                ) = get_v_sequence.complete_sequences(df_parse)
            else:
                # Adaptive->IgBlast reprocessing does not have the necessary sequence info for us,
                # because our Igblast-output parser fails to extract some sequence regions since Adaptive sequences are shorter.
                df_parse["v_sequence"] = pd.Series(dtype="str")
                df_parse["d_sequence"] = pd.Series(dtype="str")
                df_parse["j_sequence"] = pd.Series(dtype="str")

            orig_shape = df.shape
            df = pd.merge(
                df.rename(
                    # store original metadata as "original_*" columns
                    columns={
                        "productive": "original_productive",
                        "v_gene": "original_v_gene",
                        "j_gene": "original_j_gene",
                        "cdr3_seq_nt_q": "original_cdr3_seq_nt_q",
                        "cdr3_seq_aa_q": "original_cdr3_seq_aa_q",
                        # possible original Adaptive metadata:
                        "v_gene_ties": "original_v_gene_ties",
                        "j_gene_ties": "original_j_gene_ties",
                    }
                ),
                df_parse.set_index("rownum")[
                    [
                        "v_segment",
                        "j_segment",
                        "productive",
                        "v_sequence",
                        "d_sequence",
                        "j_sequence",
                    ]
                    + (
                        # Adaptive IgBlast rerun does not have sequences when interpreted by our parser,
                        # because Adaptive's sequences are shorter than ours
                        [
                            "cdr1_seq_aa_q",
                            "cdr2_seq_aa_q",
                            "cdr3_seq_nt_q",
                            "cdr3_seq_aa_q",
                        ]
                        if not is_adaptive
                        else []
                    )
                ],
                left_index=True,
                right_index=True,
                how="inner",
                validate="1:1",
            )
            assert df.shape[0] == min(orig_shape[0], df_parse.shape[0])

            # For Adaptive data, since CDR3 information was missing in IgBlast parses,
            # set cdr3_seq_nt_q and cdr3_seq_aa_q to the original values.
            if is_adaptive:
                df["cdr3_seq_nt_q"] = df["original_cdr3_seq_nt_q"]
                df["cdr3_seq_aa_q"] = df["original_cdr3_seq_aa_q"]
                # Also set other missing columns for consistency.
                # Not sure if these are true assumptions, but we will subset to TRBV V genes later, so should be ok
                df["locus"] = "TRB"
                df["extracted_isotype"] = "TCRB"

        # drop the external study's sequence_id (keep the int primary key instead)
        df = df.drop(columns="sequence_id", errors="ignore")

        # rename columns for consistency
        df.rename(
            columns={
                "c_call": "extracted_isotype",
                "id": "sequence_id",
            },
            inplace=True,
        )
        if gene_locus == GeneLocus.TCR:
            # sanity check
            if not (df["locus"] == "TRB").all():
                raise ValueError(
                    f"Locus field was not TRB for {specimen_label} ({study_name})"
                )
            # set isotype flag if iReceptor c_call is blank
            df["extracted_isotype"].fillna("TCRB", inplace=True)

        if study_name == "Montague":
            # extracted_isotype is not provided, but we know these are IgG
            if not all(df["extracted_isotype"].isna()):
                raise ValueError("We expect no isotype calls for Montague")
            df["extracted_isotype"].fillna("IGHG", inplace=True)

    # replace extracted_isotype values to have consistent prefix: IgG -> IGHG, IgA -> IGHA, etc.
    df["extracted_isotype"] = df["extracted_isotype"].str.replace(
        "^Ig", "IGH", regex=True
    )

    # filter
    df_orig_shape = df.shape
    df = df.loc[(~pd.isnull(df["extracted_isotype"])) & (df["productive"] == True)]
    if df.shape[0] == 0:
        raise ValueError(f"Filtering failed for {specimen_label} from {study_name}")

    # compute important columns
    # note that this converts v_segment, j_segment (with alleles) to v_gene, j_gene columns (no alleles).
    df = etl._compute_columns(df=df, gene_locus=gene_locus)

    # Replace indistinguishable TRBV gene names with the version that we use in our data.
    df["v_gene"] = df["v_gene"].replace(io.v_gene_renames)

    # since we are going to call clones by nucleotide sequences here rather than in the usual bioinformatics pipeline,
    # let's also preprocess the NT characters.
    # (note for Adaptive data: we had to remove the prefix/suffix from the CDR3 AA sequence.
    # we aren't bothering to do that with the nucleotide sequence, since we are just using that to set clone IDs
    # it's not being passed to the downstream language model, so it doesn't have to be consistent with the rest of our data.)
    df["cdr3_seq_nt_q_trim"] = etl._trim_sequences(df["cdr3_seq_nt_q"])
    df.dropna(
        subset=[
            "cdr3_seq_nt_q_trim",
        ],
        how="any",
        inplace=True,
    )

    # get trimmed lengths
    df["cdr3_nt_sequence_trim_len"] = df["cdr3_seq_nt_q_trim"].str.len()

    # Now that everything has gone through our IgBlast, these filters should be no-ops:
    # Downselect only to V genes that are in our standard dataset
    invalid_v_genes = set(df["v_gene"].unique()) - set(
        helpers.all_observed_v_genes()[gene_locus]
    )
    if len(invalid_v_genes) > 0:
        logger.warning(
            f"Dropping V genes from {specimen_label} ({study_name}) that aren't in our standard data: {invalid_v_genes}"
        )
        df = df.loc[df["v_gene"].isin(helpers.all_observed_v_genes()[gene_locus])]

    # And downselect only to J genes that are in our standard dataset
    invalid_j_genes = set(df["j_gene"].unique()) - set(
        helpers.all_observed_j_genes()[gene_locus]
    )
    if len(invalid_j_genes) > 0:
        logger.warning(
            f"Dropping J genes from {specimen_label} ({study_name}) that aren't in our standard data: {invalid_j_genes}"
        )
        df = df.loc[df["j_gene"].isin(helpers.all_observed_j_genes()[gene_locus])]

    # make each row a single unique VDJ sequence - drop duplicates
    # save number of reads to a column called `num_reads`, to measure clonal expansion later
    dedupe_cols = [
        "specimen_label",
        "extracted_isotype",
        "v_gene",
        "j_gene",
        "cdr1_seq_aa_q_trim",
        "cdr2_seq_aa_q_trim",
        "cdr3_seq_aa_q_trim",
        "cdr3_seq_nt_q_trim",
        "productive",
    ]
    if "num_reads" in df.columns:
        read_counts = (
            df.groupby(dedupe_cols, observed=True)["num_reads"].sum().reset_index()
        )
    else:
        read_counts = (
            df.groupby(dedupe_cols, observed=True).size().reset_index(name="num_reads")
        )
    df.drop_duplicates(
        subset=dedupe_cols,
        keep="first",
        inplace=True,
    )
    # sanity check
    if not all(df.groupby(dedupe_cols, observed=True).size() == 1):
        raise ValueError(f"Deduplicate failed for {specimen_label} from {study_name}")
    expected_shape = df.shape
    # merge in the num_reads column, replacing (dropping) the existing num_reads column if it exists
    df = pd.merge(
        df.drop(columns="num_reads", errors="ignore"),
        read_counts,
        how="left",
        on=dedupe_cols,
        validate="1:1",
    )
    if df.shape[0] != expected_shape[0]:
        raise ValueError(
            f"Merge post dedupe failed for {specimen_label} from {study_name}"
        )

    return df, (fname, df_orig_shape, df.shape, gene_locus)

In [40]:
all_specimens.head()

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name,gene_locus,fname,ethnicity,different_platform,specimen_label_by_locus,age_group
0,5f21e814e1adeb2edc12613c,Kim_A,11.0,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False,5f21e814e1adeb2edc12613c,50-60
1,5f21e814e1adeb2edc12613d,Kim_A,17.0,True,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False,5f21e814e1adeb2edc12613d,50-60
2,5f21e815e1adeb2edc12613e,Kim_A,45.0,False,PRJNA648677,A,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False,5f21e815e1adeb2edc12613e,50-60
3,5f21e815e1adeb2edc12613f,Kim_B,10.0,False,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim (non-peak),Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False,5f21e815e1adeb2edc12613f,50-60
4,5f21e815e1adeb2edc126140,Kim_B,19.0,True,PRJNA648677,B,M,55.0,Asian,Covid19 - Kim,Covid19,Kim,BCR,/users/maximz/code/boyd-immune-repertoire-clas...,,False,5f21e815e1adeb2edc126140,50-60


In [41]:
def run_on_all_specimens_from_a_person(participant_label, specimens):
    dfs = []
    info_per_specimen = []

    # This isn't true yet, but later on we might have BCR and TCR versions of the same specimen.
    # Get combined gene locus flag for all gene loci used in this group of specimens
    gene_loci = [GeneLocus[name] for name in specimens["gene_locus"].unique()]
    # combine flags
    gene_loci = GeneLocus.combine_flags_list_into_single_multiflag_value(gene_loci)
    GeneLocus.validate(gene_loci)

    # Load each specimen on its own. We expect one entry for BCR and another for TCR.
    for _, specimen in specimens.iterrows():
        gene_locus_for_this_specimen = GeneLocus[specimen["gene_locus"]]
        df, annotation = process_specimen(
            fname=specimen["fname"],
            gene_locus=gene_locus_for_this_specimen,
            study_name=specimen["study_name"],
            # note: passing in specimen label by locus. because on disk separated by (and named by) locus.
            specimen_label=specimen["specimen_label_by_locus"],
        )

        # assign metadata
        df = df.assign(
            participant_label=specimen["participant_label"],
            timepoint=specimen["timepoint"],
            is_peak=specimen["is_peak"],
            disease=specimen["disease"],
            disease_subtype=specimen["disease_subtype"],
            # at this point, rename specimen_label from current value (specimen_label_by_locus) to non-locus value (specimen_label)
            specimen_label=specimen["specimen_label"],
        )

        dfs.append((gene_locus_for_this_specimen, df))
        info_per_specimen.append(annotation)

    # cluster all specimens from this patient staple sequences from the specimens together.
    # but run clustering separately on different gene loci, because of different distance criteria.
    dfs_clustered_for_each_gene_locus = [
        ConvergentClusterClassifier._cluster_training_set(
            df=pd.concat([df for (gl, df) in dfs if gl == gene_locus], axis=0),
            sequence_identity_threshold=config.sequence_identity_thresholds.call_nucleotide_clones_with_patient[
                gene_locus
            ],
            validate_same_fold=False,
            higher_order_group_columns=[
                "v_gene",
                "j_gene",
                "cdr3_nt_sequence_trim_len",
            ],
            sequence_column="cdr3_seq_nt_q_trim",
            inplace=True,
        )
        for gene_locus in gene_loci
    ]

    # we can then combine clustering results across gene loci because V genes will be different (and V gene is included in cluster ID).
    df = pd.concat(dfs_clustered_for_each_gene_locus, axis=0)

    # Express global clone ID as string instead
    df.drop("global_resulting_cluster_ID", axis=1, inplace=True)
    df["igh_or_tcrb_clone_id"] = (
        df[
            [
                "specimen_label",
                "v_gene",
                "j_gene",
                "cdr3_nt_sequence_trim_len",
                "cluster_id_within_clustering_group",
            ]
        ]
        .astype(str)
        .apply(tuple, axis=1)
        .apply("|".join)
    )

    # Clustering also created a num_clone_members=1 column
    # Drop that so we can set it properly in sample_sequences
    df.drop("num_clone_members", axis=1, inplace=True)

    # Report total number of clones
    logger.info(
        f"Participant {participant_label} ({gene_loci}) has {df['igh_or_tcrb_clone_id'].nunique()} unique clones from specimens: {info_per_specimen}"
    )

    # Sample clones from each specimen,
    # with filters like dropping low-SHM naive B cells.
    # df may be empty after this
    df = pd.concat(
        [
            sample_sequences(specimen_df, required_gene_loci=gene_loci)
            for specimen_label, specimen_df in df.groupby("specimen_label")
        ],
        axis=0,
    )

    if df.shape[0] != 0:
        # Determine output filename: one file per external cohort participant.
        # Gene loci are combined into one file. They can be separated by the isotype column.
        fname_out = (
            config.paths.external_processed_data / f"{participant_label}.parquet"
        )
        if fname_out.exists():
            logger.warning(f"Path already exists, overwriting: {fname_out}")

        # Report number of sampled sequences
        logger.info(
            f"Participant {participant_label} ({gene_loci}) has {df.shape[0]} sampled sequences -> {fname_out}."
        )

        # Write
        df.to_parquet(fname_out, index=None)
        return fname_out
    else:
        logger.warning(
            f"Participant {participant_label} ({gene_loci}) had no sampled sequences. Skipping."
        )
        return None

In [42]:
# run on all specimens from same patient (combine all timepoints - even if mixed peak/non-peak status)
fnames_output = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
    delayed(run_on_all_specimens_from_a_person)(
        participant_label=participant_label, specimens=specimens
    )
    for participant_label, specimens in all_specimens.groupby(
        ["participant_label"], observed=True
    )
)

  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:45:05,230 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBVA*01': 1}


2022-12-28 17:45:19,263 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2}


2022-12-28 17:45:26,464 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2}


2022-12-28 17:45:27,927 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBVA*01': 2}




2022-12-28 17:45:40,324 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 5}




2022-12-28 17:45:47,890 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*02': 7}






2022-12-28 17:45:55,178 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000160 (GeneLocus.TCR) has 3536 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB.tsv'), (4570, 68), (3646, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:45:55,825 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000160 (GeneLocus.TCR) has 3534 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000160.parquet.




2022-12-28 17:45:56,753 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4}


2022-12-28 17:45:58,747 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 1}




2022-12-28 17:46:06,524 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000430 (GeneLocus.TCR) has 4983 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-44-T1-replacement_TCRB.tsv'), (6454, 68), (5200, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:07,223 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000430 (GeneLocus.TCR) has 4976 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000430.parquet.




2022-12-28 17:46:12,605 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000098 (GeneLocus.TCR) has 9094 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0009-T2-replacement_TCRB.tsv'), (11906, 68), (9492, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:13,432 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000098 (GeneLocus.TCR) has 9087 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000098.parquet.


2022-12-28 17:46:13,389 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000141 (GeneLocus.TCR) has 10331 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0022-T0-replacement_TCRB.tsv'), (13385, 68), (10766, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:16,104 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBVA*01': 1}


2022-12-28 17:46:18,391 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000141 (GeneLocus.TCR) has 10325 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000141.parquet.






2022-12-28 17:46:22,443 - assign_clone_ids.ipynb - INFO - Participant 316188 (GeneLocus.BCR) has 43692 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.316188_1.tsv'), (167775, 45), (80096, 56), <GeneLocus.BCR: 1>)]


2022-12-28 17:46:22,789 - malid.sample_sequences - INFO - Removing 316188 specimen 316188_1 because it did not have enough clones. Clone count by isotype: {'IGHG': 15612, 'IGHA': 33, 'IGHD-M': 31118}




2022-12-28 17:46:23,221 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000143 (GeneLocus.TCR) has 11832 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_44-replacement_TCRB.tsv'), (15648, 68), (12563, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:19,106 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 7}


2022-12-28 17:46:24,279 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000143 (GeneLocus.TCR) has 11816 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000143.parquet.


2022-12-28 17:46:32,155 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000083 (GeneLocus.TCR) has 14220 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0002-T1-replacement_TCRB.tsv'), (20136, 68), (15490, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:34,015 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000083 (GeneLocus.TCR) has 14200 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000083.parquet.




2022-12-28 17:46:33,113 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 2}


2022-12-28 17:46:38,225 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBVA*01': 1}


2022-12-28 17:46:42,703 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000102 (GeneLocus.TCR) has 19094 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-10-T1-replacement_TCRB.tsv'), (25284, 68), (20460, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:46:42,842 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000078 (GeneLocus.TCR) has 20128 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GN-0010-T0-replacement_TCRB.tsv'), (26661, 68), (21338, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:46:44,515 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000078 (GeneLocus.TCR) has 20106 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000078.parquet.


2022-12-28 17:46:48,296 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000102 (GeneLocus.TCR) has 19070 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000102.parquet.


2022-12-28 17:46:47,497 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 5}


2022-12-28 17:46:53,441 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000145 (GeneLocus.TCR) has 23447 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_42-replacement_TCRB.tsv'), (32936, 68), (25590, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:46:55,742 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000145 (GeneLocus.TCR) has 23416 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000145.parquet.




2022-12-28 17:47:00,908 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000108 (GeneLocus.TCR) has 27015 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-31-T0-replacement_TCRB.tsv'), (37676, 68), (30036, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:03,733 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000108 (GeneLocus.TCR) has 26992 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000108.parquet.


2022-12-28 17:47:06,875 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 1}




2022-12-28 17:47:12,179 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000429 (GeneLocus.TCR) has 31755 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-43-T0-replacement_TCRB.tsv'), (44446, 68), (34973, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:13,870 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4}






2022-12-28 17:47:14,999 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000458 (GeneLocus.TCR) has 9282 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_23-replacement_TCRB.tsv'), (12233, 68), (9620, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:15,863 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000429 (GeneLocus.TCR) has 31717 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000429.parquet.


2022-12-28 17:47:15,951 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000458 (GeneLocus.TCR) has 9273 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000458.parquet.


2022-12-28 17:47:16,947 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000101 (GeneLocus.TCR) has 31810 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0011-T1-replacement_TCRB.tsv'), (46767, 68), (35119, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:23,196 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000101 (GeneLocus.TCR) has 31780 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000101.parquet.




2022-12-28 17:47:20,431 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 24, 'TRBVA*01': 2}


2022-12-28 17:47:21,752 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 7}


2022-12-28 17:47:25,186 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 9}




2022-12-28 17:47:26,091 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000092 (GeneLocus.TCR) has 35683 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GN-0002-T0-replacement_TCRB.tsv'), (52781, 68), (39345, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:24,120 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 17:47:27,139 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000456 (GeneLocus.TCR) has 16567 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_21-replacement_TCRB.tsv'), (21981, 68), (17403, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:26,305 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 1}


2022-12-28 17:47:28,458 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000456 (GeneLocus.TCR) has 16548 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000456.parquet.


2022-12-28 17:47:31,055 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000092 (GeneLocus.TCR) has 35621 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000092.parquet.


  df = pd.read_csv(



2022-12-28 17:47:35,617 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000091 (GeneLocus.TCR) has 41806 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GN-01-T0-replacement_TCRB.tsv'), (60677, 68), (47075, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:47:32,818 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 4, 'TRBV8-1*01': 1, 'TRBVA*01': 1}








2022-12-28 17:47:39,305 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000091 (GeneLocus.TCR) has 41756 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000091.parquet.




2022-12-28 17:47:49,171 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 5}








2022-12-28 17:47:52,509 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000463 (GeneLocus.TCR) has 21964 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_28-replacement_TCRB.tsv'), (30665, 68), (23690, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:53,168 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 6, 'TRBVA*01': 2}


2022-12-28 17:47:53,496 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000134 (GeneLocus.TCR) has 48211 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0017-T0-replacement_TCRB.tsv'), (72059, 68), (56603, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:53,897 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000142 (GeneLocus.TCR) has 49482 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-23-T1-replacement_TCRB.tsv'), (70988, 68), (56668, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:55,066 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000463 (GeneLocus.TCR) has 21934 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000463.parquet.




2022-12-28 17:47:53,900 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 5}


2022-12-28 17:47:57,631 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000400 (GeneLocus.TCR) has 50304 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_41-replacement_TCRB.tsv'), (73516, 68), (59442, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:47:57,829 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000134 (GeneLocus.TCR) has 48152 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000134.parquet.


2022-12-28 17:47:58,231 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000142 (GeneLocus.TCR) has 49409 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000142.parquet.


2022-12-28 17:47:58,668 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 4}


2022-12-28 17:47:56,297 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 16, 'TRBV8-1*01': 1, 'TRBVA*01': 1}




2022-12-28 17:48:00,741 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000074 (GeneLocus.TCR) has 53441 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0026-T0-replacement_TCRB.tsv'), (75322, 68), (62383, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:48:04,955 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000446 (GeneLocus.TCR) has 41934 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_10-replacement_TCRB.tsv'), (59001, 68), (47269, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:04,978 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000400 (GeneLocus.TCR) has 50228 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000400.parquet.


  df = pd.read_csv(



  df = pd.read_csv(





2022-12-28 17:48:06,502 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000074 (GeneLocus.TCR) has 53384 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000074.parquet.






2022-12-28 17:48:09,173 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000391 (GeneLocus.TCR) has 53252 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-29-T1-replacement_TCRB.tsv'), (81179, 68), (62717, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:09,942 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000446 (GeneLocus.TCR) has 41875 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000446.parquet.


2022-12-28 17:48:11,425 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 19, 'TRBVA*01': 3}


2022-12-28 17:48:12,642 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9}




  df = pd.read_csv(



2022-12-28 17:48:14,937 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000391 (GeneLocus.TCR) has 53191 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000391.parquet.


  df = pd.read_csv(



2022-12-28 17:48:18,359 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000069 (GeneLocus.TCR) has 67047 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GN-0005-T0-replacement_TCRB.tsv'), (99468, 68), (81456, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:19,003 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBVA*01': 1}


2022-12-28 17:48:20,182 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000395 (GeneLocus.TCR) has 64643 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-34-T0-replacement_TCRB.tsv'), (96887, 68), (77547, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:18,131 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 10, 'TRBVA*01': 2}


2022-12-28 17:48:21,700 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000066 (GeneLocus.TCR) has 65879 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_55-replacement_TCRB.tsv'), (100007, 68), (77393, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:20,926 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 3}


2022-12-28 17:48:23,281 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000495 (GeneLocus.TCR) has 29817 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_96-replacement_TCRB.tsv'), (39996, 68), (32494, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:24,309 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000069 (GeneLocus.TCR) has 66942 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000069.parquet.




  df = pd.read_csv(



2022-12-28 17:48:26,723 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000495 (GeneLocus.TCR) has 29777 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000495.parquet.


2022-12-28 17:48:26,723 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000395 (GeneLocus.TCR) has 64526 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000395.parquet.


2022-12-28 17:48:28,907 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 20, 'TRBVA*01': 1}


2022-12-28 17:48:29,061 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000066 (GeneLocus.TCR) has 65765 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000066.parquet.










2022-12-28 17:48:33,192 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 3}


2022-12-28 17:48:33,075 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 21, 'TRBVA*01': 3}


2022-12-28 17:48:34,715 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 3}


2022-12-28 17:48:35,376 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000479 (GeneLocus.TCR) has 43074 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_80-replacement_TCRB.tsv'), (64475, 68), (49848, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:35,132 - assign_clone_ids.ipynb - INFO - Participant D103 (GeneLocus.BCR) has 315836 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.D103_1.tsv'), (681577, 45), (410216, 56), <GeneLocus.BCR: 1>)]


2022-12-28 17:48:37,709 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000090 (GeneLocus.TCR) has 73052 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-16-T2-replacement_TCRB.tsv'), (116715, 68), (87301, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:48:36,635 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 27, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 17:48:39,562 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000479 (GeneLocus.TCR) has 43003 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000479.parquet.


2022-12-28 17:48:40,445 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000135 (GeneLocus.TCR) has 82643 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-18-T2-replacement_TCRB.tsv'), (119976, 68), (100752, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:40,825 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000480 (GeneLocus.TCR) has 46020 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_81-replacement_TCRB.tsv'), (67673, 68), (52262, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:42,172 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 10, 'TRBVA*01': 2}


2022-12-28 17:48:42,754 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 17:48:42,977 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000042 (GeneLocus.TCR) has 81351 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-15-T1-replacement_TCRB.tsv'), (127496, 68), (102251, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:43,971 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000136 (GeneLocus.TCR) has 82328 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-19-T0-replacement_TCRB.tsv'), (127967, 68), (101361, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:44,818 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 32, 'TRBVA*01': 4}


  df = pd.read_csv(



2022-12-28 17:48:45,080 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 3}


2022-12-28 17:48:45,390 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000090 (GeneLocus.TCR) has 72934 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000090.parquet.






2022-12-28 17:48:46,259 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000441 (GeneLocus.TCR) has 69903 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_05-replacement_TCRB.tsv'), (109591, 68), (84234, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:46,146 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 25, 'TRBV8-1*01': 1, 'TRBVA*01': 3}




2022-12-28 17:48:46,642 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000480 (GeneLocus.TCR) has 45954 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000480.parquet.


2022-12-28 17:48:47,277 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 20, 'TRBVA*01': 4}


2022-12-28 17:48:49,725 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 6, 'TRBV7-5*02': 8, 'TRBVA*01': 3}


2022-12-28 17:48:49,735 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBVA*01': 2}


2022-12-28 17:48:50,128 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000135 (GeneLocus.TCR) has 82546 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000135.parquet.


2022-12-28 17:48:50,368 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000042 (GeneLocus.TCR) has 81254 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000042.parquet.


2022-12-28 17:48:50,411 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBVA*01': 2}




  df = pd.read_csv(



2022-12-28 17:48:51,972 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000136 (GeneLocus.TCR) has 82206 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000136.parquet.






2022-12-28 17:48:53,084 - assign_clone_ids.ipynb - INFO - Participant D103 (GeneLocus.BCR) has 139227 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/D103.parquet.


2022-12-28 17:48:53,718 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000470 (GeneLocus.TCR) has 63585 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_35-replacement_TCRB.tsv'), (95214, 68), (74212, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:48:53,797 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000469 (GeneLocus.TCR) has 64916 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_34-replacement_TCRB.tsv'), (102061, 68), (78448, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:48:54,756 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000423 (GeneLocus.TCR) has 87019 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-38-T2-replacement_TCRB.tsv'), (140272, 68), (106533, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:48:55,173 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000441 (GeneLocus.TCR) has 69817 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000441.parquet.


2022-12-28 17:48:56,877 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-147304 (GeneLocus.TCR) has 40478 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011110_TCRB.tsv'), (56476, 68), (45069, 80), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 17:48:58,147 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 15, 'TRBVA*01': 1}


2022-12-28 17:48:59,317 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 23, 'TRBV7-5*02': 1, 'TRBVA*01': 2}




2022-12-28 17:49:00,007 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 2}


2022-12-28 17:48:59,690 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000470 (GeneLocus.TCR) has 63505 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000470.parquet.


2022-12-28 17:49:01,043 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000392 (GeneLocus.TCR) has 96838 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-33-T2-replacement_TCRB.tsv'), (151335, 68), (120909, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:01,363 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 17:49:01,808 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000449 (GeneLocus.TCR) has 81688 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_14-replacement_TCRB.tsv'), (128409, 68), (102600, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:02,274 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000469 (GeneLocus.TCR) has 64831 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000469.parquet.


2022-12-28 17:49:02,274 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-147304 (GeneLocus.TCR) has 40423 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-147304.parquet.


2022-12-28 17:49:02,502 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 6, 'TRBVA*01': 1}


2022-12-28 17:49:02,513 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000490 (GeneLocus.TCR) has 71800 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_91-replacement_TCRB.tsv'), (104478, 68), (90135, 80), <GeneLocus.TCR: 2>)]








2022-12-28 17:49:05,257 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000423 (GeneLocus.TCR) has 86881 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000423.parquet.


2022-12-28 17:49:02,940 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 27, 'TRBVA*01': 4}




  df = pd.read_csv(







2022-12-28 17:49:06,277 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000462 (GeneLocus.TCR) has 90641 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GIGI_27-replacement_TCRB.tsv'), (138744, 68), (110995, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:08,798 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000115 (GeneLocus.TCR) has 104749 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-32-T1-replacement_TCRB.tsv'), (169520, 68), (132838, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:09,160 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000449 (GeneLocus.TCR) has 81575 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000449.parquet.


2022-12-28 17:49:09,161 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000490 (GeneLocus.TCR) has 71742 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000490.parquet.


2022-12-28 17:49:09,315 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-294925 (GeneLocus.TCR) has 43911 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011489_TCRB.tsv'), (63842, 68), (50733, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:10,584 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000392 (GeneLocus.TCR) has 96714 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000392.parquet.




2022-12-28 17:49:12,898 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 11, 'TRBVA*01': 1}


2022-12-28 17:49:13,826 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000093 (GeneLocus.TCR) has 108242 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-GN-0003-T0-replacement_TCRB.tsv'), (173992, 68), (139403, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:14,068 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-294925 (GeneLocus.TCR) has 43864 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-294925.parquet.




2022-12-28 17:49:15,491 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-036 (GeneLocus.TCR) has 82393 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV036-BL-3_TCRB.tsv'), (128304, 68), (100979, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:16,265 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000144 (GeneLocus.TCR) has 108300 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0028-T0-replacement_TCRB.tsv'), (178491, 68), (136586, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:16,329 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-087 (GeneLocus.TCR) has 82361 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV087-BL-3_TCRB.tsv'), (128952, 68), (102190, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:16,796 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000462 (GeneLocus.TCR) has 90499 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000462.parquet.




2022-12-28 17:49:19,191 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-310581 (GeneLocus.TCR) has 39797 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011124_TCRB.tsv'), (55320, 68), (45100, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:19,233 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000115 (GeneLocus.TCR) has 104611 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000115.parquet.


2022-12-28 17:49:19,241 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-006 (GeneLocus.TCR) has 100093 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV006-BL-5_TCRB.tsv'), (157880, 68), (127725, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:49:21,096 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 4}


2022-12-28 17:49:21,821 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 40, 'TRBVA*01': 2}


2022-12-28 17:49:23,176 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-310581 (GeneLocus.TCR) has 39753 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-310581.parquet.


2022-12-28 17:49:23,465 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-036 (GeneLocus.TCR) has 82294 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-036.parquet.


2022-12-28 17:49:23,571 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBV8-1*01': 1, 'TRBVA*01': 2}


2022-12-28 17:49:24,395 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 9, 'TRBV7-5*02': 36, 'TRBVA*01': 3}


2022-12-28 17:49:25,687 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 19, 'TRBVA*01': 4}


2022-12-28 17:49:26,181 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-087 (GeneLocus.TCR) has 82238 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-087.parquet.


2022-12-28 17:49:26,188 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000093 (GeneLocus.TCR) has 108099 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000093.parquet.






2022-12-28 17:49:28,880 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBVA*01': 2}








2022-12-28 17:49:30,222 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-006 (GeneLocus.TCR) has 99919 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-006.parquet.


2022-12-28 17:49:30,222 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000144 (GeneLocus.TCR) has 108135 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000144.parquet.






  df = pd.read_csv(



2022-12-28 17:49:31,656 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-566820 (GeneLocus.TCR) has 19696 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011277_TCRB.tsv'), (26469, 68), (21170, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:49:33,690 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-566820 (GeneLocus.TCR) has 19669 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-566820.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:49:35,612 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-304752 (GeneLocus.TCR) has 58563 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011117_TCRB.tsv'), (86604, 68), (71091, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:37,629 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-121157 (GeneLocus.TCR) has 95681 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011310_TCRB.tsv'), (143286, 68), (115235, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:37,885 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000051 (GeneLocus.TCR) has 139223 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-0014-T2-replacement_TCRB.tsv'), (233995, 68), (188823, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:49:40,083 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 25, 'TRBVA*01': 4}


2022-12-28 17:49:40,886 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-304752 (GeneLocus.TCR) has 58486 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-304752.parquet.


2022-12-28 17:49:41,169 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-286053 (GeneLocus.TCR) has 92002 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011131_TCRB.tsv'), (146324, 68), (117176, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:44,109 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-051 (GeneLocus.TCR) has 108801 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV051-AC-3_TCRB.tsv'), (177071, 68), (143382, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:44,731 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000103 (GeneLocus.TCR) has 147321 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/BS-EQ-30-T1-replacement_TCRB.tsv'), (258010, 68), (202602, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:49:46,356 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-121157 (GeneLocus.TCR) has 95543 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-121157.parquet.


2022-12-28 17:49:46,945 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 2}


2022-12-28 17:49:47,189 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-073 (GeneLocus.TCR) has 110186 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV073-BL-3_TCRB.tsv'), (188172, 68), (148576, 80), <GeneLocus.TCR: 2>)]








2022-12-28 17:49:49,180 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-745617 (GeneLocus.TCR) has 11120 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011133_TCRB.tsv'), (14691, 68), (11668, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:49:49,904 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-286053 (GeneLocus.TCR) has 91889 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-286053.parquet.


2022-12-28 17:49:50,508 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-745617 (GeneLocus.TCR) has 11107 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-745617.parquet.


2022-12-28 17:49:50,923 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 26, 'TRBVA*01': 6}


2022-12-28 17:49:51,285 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000051 (GeneLocus.TCR) has 139027 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000051.parquet.


2022-12-28 17:49:52,310 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 16}


2022-12-28 17:49:52,754 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBVA*01': 7}


  df = pd.read_csv(



2022-12-28 17:49:53,400 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 25, 'TRBVA*01': 2}




2022-12-28 17:49:54,805 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-051 (GeneLocus.TCR) has 108622 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-051.parquet.




2022-12-28 17:49:55,635 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 37, 'TRBVA*01': 7}




2022-12-28 17:49:57,491 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 4, 'TRBVA*01': 1}


2022-12-28 17:49:57,679 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-712578 (GeneLocus.TCR) has 35388 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011123_TCRB.tsv'), (50375, 68), (39589, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:02,050 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-712578 (GeneLocus.TCR) has 35335 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-712578.parquet.


2022-12-28 17:50:02,050 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-073 (GeneLocus.TCR) has 109999 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-073.parquet.




2022-12-28 17:50:02,050 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-0000103 (GeneLocus.TCR) has 147058 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-0000103.parquet.




2022-12-28 17:50:03,070 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-321977 (GeneLocus.TCR) has 74747 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011116_TCRB.tsv'), (112920, 68), (90216, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:50:05,582 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-351635 (GeneLocus.TCR) has 75072 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011112_TCRB.tsv'), (113636, 68), (90716, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:06,201 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-796689 (GeneLocus.TCR) has 40240 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011120_TCRB.tsv'), (56967, 68), (47572, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:50:08,447 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-034 (GeneLocus.TCR) has 135897 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV034-AC-3_TCRB.tsv'), (231292, 68), (182168, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:50:09,967 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-321977 (GeneLocus.TCR) has 74664 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-321977.parquet.


2022-12-28 17:50:10,115 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-796689 (GeneLocus.TCR) has 40195 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-796689.parquet.


2022-12-28 17:50:13,263 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-351635 (GeneLocus.TCR) has 74975 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-351635.parquet.


2022-12-28 17:50:14,574 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6}




  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:50:20,135 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-854938 (GeneLocus.TCR) has 39710 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011129_TCRB.tsv'), (56097, 68), (44180, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:20,702 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-034 (GeneLocus.TCR) has 135692 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-034.parquet.


2022-12-28 17:50:21,494 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 9, 'TRBVA*01': 1}


2022-12-28 17:50:24,452 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-854938 (GeneLocus.TCR) has 39661 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-854938.parquet.


2022-12-28 17:50:25,665 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-100828 (GeneLocus.TCR) has 134952 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011496_TCRB.tsv'), (231362, 68), (177904, 80), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 17:50:29,029 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-002 (GeneLocus.TCR) has 163013 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV002-AC-5_TCRB.tsv'), (278291, 68), (226935, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:29,808 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 24, 'TRBVA*01': 5}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:50:32,383 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 6}




2022-12-28 17:50:37,550 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-659607 (GeneLocus.TCR) has 90686 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011325_TCRB.tsv'), (139533, 68), (109786, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:37,870 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-100828 (GeneLocus.TCR) has 134707 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-100828.parquet.


  df = pd.read_csv(







  df = pd.read_csv(



2022-12-28 17:50:45,780 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-659607 (GeneLocus.TCR) has 90578 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-659607.parquet.


2022-12-28 17:50:46,918 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-002 (GeneLocus.TCR) has 162839 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-002.parquet.


  df = pd.read_csv(



2022-12-28 17:50:52,455 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 54, 'TRBVA*01': 4}


2022-12-28 17:50:52,564 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-354617 (GeneLocus.TCR) has 119884 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011227_TCRB.tsv'), (192777, 68), (156771, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:53,179 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV7-5*02': 31, 'TRBVA*01': 2}


2022-12-28 17:50:53,570 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 13, 'TRBVA*01': 2}


2022-12-28 17:50:53,636 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-506345 (GeneLocus.TCR) has 104943 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011240_TCRB.tsv'), (172064, 68), (137863, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:50:56,785 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBV7-5*02': 31, 'TRBVA*01': 3}


2022-12-28 17:50:56,797 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 4}


2022-12-28 17:50:56,799 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 31, 'TRBV7-5*02': 40, 'TRBVA*01': 9}






2022-12-28 17:50:58,081 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 103, 'TRBVA*01': 11}


  df = pd.read_csv(



2022-12-28 17:50:59,115 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 34, 'TRBVA*01': 7}


  df = pd.read_csv(



2022-12-28 17:51:01,262 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 41, 'TRBV7-5*02': 1, 'TRBVA*01': 7}








2022-12-28 17:51:03,743 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-354617 (GeneLocus.TCR) has 119721 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-354617.parquet.


2022-12-28 17:51:04,379 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-506345 (GeneLocus.TCR) has 104798 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-506345.parquet.










2022-12-28 17:51:07,763 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0002464 (GeneLocus.TCR) has 76602 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/ADIRP0002464_TCRB.tsv'), (114326, 68), (93573, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:51:08,948 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 26, 'TRBVA*01': 1}


2022-12-28 17:51:11,400 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 18, 'TRBVA*01': 3}


2022-12-28 17:51:11,419 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 76, 'TRBVA*01': 2}


2022-12-28 17:51:11,419 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBV7-5*02': 44, 'TRBV8-2*01': 2, 'TRBVA*01': 9}






2022-12-28 17:51:12,978 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 45, 'TRBV8-1*01': 1, 'TRBVA*01': 7}


  df = pd.read_csv(



2022-12-28 17:51:13,083 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 30, 'TRBVA*01': 2}










  df = pd.read_csv(







2022-12-28 17:51:16,381 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-869115 (GeneLocus.TCR) has 101268 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011132_TCRB.tsv'), (163044, 68), (129773, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:51:18,311 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0002464 (GeneLocus.TCR) has 76481 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-ADIRP0002464.parquet.










2022-12-28 17:51:24,567 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 91, 'TRBVA*01': 9}








2022-12-28 17:51:26,840 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 48, 'TRBVA*01': 5}




2022-12-28 17:51:27,204 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-869115 (GeneLocus.TCR) has 101136 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-869115.parquet.






2022-12-28 17:51:29,714 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 40, 'TRBV7-5*02': 1, 'TRBV8-1*01': 2, 'TRBVA*01': 6}


2022-12-28 17:51:30,815 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 33, 'TRBV7-5*02': 51, 'TRBVA*01': 2}


2022-12-28 17:51:33,146 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 20, 'TRBV8-1*01': 2, 'TRBV8-2*01': 1, 'TRBVA*01': 6}


2022-12-28 17:51:33,364 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 5, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 17:51:34,816 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-351469 (GeneLocus.TCR) has 149097 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011499_TCRB.tsv'), (270799, 68), (209584, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:51:35,890 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBVA*01': 1}


2022-12-28 17:51:37,103 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 53, 'TRBVA*01': 8}


2022-12-28 17:51:37,268 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 34, 'TRBV7-5*02': 59, 'TRBVA*01': 12}


2022-12-28 17:51:37,534 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 23, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 17:51:37,787 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 10, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 3}


2022-12-28 17:51:38,098 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 1}


2022-12-28 17:51:38,422 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 135, 'TRBV8-2*01': 1, 'TRBVA*01': 5}


2022-12-28 17:51:38,955 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 11, 'TRBV8-2*01': 2, 'TRBVA*01': 1}


2022-12-28 17:51:39,646 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 61, 'TRBVA*01': 2}


2022-12-28 17:51:40,107 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 19, 'TRBV8-1*01': 1, 'TRBVA*01': 2}








2022-12-28 17:51:41,156 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-491531 (GeneLocus.TCR) has 152608 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011242_TCRB.tsv'), (280527, 68), (225059, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:51:42,656 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-376904 (GeneLocus.TCR) has 153602 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011115_TCRB.tsv'), (272263, 68), (207837, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:51:43,577 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-210401 (GeneLocus.TCR) has 181893 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011209_TCRB.tsv'), (348059, 68), (279590, 80), <GeneLocus.TCR: 2>)]






2022-12-28 17:51:46,413 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0001958 (GeneLocus.TCR) has 151366 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/ADIRP0001958_TCRB.tsv'), (248916, 68), (204903, 80), <GeneLocus.TCR: 2>)]




























2022-12-28 17:51:53,956 - assign_clone_ids.ipynb - INFO - Participant Keck0007_MC1 (GeneLocus.TCR) has 93586 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0007_MC1.tsv'), (118239, 132), (95090, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:51:56,031 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-491531 (GeneLocus.TCR) has 152398 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-491531.parquet.




2022-12-28 17:51:57,863 - assign_clone_ids.ipynb - INFO - Participant Keck0004_MC1 (GeneLocus.TCR) has 121977 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0004_MC1.tsv'), (152891, 132), (124052, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:51:58,293 - assign_clone_ids.ipynb - INFO - Participant Keck0002_MC1 (GeneLocus.TCR) has 89311 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0002_MC1.tsv'), (112887, 132), (90837, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:00,516 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-351469 (GeneLocus.TCR) has 148866 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-351469.parquet.


2022-12-28 17:52:01,188 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0001958 (GeneLocus.TCR) has 151136 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-ADIRP0001958.parquet.


2022-12-28 17:52:02,167 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-210401 (GeneLocus.TCR) has 181542 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-210401.parquet.






  df = pd.read_csv(



2022-12-28 17:52:04,301 - assign_clone_ids.ipynb - INFO - Participant Keck0007_MC1 (GeneLocus.TCR) has 93435 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0007_MC1.parquet.


2022-12-28 17:52:04,679 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-376904 (GeneLocus.TCR) has 153344 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-376904.parquet.


2022-12-28 17:52:05,182 - assign_clone_ids.ipynb - INFO - Participant Keck0001_MC1 (GeneLocus.TCR) has 102155 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0001_MC1.tsv'), (131027, 132), (103460, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:05,303 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-791336 (GeneLocus.TCR) has 157869 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011497_TCRB.tsv'), (264662, 68), (216566, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:07,295 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-773780 (GeneLocus.TCR) has 187168 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011494_TCRB.tsv'), (319729, 68), (260971, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:52:09,820 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-085 (GeneLocus.TCR) has 217730 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV085-BL-3_TCRB.tsv'), (408879, 68), (326448, 80), <GeneLocus.TCR: 2>)]




2022-12-28 17:52:11,427 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-190921 (GeneLocus.TCR) has 186142 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011232_TCRB.tsv'), (343198, 68), (272070, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:52:13,118 - assign_clone_ids.ipynb - INFO - Participant Keck0004_MC1 (GeneLocus.TCR) has 121827 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0004_MC1.parquet.


2022-12-28 17:52:13,177 - assign_clone_ids.ipynb - INFO - Participant Keck0003_MC1 (GeneLocus.TCR) has 189501 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0003_MC1.tsv'), (233497, 132), (193816, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:52:15,092 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0000093 (GeneLocus.TCR) has 155275 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/ADIR0000093_TCRB.tsv'), (269820, 68), (216349, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:52:16,185 - assign_clone_ids.ipynb - INFO - Participant Keck0006_MC1 (GeneLocus.TCR) has 136070 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0006_MC1.tsv'), (165913, 132), (138508, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:17,390 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-339526 (GeneLocus.TCR) has 199324 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011388_TCRB.tsv'), (365826, 68), (283575, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:17,465 - assign_clone_ids.ipynb - INFO - Participant Keck0002_MC1 (GeneLocus.TCR) has 89180 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0002_MC1.parquet.






2022-12-28 17:52:18,163 - assign_clone_ids.ipynb - INFO - Participant Keck0008_MC1 (GeneLocus.TCR) has 135881 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0008_MC1.tsv'), (173707, 132), (138519, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:18,989 - assign_clone_ids.ipynb - INFO - Participant Keck0005_MC1 (GeneLocus.TCR) has 138985 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0005_MC1.tsv'), (173852, 132), (141263, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:22,075 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 9, 'TRBV7-5*02': 11, 'TRBVA*01': 4}


2022-12-28 17:52:22,999 - assign_clone_ids.ipynb - INFO - Participant Keck0001_MC1 (GeneLocus.TCR) has 101974 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0001_MC1.parquet.


2022-12-28 17:52:23,456 - assign_clone_ids.ipynb - INFO - Participant 326737 (GeneLocus.BCR) has 311125 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.326737_1.tsv'), (1497206, 45), (608990, 56), <GeneLocus.BCR: 1>)]


2022-12-28 17:52:25,066 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-462527 (GeneLocus.TCR) has 194989 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011354_TCRB.tsv'), (341283, 68), (272730, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:52:27,566 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-791336 (GeneLocus.TCR) has 157623 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-791336.parquet.






  df = pd.read_csv(



2022-12-28 17:52:30,637 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0000093 (GeneLocus.TCR) has 155075 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-ADIRP0000093.parquet.


  df = pd.read_csv(



2022-12-28 17:52:32,656 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-773780 (GeneLocus.TCR) has 186843 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-773780.parquet.


2022-12-28 17:52:34,201 - assign_clone_ids.ipynb - INFO - Participant Keck0003_MC1 (GeneLocus.TCR) has 189241 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0003_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:52:35,969 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-085 (GeneLocus.TCR) has 217354 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-085.parquet.


2022-12-28 17:52:36,007 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-190921 (GeneLocus.TCR) has 185911 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-190921.parquet.


2022-12-28 17:52:36,212 - assign_clone_ids.ipynb - INFO - Participant Keck0006_MC1 (GeneLocus.TCR) has 135883 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0006_MC1.parquet.


2022-12-28 17:52:37,027 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-026 (GeneLocus.TCR) has 221276 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/INCOV026-AC-3_TCRB.tsv'), (437975, 68), (351821, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:37,760 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-339526 (GeneLocus.TCR) has 199055 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-339526.parquet.


  df = pd.read_csv(



2022-12-28 17:52:41,073 - assign_clone_ids.ipynb - INFO - Participant Keck0008_MC1 (GeneLocus.TCR) has 135681 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0008_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:52:41,979 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-723143 (GeneLocus.TCR) has 212989 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011223_TCRB.tsv'), (392239, 68), (316922, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:52:43,528 - assign_clone_ids.ipynb - INFO - Participant Keck0005_MC1 (GeneLocus.TCR) has 138801 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0005_MC1.parquet.


2022-12-28 17:52:43,791 - assign_clone_ids.ipynb - INFO - Participant Keck0009_MC1 (GeneLocus.TCR) has 121298 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0009_MC1.tsv'), (154799, 132), (123274, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:45,161 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0002436 (GeneLocus.TCR) has 205711 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/ADIRP0002436_TCRB.tsv'), (367116, 68), (295903, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:48,661 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-462527 (GeneLocus.TCR) has 194663 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-462527.parquet.


  df = pd.read_csv(



2022-12-28 17:52:49,928 - assign_clone_ids.ipynb - INFO - Participant 326737 (GeneLocus.BCR) has 218842 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/326737.parquet.


2022-12-28 17:52:50,279 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-679547 (GeneLocus.TCR) has 260054 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011340_TCRB.tsv'), (492645, 68), (397418, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:52:51,618 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0000439 (GeneLocus.TCR) has 206312 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/ADIRP0000439_TCRB.tsv'), (386640, 68), (304211, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:52:56,406 - assign_clone_ids.ipynb - INFO - Participant Keck0009_MC1 (GeneLocus.TCR) has 121118 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0009_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:52:59,127 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-026 (GeneLocus.TCR) has 220980 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-026.parquet.


  df = pd.read_csv(



2022-12-28 17:53:02,509 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-723143 (GeneLocus.TCR) has 212623 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-723143.parquet.


  df = pd.read_csv(



2022-12-28 17:53:05,365 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0002436 (GeneLocus.TCR) has 205217 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-ADIRP0002436.parquet.


  df = pd.read_csv(



2022-12-28 17:53:11,524 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-ADIRP0000439 (GeneLocus.TCR) has 205934 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-ADIRP0000439.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:53:13,121 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-679547 (GeneLocus.TCR) has 259428 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-679547.parquet.


2022-12-28 17:53:14,267 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-775827 (GeneLocus.TCR) has 281567 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011106_TCRB.tsv'), (547618, 68), (436461, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:53:20,215 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-696163 (GeneLocus.TCR) has 268985 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011130_TCRB.tsv'), (521834, 68), (411890, 80), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:53:27,506 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-588140 (GeneLocus.TCR) has 265362 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011495_TCRB.tsv'), (538822, 68), (415648, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:53:32,164 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 6, 'TRBV7-5*02': 2}






2022-12-28 17:53:39,792 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-775827 (GeneLocus.TCR) has 281091 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-775827.parquet.


2022-12-28 17:53:41,153 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-635695 (GeneLocus.TCR) has 281273 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/immunecode/reps/ImmuneCODE-Review-002/860011121_TCRB.tsv'), (563842, 68), (456709, 80), <GeneLocus.TCR: 2>)]


2022-12-28 17:53:42,086 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 10}


2022-12-28 17:53:42,541 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV8-2*01': 1}


2022-12-28 17:53:44,954 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 10, 'TRBV7-5*02': 4, 'TRBVA*01': 2}


2022-12-28 17:53:45,772 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-696163 (GeneLocus.TCR) has 268501 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-696163.parquet.


2022-12-28 17:53:46,733 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 12, 'TRBVA*01': 1}




2022-12-28 17:53:46,429 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 13, 'TRBVA*01': 2}




2022-12-28 17:53:47,035 - assign_clone_ids.ipynb - INFO - Participant Keck0020_MC1 (GeneLocus.TCR) has 68156 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0020_MC1.tsv'), (85667, 132), (69336, 144), <GeneLocus.TCR: 2>)]
















2022-12-28 17:53:53,171 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-588140 (GeneLocus.TCR) has 264846 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-588140.parquet.


2022-12-28 17:53:55,603 - assign_clone_ids.ipynb - INFO - Participant Keck0020_MC1 (GeneLocus.TCR) has 68074 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0020_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:53:58,600 - assign_clone_ids.ipynb - INFO - Participant Keck0036_MC1 (GeneLocus.TCR) has 96183 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0036_MC1.tsv'), (122527, 132), (97459, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:53:58,708 - assign_clone_ids.ipynb - INFO - Participant Keck0039_MC1 (GeneLocus.TCR) has 77241 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0039_MC1.tsv'), (100061, 132), (78307, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:54:01,939 - assign_clone_ids.ipynb - INFO - Participant Keck0022_MC1 (GeneLocus.TCR) has 107521 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0022_MC1.tsv'), (133926, 132), (109026, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:54:05,630 - assign_clone_ids.ipynb - INFO - Participant Keck0015_MC1 (GeneLocus.TCR) has 112070 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0015_MC1.tsv'), (146077, 132), (114248, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:07,068 - assign_clone_ids.ipynb - INFO - Participant Keck0029_MC1 (GeneLocus.TCR) has 112103 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0029_MC1.tsv'), (146453, 132), (114364, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:07,599 - assign_clone_ids.ipynb - INFO - Participant Keck0039_MC1 (GeneLocus.TCR) has 77136 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0039_MC1.parquet.


2022-12-28 17:54:09,194 - assign_clone_ids.ipynb - INFO - Participant Keck0036_MC1 (GeneLocus.TCR) has 96023 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0036_MC1.parquet.


2022-12-28 17:54:09,533 - assign_clone_ids.ipynb - INFO - Participant ImmuneCode-635695 (GeneLocus.TCR) has 280893 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/ImmuneCode-635695.parquet.


2022-12-28 17:54:10,413 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 12, 'TRBVA*01': 2}


2022-12-28 17:54:11,666 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15}


2022-12-28 17:54:12,279 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 17:54:13,221 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 8, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


2022-12-28 17:54:13,509 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 3, 'TRBV7-5*01': 8}


2022-12-28 17:54:13,739 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 5}


2022-12-28 17:54:13,778 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 3}


2022-12-28 17:54:13,837 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBV8-1*01': 1}


2022-12-28 17:54:13,959 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 27}






2022-12-28 17:54:14,573 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 7, 'TRBVA*01': 1}


2022-12-28 17:54:15,202 - assign_clone_ids.ipynb - INFO - Participant Keck0022_MC1 (GeneLocus.TCR) has 107349 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0022_MC1.parquet.


2022-12-28 17:54:15,382 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 5, 'TRBVA*01': 2}


2022-12-28 17:54:15,573 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 14, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 1}




2022-12-28 17:54:16,718 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 21, 'TRBVA*01': 3}


2022-12-28 17:54:16,964 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 14, 'TRBV7-5*02': 8, 'TRBVA*01': 4}


















2022-12-28 17:54:19,783 - assign_clone_ids.ipynb - INFO - Participant Keck0015_MC1 (GeneLocus.TCR) has 111867 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0015_MC1.parquet.




2022-12-28 17:54:20,127 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 28, 'TRBV8-1*01': 1, 'TRBVA*01': 6}


  df = pd.read_csv(









2022-12-28 17:54:21,161 - assign_clone_ids.ipynb - INFO - Participant Keck0029_MC1 (GeneLocus.TCR) has 111917 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0029_MC1.parquet.






2022-12-28 17:54:23,697 - assign_clone_ids.ipynb - INFO - Participant Keck0024_MC1 (GeneLocus.TCR) has 68672 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0024_MC1.tsv'), (88649, 132), (69940, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:54:24,840 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 39, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 17:54:25,401 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 18, 'TRBVA*01': 2}


2022-12-28 17:54:25,745 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 8, 'TRBV8-1*01': 1, 'TRBVA*01': 1}






2022-12-28 17:54:26,363 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9}


2022-12-28 17:54:27,633 - assign_clone_ids.ipynb - INFO - Participant Keck0027_MC1 (GeneLocus.TCR) has 58974 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0027_MC1.tsv'), (75724, 132), (59793, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:29,092 - assign_clone_ids.ipynb - INFO - Participant Keck0011_MC1 (GeneLocus.TCR) has 92727 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0011_MC1.tsv'), (120225, 132), (94276, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:29,198 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17}


  df = pd.read_csv(













2022-12-28 17:54:31,858 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 16, 'TRBV8-1*01': 1}


2022-12-28 17:54:32,236 - assign_clone_ids.ipynb - INFO - Participant Keck0018_MC1 (GeneLocus.TCR) has 102013 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0018_MC1.tsv'), (128567, 132), (103679, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:32,262 - assign_clone_ids.ipynb - INFO - Participant Keck0028_MC1 (GeneLocus.TCR) has 92797 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0028_MC1.tsv'), (117430, 132), (94242, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:54:33,175 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBVA*01': 1}


2022-12-28 17:54:33,398 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 17:54:33,674 - assign_clone_ids.ipynb - INFO - Participant Keck0024_MC1 (GeneLocus.TCR) has 68590 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0024_MC1.parquet.




2022-12-28 17:54:34,348 - assign_clone_ids.ipynb - INFO - Participant Keck0027_MC1 (GeneLocus.TCR) has 58879 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0027_MC1.parquet.


2022-12-28 17:54:34,511 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 6, 'TRBV7-5*02': 10, 'TRBVA*01': 4}




2022-12-28 17:54:34,604 - assign_clone_ids.ipynb - INFO - Participant Keck0017_MC1 (GeneLocus.TCR) has 100926 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0017_MC1.tsv'), (130068, 132), (102594, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:36,042 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 8, 'TRBVA*01': 6}


2022-12-28 17:54:37,205 - assign_clone_ids.ipynb - INFO - Participant Keck0033_MC1 (GeneLocus.TCR) has 109298 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0033_MC1.tsv'), (142414, 132), (111042, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:54:37,616 - assign_clone_ids.ipynb - INFO - Participant Keck0016_MC1 (GeneLocus.TCR) has 118618 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0016_MC1.tsv'), (145584, 132), (120619, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:37,759 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 2}




  df = pd.read_csv(



2022-12-28 17:54:38,417 - assign_clone_ids.ipynb - INFO - Participant Keck0010_MC1 (GeneLocus.TCR) has 115658 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0010_MC1.tsv'), (146922, 132), (117598, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:54:38,755 - assign_clone_ids.ipynb - INFO - Participant Keck0035_MC1 (GeneLocus.TCR) has 103948 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0035_MC1.tsv'), (127021, 132), (105364, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:39,202 - assign_clone_ids.ipynb - INFO - Participant Keck0014_MC1 (GeneLocus.TCR) has 121176 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0014_MC1.tsv'), (151441, 132), (123078, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:39,798 - assign_clone_ids.ipynb - INFO - Participant Keck0038_MC1 (GeneLocus.TCR) has 110688 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0038_MC1.tsv'), (141825, 132), (112379, 144), <GeneLocus.TCR: 2>)]










2022-12-28 17:54:40,587 - assign_clone_ids.ipynb - INFO - Participant Keck0011_MC1 (GeneLocus.TCR) has 92569 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0011_MC1.parquet.


2022-12-28 17:54:40,880 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 14, 'TRBVA*01': 1}






2022-12-28 17:54:42,726 - assign_clone_ids.ipynb - INFO - Participant Keck0026_MC1 (GeneLocus.TCR) has 76001 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0026_MC1.tsv'), (98394, 132), (77320, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:54:43,124 - assign_clone_ids.ipynb - INFO - Participant Keck0041_MC1 (GeneLocus.TCR) has 87522 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0041_MC1.tsv'), (108610, 132), (88895, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:43,268 - assign_clone_ids.ipynb - INFO - Participant Keck0042_MC1 (GeneLocus.TCR) has 92450 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0042_MC1.tsv'), (118945, 132), (93708, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:43,752 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4}


2022-12-28 17:54:44,091 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 33, 'TRBVA*01': 2}


2022-12-28 17:54:45,153 - assign_clone_ids.ipynb - INFO - Participant Keck0019_MC1 (GeneLocus.TCR) has 134019 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0019_MC1.tsv'), (170401, 132), (136084, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:54:45,559 - assign_clone_ids.ipynb - INFO - Participant Keck0018_MC1 (GeneLocus.TCR) has 101842 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0018_MC1.parquet.






2022-12-28 17:54:47,245 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 24, 'TRBVA*01': 7}


2022-12-28 17:54:47,676 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV8-1*01': 1, 'TRBVA*01': 1}


2022-12-28 17:54:47,712 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 30, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


2022-12-28 17:54:47,894 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBVA*01': 3}


2022-12-28 17:54:49,140 - assign_clone_ids.ipynb - INFO - Participant Keck0028_MC1 (GeneLocus.TCR) has 92662 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0028_MC1.parquet.






2022-12-28 17:54:51,828 - assign_clone_ids.ipynb - INFO - Participant Keck0017_MC1 (GeneLocus.TCR) has 100699 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0017_MC1.parquet.




2022-12-28 17:54:52,058 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 14, 'TRBV7-5*02': 14, 'TRBVA*01': 2}


2022-12-28 17:54:52,061 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 24}


2022-12-28 17:54:52,093 - assign_clone_ids.ipynb - INFO - Participant Keck0026_MC1 (GeneLocus.TCR) has 75919 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0026_MC1.parquet.




2022-12-28 17:54:52,491 - assign_clone_ids.ipynb - INFO - Participant Keck0016_MC1 (GeneLocus.TCR) has 118416 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0016_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:54:53,714 - assign_clone_ids.ipynb - INFO - Participant Keck0043_MC1 (GeneLocus.TCR) has 85846 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0043_MC1.tsv'), (110013, 132), (87462, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:53,999 - assign_clone_ids.ipynb - INFO - Participant Keck0010_MC1 (GeneLocus.TCR) has 115489 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0010_MC1.parquet.


2022-12-28 17:54:54,048 - assign_clone_ids.ipynb - INFO - Participant Keck0052_MC1 (GeneLocus.TCR) has 34749 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0052_MC1.tsv'), (43443, 132), (35006, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:54:54,333 - assign_clone_ids.ipynb - INFO - Participant Keck0014_MC1 (GeneLocus.TCR) has 121016 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0014_MC1.parquet.






2022-12-28 17:54:55,251 - assign_clone_ids.ipynb - INFO - Participant Keck0041_MC1 (GeneLocus.TCR) has 87424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0041_MC1.parquet.






2022-12-28 17:54:57,027 - assign_clone_ids.ipynb - INFO - Participant Keck0042_MC1 (GeneLocus.TCR) has 92275 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0042_MC1.parquet.


2022-12-28 17:54:57,824 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 14, 'TRBV7-5*02': 18, 'TRBVA*01': 8}


2022-12-28 17:54:58,078 - assign_clone_ids.ipynb - INFO - Participant Keck0012_MC1 (GeneLocus.TCR) has 93331 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0012_MC1.tsv'), (116567, 132), (94754, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:58,105 - assign_clone_ids.ipynb - INFO - Participant Keck0034_MC1 (GeneLocus.TCR) has 165802 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0034_MC1.tsv'), (211927, 132), (169093, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:54:59,447 - assign_clone_ids.ipynb - INFO - Participant Keck0035_MC1 (GeneLocus.TCR) has 103836 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0035_MC1.parquet.


2022-12-28 17:54:59,597 - assign_clone_ids.ipynb - INFO - Participant Keck0033_MC1 (GeneLocus.TCR) has 109129 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0033_MC1.parquet.


2022-12-28 17:54:59,848 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBVA*01': 2}


2022-12-28 17:55:00,198 - assign_clone_ids.ipynb - INFO - Participant Keck0045_MC1 (GeneLocus.TCR) has 92203 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0045_MC1.tsv'), (120687, 132), (93616, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:00,995 - assign_clone_ids.ipynb - INFO - Participant Keck0052_MC1 (GeneLocus.TCR) has 34688 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0052_MC1.parquet.


2022-12-28 17:55:02,375 - assign_clone_ids.ipynb - INFO - Participant Keck0038_MC1 (GeneLocus.TCR) has 110466 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0038_MC1.parquet.


2022-12-28 17:55:02,475 - assign_clone_ids.ipynb - INFO - Participant Keck0019_MC1 (GeneLocus.TCR) has 133756 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0019_MC1.parquet.






2022-12-28 17:55:03,340 - assign_clone_ids.ipynb - INFO - Participant Keck0023_MC1 (GeneLocus.TCR) has 106467 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0023_MC1.tsv'), (130394, 132), (107660, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:05,274 - assign_clone_ids.ipynb - INFO - Participant Keck0025_MC1 (GeneLocus.TCR) has 105247 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0025_MC1.tsv'), (130835, 132), (107609, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:05,429 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 27, 'TRBV8-1*01': 2}


2022-12-28 17:55:05,691 - assign_clone_ids.ipynb - INFO - Participant Keck0031_MC1 (GeneLocus.TCR) has 172129 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0031_MC1.tsv'), (220652, 132), (175511, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:07,526 - assign_clone_ids.ipynb - INFO - Participant Keck0051_MC1 (GeneLocus.TCR) has 77357 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0051_MC1.tsv'), (97035, 132), (78367, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:55:07,978 - assign_clone_ids.ipynb - INFO - Participant Keck0030_MC1 (GeneLocus.TCR) has 112302 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0030_MC1.tsv'), (145075, 132), (114184, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:09,493 - assign_clone_ids.ipynb - INFO - Participant Keck0044_MC1 (GeneLocus.TCR) has 116265 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0044_MC1.tsv'), (148952, 132), (118738, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:09,314 - assign_clone_ids.ipynb - INFO - Participant Keck0043_MC1 (GeneLocus.TCR) has 85718 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0043_MC1.parquet.


2022-12-28 17:55:10,659 - assign_clone_ids.ipynb - INFO - Participant Keck0047_MC1 (GeneLocus.TCR) has 125161 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0047_MC1.tsv'), (158835, 132), (127660, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:10,945 - assign_clone_ids.ipynb - INFO - Participant Keck0012_MC1 (GeneLocus.TCR) has 93200 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0012_MC1.parquet.


2022-12-28 17:55:10,976 - assign_clone_ids.ipynb - INFO - Participant Keck0032_MC1 (GeneLocus.TCR) has 120852 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0032_MC1.tsv'), (150015, 132), (123105, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:55:12,168 - assign_clone_ids.ipynb - INFO - Participant Keck0046_MC1 (GeneLocus.TCR) has 98780 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0046_MC1.tsv'), (128753, 132), (100454, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:12,648 - assign_clone_ids.ipynb - INFO - Participant Keck0045_MC1 (GeneLocus.TCR) has 92063 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0045_MC1.parquet.


2022-12-28 17:55:12,831 - assign_clone_ids.ipynb - INFO - Participant Keck0021_MC1 (GeneLocus.TCR) has 260809 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0021_MC1.tsv'), (336295, 132), (269200, 144), <GeneLocus.TCR: 2>)]






2022-12-28 17:55:15,456 - assign_clone_ids.ipynb - INFO - Participant Keck0050_MC1 (GeneLocus.TCR) has 100817 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0050_MC1.tsv'), (130012, 132), (102535, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:17,566 - assign_clone_ids.ipynb - INFO - Participant Keck0025_MC1 (GeneLocus.TCR) has 105124 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0025_MC1.parquet.






2022-12-28 17:55:20,951 - assign_clone_ids.ipynb - INFO - Participant Keck0030_MC1 (GeneLocus.TCR) has 112108 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0030_MC1.parquet.


2022-12-28 17:55:21,286 - assign_clone_ids.ipynb - INFO - Participant Keck0023_MC1 (GeneLocus.TCR) has 106286 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0023_MC1.parquet.


2022-12-28 17:55:22,259 - assign_clone_ids.ipynb - INFO - Participant Keck0051_MC1 (GeneLocus.TCR) has 77234 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0051_MC1.parquet.


2022-12-28 17:55:22,675 - assign_clone_ids.ipynb - INFO - Participant Keck0053_MC1 (GeneLocus.TCR) has 118383 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0053_MC1.tsv'), (149590, 132), (120171, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:23,449 - assign_clone_ids.ipynb - INFO - Participant Keck0048_MC1 (GeneLocus.TCR) has 133959 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0048_MC1.tsv'), (168693, 132), (136316, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:55:27,735 - assign_clone_ids.ipynb - INFO - Participant Keck0047_MC1 (GeneLocus.TCR) has 124961 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0047_MC1.parquet.


2022-12-28 17:55:27,735 - assign_clone_ids.ipynb - INFO - Participant Keck0044_MC1 (GeneLocus.TCR) has 116111 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0044_MC1.parquet.


2022-12-28 17:55:27,735 - assign_clone_ids.ipynb - INFO - Participant Keck0032_MC1 (GeneLocus.TCR) has 120680 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0032_MC1.parquet.


2022-12-28 17:55:27,736 - assign_clone_ids.ipynb - INFO - Participant Keck0046_MC1 (GeneLocus.TCR) has 98655 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0046_MC1.parquet.


2022-12-28 17:55:29,083 - assign_clone_ids.ipynb - INFO - Participant Keck0040_MC1 (GeneLocus.TCR) has 149228 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0040_MC1.tsv'), (189835, 132), (152096, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:29,494 - assign_clone_ids.ipynb - INFO - Participant Keck0050_MC1 (GeneLocus.TCR) has 100675 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0050_MC1.parquet.


2022-12-28 17:55:29,585 - assign_clone_ids.ipynb - INFO - Participant Keck0034_MC1 (GeneLocus.TCR) has 165508 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0034_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:55:32,945 - assign_clone_ids.ipynb - INFO - Participant 326797 (GeneLocus.BCR) has 465744 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.326797_1.tsv'), (2233482, 45), (900736, 56), <GeneLocus.BCR: 1>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:55:40,503 - assign_clone_ids.ipynb - INFO - Participant Keck0037_MC1 (GeneLocus.TCR) has 195732 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0037_MC1.tsv'), (240823, 132), (200128, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:41,368 - assign_clone_ids.ipynb - INFO - Participant Keck0031_MC1 (GeneLocus.TCR) has 171900 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0031_MC1.parquet.


  df = pd.read_csv(



2022-12-28 17:55:43,161 - assign_clone_ids.ipynb - INFO - Participant Keck0013_MC1 (GeneLocus.TCR) has 213478 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0013_MC1.tsv'), (273504, 132), (219501, 144), <GeneLocus.TCR: 2>)]


2022-12-28 17:55:44,325 - assign_clone_ids.ipynb - INFO - Participant Keck0049_MC1 (GeneLocus.TCR) has 191900 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0049_MC1.tsv'), (251590, 132), (197155, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 17:55:48,860 - assign_clone_ids.ipynb - INFO - Participant Keck0048_MC1 (GeneLocus.TCR) has 133765 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0048_MC1.parquet.


2022-12-28 17:55:48,860 - assign_clone_ids.ipynb - INFO - Participant Keck0053_MC1 (GeneLocus.TCR) has 118237 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0053_MC1.parquet.


2022-12-28 17:55:48,860 - assign_clone_ids.ipynb - INFO - Participant Keck0021_MC1 (GeneLocus.TCR) has 260445 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0021_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:55:53,348 - assign_clone_ids.ipynb - INFO - Participant Keck0040_MC1 (GeneLocus.TCR) has 149045 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0040_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:56:18,203 - assign_clone_ids.ipynb - INFO - Participant Keck0013_MC1 (GeneLocus.TCR) has 213164 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0013_MC1.parquet.


2022-12-28 17:56:18,203 - assign_clone_ids.ipynb - INFO - Participant Keck0037_MC1 (GeneLocus.TCR) has 195414 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0037_MC1.parquet.


2022-12-28 17:56:18,203 - assign_clone_ids.ipynb - INFO - Participant Keck0049_MC1 (GeneLocus.TCR) has 191640 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0049_MC1.parquet.


2022-12-28 17:56:18,204 - assign_clone_ids.ipynb - INFO - Participant 326797 (GeneLocus.BCR) has 377318 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/326797.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 17:59:33,025 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 10, 'TRBVA*01': 2}






2022-12-28 17:59:43,265 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 13, 'TRBVA*01': 1}


2022-12-28 17:59:43,935 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 20, 'TRBVA*01': 1}




2022-12-28 17:59:49,820 - assign_clone_ids.ipynb - INFO - Participant Keck0063_MC1 (GeneLocus.TCR) has 103395 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0063_MC1.tsv'), (134784, 132), (104847, 144), <GeneLocus.TCR: 2>)]




2022-12-28 17:59:55,624 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 2, 'TRBV7-5*02': 24, 'TRBVA*01': 1}


2022-12-28 18:00:01,618 - assign_clone_ids.ipynb - INFO - Participant Keck0063_MC1 (GeneLocus.TCR) has 103220 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0063_MC1.parquet.






2022-12-28 18:00:04,364 - assign_clone_ids.ipynb - INFO - Participant Keck0066_MC1 (GeneLocus.TCR) has 122692 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0066_MC1.tsv'), (153713, 132), (124463, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:05,609 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 18, 'TRBVA*01': 2}


2022-12-28 18:00:09,378 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBVA*01': 3}


2022-12-28 18:00:10,874 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 12, 'TRBV8-1*01': 1, 'TRBVA*01': 1}


2022-12-28 18:00:12,885 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 18, 'TRBVA*01': 6}


2022-12-28 18:00:13,417 - assign_clone_ids.ipynb - INFO - Participant Keck0054_MC1 (GeneLocus.TCR) has 183342 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0054_MC1.tsv'), (234311, 132), (187314, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:13,966 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV8-1*01': 1, 'TRBVA*01': 3}






2022-12-28 18:00:14,856 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 14, 'TRBVA*01': 5}


2022-12-28 18:00:16,243 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV7-5*02': 20, 'TRBVA*01': 2}


2022-12-28 18:00:17,892 - assign_clone_ids.ipynb - INFO - Participant Keck0066_MC1 (GeneLocus.TCR) has 122512 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0066_MC1.parquet.










2022-12-28 18:00:22,258 - assign_clone_ids.ipynb - INFO - Participant Keck0067_MC1 (GeneLocus.TCR) has 148213 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0067_MC1.tsv'), (184159, 132), (151332, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:00:24,546 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34, 'TRBV8-1*01': 1, 'TRBVA*01': 8}


  df = pd.read_csv(









2022-12-28 18:00:26,895 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*02': 48, 'TRBV8-2*01': 1, 'TRBVA*01': 6}


2022-12-28 18:00:30,193 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 1}


2022-12-28 18:00:30,605 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 22, 'TRBVA*01': 2}


2022-12-28 18:00:31,006 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 19, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 18:00:31,876 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 36, 'TRBVA*01': 4}


2022-12-28 18:00:34,067 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBVA*01': 5}


2022-12-28 18:00:34,479 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 3, 'TRBV7-5*01': 37, 'TRBVA*01': 8}


2022-12-28 18:00:34,607 - assign_clone_ids.ipynb - INFO - Participant Keck0054_MC1 (GeneLocus.TCR) has 183078 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0054_MC1.parquet.




2022-12-28 18:00:36,143 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 9, 'TRBV7-5*02': 13}






2022-12-28 18:00:41,206 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 22, 'TRBVA*01': 4}


2022-12-28 18:00:41,641 - assign_clone_ids.ipynb - INFO - Participant Keck0065_MC1 (GeneLocus.TCR) has 171617 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0065_MC1.tsv'), (214097, 132), (175762, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:00:43,349 - assign_clone_ids.ipynb - INFO - Participant Keck0067_MC1 (GeneLocus.TCR) has 148050 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0067_MC1.parquet.




2022-12-28 18:00:43,938 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 54, 'TRBV8-1*01': 1, 'TRBVA*01': 5}






2022-12-28 18:00:44,467 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 25, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 18:00:45,371 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBV7-5*02': 35, 'TRBVA*01': 3}


2022-12-28 18:00:47,350 - assign_clone_ids.ipynb - INFO - Participant Keck0058_MC1 (GeneLocus.TCR) has 181346 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0058_MC1.tsv'), (228466, 132), (184795, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:47,874 - assign_clone_ids.ipynb - INFO - Participant Keck0061_MC1 (GeneLocus.TCR) has 189642 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0061_MC1.tsv'), (233449, 132), (193468, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:49,782 - assign_clone_ids.ipynb - INFO - Participant Keck0060_MC1 (GeneLocus.TCR) has 193468 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0060_MC1.tsv'), (235143, 132), (197928, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:50,509 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 29, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 18:00:52,779 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 34, 'TRBV8-1*01': 3, 'TRBVA*01': 3}




2022-12-28 18:00:53,188 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 2}


2022-12-28 18:00:54,029 - assign_clone_ids.ipynb - INFO - Participant Keck0075_MC1 (GeneLocus.TCR) has 178411 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0075_MC1.tsv'), (225259, 132), (182560, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:00:54,223 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16}


2022-12-28 18:00:54,263 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 12, 'TRBV8-2*01': 1, 'TRBVA*01': 3}










2022-12-28 18:00:57,377 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 31, 'TRBV8-2*01': 1, 'TRBVA*01': 7}




2022-12-28 18:00:57,875 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 40, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 18:00:58,342 - assign_clone_ids.ipynb - INFO - Participant Keck0064_MC1 (GeneLocus.TCR) has 182835 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0064_MC1.tsv'), (236466, 132), (186279, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:01:00,007 - assign_clone_ids.ipynb - INFO - Participant Keck0056_MC1 (GeneLocus.TCR) has 206644 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0056_MC1.tsv'), (252170, 132), (211923, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:01:00,769 - assign_clone_ids.ipynb - INFO - Participant Keck0079_MC1 (GeneLocus.TCR) has 193652 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0079_MC1.tsv'), (249687, 132), (198663, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:01:06,836 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 19, 'TRBVA*01': 8}


2022-12-28 18:01:07,441 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 69, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 18:01:08,435 - assign_clone_ids.ipynb - INFO - Participant Keck0059_MC1 (GeneLocus.TCR) has 217597 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0059_MC1.tsv'), (279412, 132), (223983, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:09,065 - assign_clone_ids.ipynb - INFO - Participant Keck0065_MC1 (GeneLocus.TCR) has 171341 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0065_MC1.parquet.






2022-12-28 18:01:10,251 - assign_clone_ids.ipynb - INFO - Participant Keck0058_MC1 (GeneLocus.TCR) has 181031 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0058_MC1.parquet.


2022-12-28 18:01:10,844 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 13, 'TRBV7-5*02': 35, 'TRBVA*01': 6}


2022-12-28 18:01:10,951 - assign_clone_ids.ipynb - INFO - Participant Keck0061_MC1 (GeneLocus.TCR) has 189394 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0061_MC1.parquet.


2022-12-28 18:01:10,968 - assign_clone_ids.ipynb - INFO - Participant Keck0085_MC1 (GeneLocus.TCR) has 204621 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0085_MC1.tsv'), (255486, 132), (209402, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:01:12,872 - assign_clone_ids.ipynb - INFO - Participant Keck0060_MC1 (GeneLocus.TCR) has 193209 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0060_MC1.parquet.


2022-12-28 18:01:13,295 - assign_clone_ids.ipynb - INFO - Participant Keck0057_MC1 (GeneLocus.TCR) has 225568 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0057_MC1.tsv'), (280104, 132), (230996, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:01:14,828 - assign_clone_ids.ipynb - INFO - Participant Keck0062_MC1 (GeneLocus.TCR) has 229867 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0062_MC1.tsv'), (277100, 132), (236447, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:15,808 - assign_clone_ids.ipynb - INFO - Participant Keck0068_MC1 (GeneLocus.TCR) has 219198 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0068_MC1.tsv'), (281136, 132), (224716, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:18,238 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 13, 'TRBV7-5*02': 31, 'TRBV8-1*01': 1, 'TRBVA*01': 3}








2022-12-28 18:01:26,734 - assign_clone_ids.ipynb - INFO - Participant Keck0090_MC1 (GeneLocus.TCR) has 215501 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0090_MC1.tsv'), (278222, 132), (221165, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:26,644 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 38, 'TRBV8-2*01': 1, 'TRBVA*01': 8}


2022-12-28 18:01:27,087 - assign_clone_ids.ipynb - INFO - Participant Keck0079_MC1 (GeneLocus.TCR) has 193380 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0079_MC1.parquet.


2022-12-28 18:01:29,620 - assign_clone_ids.ipynb - INFO - Participant Keck0056_MC1 (GeneLocus.TCR) has 206387 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0056_MC1.parquet.






2022-12-28 18:01:31,364 - assign_clone_ids.ipynb - INFO - Participant Keck0075_MC1 (GeneLocus.TCR) has 178134 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0075_MC1.parquet.


2022-12-28 18:01:31,999 - assign_clone_ids.ipynb - INFO - Participant Keck0086_MC1 (GeneLocus.TCR) has 195628 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0086_MC1.tsv'), (252308, 132), (200340, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:01:32,933 - assign_clone_ids.ipynb - INFO - Participant Keck0055_MC1 (GeneLocus.TCR) has 261658 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0055_MC1.tsv'), (344520, 132), (270669, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:33,583 - assign_clone_ids.ipynb - INFO - Participant Keck0078_MC1 (GeneLocus.TCR) has 218411 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0078_MC1.tsv'), (272602, 132), (224382, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:34,618 - assign_clone_ids.ipynb - INFO - Participant Keck0059_MC1 (GeneLocus.TCR) has 217304 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0059_MC1.parquet.


2022-12-28 18:01:34,720 - assign_clone_ids.ipynb - INFO - Participant Keck0064_MC1 (GeneLocus.TCR) has 182418 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0064_MC1.parquet.






2022-12-28 18:01:36,041 - assign_clone_ids.ipynb - INFO - Participant Keck0085_MC1 (GeneLocus.TCR) has 204364 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0085_MC1.parquet.


2022-12-28 18:01:37,152 - assign_clone_ids.ipynb - INFO - Participant Keck0074_MC1 (GeneLocus.TCR) has 222963 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0074_MC1.tsv'), (276716, 132), (228546, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:01:41,894 - assign_clone_ids.ipynb - INFO - Participant Keck0068_MC1 (GeneLocus.TCR) has 218872 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0068_MC1.parquet.


2022-12-28 18:01:41,482 - assign_clone_ids.ipynb - INFO - Participant Keck0062_MC1 (GeneLocus.TCR) has 229414 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0062_MC1.parquet.


2022-12-28 18:01:41,183 - assign_clone_ids.ipynb - INFO - Participant Keck0057_MC1 (GeneLocus.TCR) has 225245 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0057_MC1.parquet.


2022-12-28 18:01:40,043 - assign_clone_ids.ipynb - INFO - Participant Keck0089_MC1 (GeneLocus.TCR) has 211906 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0089_MC1.tsv'), (271416, 132), (216842, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:41,149 - assign_clone_ids.ipynb - INFO - Participant Keck0072_MC1 (GeneLocus.TCR) has 232614 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0072_MC1.tsv'), (300189, 132), (239046, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:44,206 - assign_clone_ids.ipynb - INFO - Participant Keck0084_MC1 (GeneLocus.TCR) has 274168 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0084_MC1.tsv'), (333594, 132), (282390, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:48,882 - assign_clone_ids.ipynb - INFO - Participant Keck0088_MC1 (GeneLocus.TCR) has 242277 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0088_MC1.tsv'), (312925, 132), (249045, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:53,047 - assign_clone_ids.ipynb - INFO - Participant Keck0090_MC1 (GeneLocus.TCR) has 215157 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0090_MC1.parquet.


2022-12-28 18:01:53,753 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 30, 'TRBV7-5*02': 38, 'TRBVA*01': 6}


  df = pd.read_csv(



2022-12-28 18:01:55,858 - assign_clone_ids.ipynb - INFO - Participant Keck0081_MC1 (GeneLocus.TCR) has 277135 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0081_MC1.tsv'), (341262, 132), (285885, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:01:56,971 - assign_clone_ids.ipynb - INFO - Participant Keck0077_MC1 (GeneLocus.TCR) has 249839 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0077_MC1.tsv'), (327003, 132), (258091, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:01:57,110 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 106, 'TRBVA*01': 9}


2022-12-28 18:01:58,089 - assign_clone_ids.ipynb - INFO - Participant Keck0073_MC1 (GeneLocus.TCR) has 260914 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0073_MC1.tsv'), (327154, 132), (268821, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:01:58,697 - assign_clone_ids.ipynb - INFO - Participant Keck0091_MC1 (GeneLocus.TCR) has 222797 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0091_MC1.tsv'), (281954, 132), (228704, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:02:01,182 - assign_clone_ids.ipynb - INFO - Participant Keck0086_MC1 (GeneLocus.TCR) has 195406 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0086_MC1.parquet.


2022-12-28 18:02:03,528 - assign_clone_ids.ipynb - INFO - Participant Keck0055_MC1 (GeneLocus.TCR) has 261228 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0055_MC1.parquet.


2022-12-28 18:02:08,944 - assign_clone_ids.ipynb - INFO - Participant Keck0087_MC1 (GeneLocus.TCR) has 261310 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0087_MC1.tsv'), (330044, 132), (267971, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:02:12,191 - assign_clone_ids.ipynb - INFO - Participant Keck0078_MC1 (GeneLocus.TCR) has 218017 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0078_MC1.parquet.


2022-12-28 18:02:16,925 - assign_clone_ids.ipynb - INFO - Participant Keck0076_MC1 (GeneLocus.TCR) has 274443 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0076_MC1.tsv'), (350891, 132), (284751, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:02:16,962 - assign_clone_ids.ipynb - INFO - Participant Keck0074_MC1 (GeneLocus.TCR) has 222682 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0074_MC1.parquet.


2022-12-28 18:02:16,963 - assign_clone_ids.ipynb - INFO - Participant Keck0084_MC1 (GeneLocus.TCR) has 273612 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0084_MC1.parquet.


2022-12-28 18:02:18,328 - assign_clone_ids.ipynb - INFO - Participant Keck0071_MC1 (GeneLocus.TCR) has 307756 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0071_MC1.tsv'), (388511, 132), (318358, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:02:23,145 - assign_clone_ids.ipynb - INFO - Participant Keck0072_MC1 (GeneLocus.TCR) has 232300 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0072_MC1.parquet.


2022-12-28 18:02:23,145 - assign_clone_ids.ipynb - INFO - Participant Keck0089_MC1 (GeneLocus.TCR) has 211617 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0089_MC1.parquet.


2022-12-28 18:02:25,468 - assign_clone_ids.ipynb - INFO - Participant Keck0083_MC1 (GeneLocus.TCR) has 322860 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0083_MC1.tsv'), (424530, 132), (334771, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:02:23,375 - assign_clone_ids.ipynb - INFO - Participant Keck0091_MC1 (GeneLocus.TCR) has 222479 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0091_MC1.parquet.








2022-12-28 18:02:31,370 - assign_clone_ids.ipynb - INFO - Participant Keck0082_MC1 (GeneLocus.TCR) has 318172 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0082_MC1.tsv'), (393165, 132), (329813, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:02:35,883 - assign_clone_ids.ipynb - INFO - Participant Keck0073_MC1 (GeneLocus.TCR) has 260538 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0073_MC1.parquet.


2022-12-28 18:02:35,883 - assign_clone_ids.ipynb - INFO - Participant Keck0088_MC1 (GeneLocus.TCR) has 241962 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0088_MC1.parquet.


2022-12-28 18:02:35,883 - assign_clone_ids.ipynb - INFO - Participant Keck0077_MC1 (GeneLocus.TCR) has 249534 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0077_MC1.parquet.


2022-12-28 18:02:38,494 - assign_clone_ids.ipynb - INFO - Participant Keck0081_MC1 (GeneLocus.TCR) has 276779 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0081_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:02:55,769 - assign_clone_ids.ipynb - INFO - Participant Keck0080_MC1 (GeneLocus.TCR) has 415813 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0080_MC1.tsv'), (534749, 132), (432293, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:03:09,379 - assign_clone_ids.ipynb - INFO - Participant Keck0087_MC1 (GeneLocus.TCR) has 260897 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0087_MC1.parquet.


2022-12-28 18:03:09,379 - assign_clone_ids.ipynb - INFO - Participant Keck0071_MC1 (GeneLocus.TCR) has 307261 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0071_MC1.parquet.


2022-12-28 18:03:09,379 - assign_clone_ids.ipynb - INFO - Participant Keck0083_MC1 (GeneLocus.TCR) has 322403 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0083_MC1.parquet.


2022-12-28 18:03:09,379 - assign_clone_ids.ipynb - INFO - Participant Keck0076_MC1 (GeneLocus.TCR) has 274031 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0076_MC1.parquet.


2022-12-28 18:03:09,379 - assign_clone_ids.ipynb - INFO - Participant Keck0082_MC1 (GeneLocus.TCR) has 317740 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0082_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(





2022-12-28 18:03:38,859 - assign_clone_ids.ipynb - INFO - Participant Keck0070_MC1 (GeneLocus.TCR) has 531336 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0070_MC1.tsv'), (686257, 132), (558944, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:03:23,987 - assign_clone_ids.ipynb - INFO - Participant 326650 (GeneLocus.BCR) has 1325534 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.326650_1.tsv'), (3005561, 45), (1834861, 56), <GeneLocus.BCR: 1>)]


2022-12-28 18:03:44,941 - assign_clone_ids.ipynb - INFO - Participant Keck0069_MC1 (GeneLocus.TCR) has 558470 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0069_MC1.tsv'), (719943, 132), (585902, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:04:09,376 - assign_clone_ids.ipynb - INFO - Participant Keck0080_MC1 (GeneLocus.TCR) has 415067 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0080_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:04:29,103 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 18:04:25,162 - assign_clone_ids.ipynb - INFO - Participant 326780 (GeneLocus.BCR) has 803615 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.326780_1.tsv'), (3498945, 45), (1582210, 56), <GeneLocus.BCR: 1>)]


  df = pd.read_csv(



2022-12-28 18:04:37,503 - assign_clone_ids.ipynb - INFO - Participant Keck0070_MC1 (GeneLocus.TCR) has 530478 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0070_MC1.parquet.


2022-12-28 18:04:41,700 - assign_clone_ids.ipynb - INFO - Participant 326650 (GeneLocus.BCR) has 652899 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/326650.parquet.






  df = pd.read_csv(



2022-12-28 18:04:53,559 - assign_clone_ids.ipynb - INFO - Participant Keck0069_MC1 (GeneLocus.TCR) has 557494 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0069_MC1.parquet.


  df = pd.read_csv(



2022-12-28 18:05:16,101 - assign_clone_ids.ipynb - INFO - Participant Keck0092_MC1 (GeneLocus.TCR) has 281872 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0092_MC1.tsv'), (358821, 132), (290247, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:05:47,884 - assign_clone_ids.ipynb - INFO - Participant Keck0092_MC1 (GeneLocus.TCR) has 281208 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0092_MC1.parquet.


2022-12-28 18:05:51,963 - assign_clone_ids.ipynb - INFO - Participant 326780 (GeneLocus.BCR) has 712452 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/326780.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:07:25,932 - assign_clone_ids.ipynb - INFO - Participant Kim_B (GeneLocus.BCR) has 184286 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e815e1adeb2edc12613f.tsv'), (96630, 46), (39871, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e815e1adeb2edc126140.tsv'), (248183, 46), (165525, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:07:32,170 - assign_clone_ids.ipynb - INFO - Participant Kim_B (GeneLocus.BCR) has 41566 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_B.parquet.


2022-12-28 18:07:56,785 - assign_clone_ids.ipynb - INFO - Participant Kim_F (GeneLocus.BCR) has 114763 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e818e1adeb2edc126148.tsv'), (177586, 46), (40965, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e818e1adeb2edc126149.tsv'), (178506, 46), (94355, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:08:02,181 - assign_clone_ids.ipynb - INFO - Participant Kim_F (GeneLocus.BCR) has 40539 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_F.parquet.


2022-12-28 18:08:11,414 - assign_clone_ids.ipynb - INFO - Participant Kim_D (GeneLocus.BCR) has 158023 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e816e1adeb2edc126143.tsv'), (244087, 46), (97246, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e817e1adeb2edc126144.tsv'), (141689, 46), (81671, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:08:15,741 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 3}


2022-12-28 18:08:18,343 - assign_clone_ids.ipynb - INFO - Participant Kim_D (GeneLocus.BCR) has 58137 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_D.parquet.


2022-12-28 18:08:20,344 - assign_clone_ids.ipynb - INFO - Participant Montague_10 (GeneLocus.BCR) has 9252 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6199958881049973226-242ac116-0001-012.tsv'), (5394, 46), (1932, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6219758680284533226-242ac116-0001-012.tsv'), (28823, 46), (7772, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6162506766228853226-242ac116-0001-012.tsv'), (60120, 46), (15824, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.

2022-12-28 18:08:20,443 - malid.sample_sequences - INFO - Removing Montague_10 specimen 6162506766228853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2111.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:08:20,553 - malid.sample_sequences - INFO - Removing Montague_10 specimen 6180202031488373226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4902.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:08:20,584 - malid.sample_sequences - INFO - Removing Montague_10 specimen 6199958881049973226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 459.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:08:20,612 - malid.sample_sequences - INFO - Removing Montague_10 specimen 6219758680284533226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1510.0, 'IGHA': 0.0, 'IGHD-M': 0.0}








2022-12-28 18:08:22,822 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 20, 'TRBVA*01': 4}




2022-12-28 18:08:32,634 - assign_clone_ids.ipynb - INFO - Participant Kim_C (GeneLocus.BCR) has 245516 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e816e1adeb2edc126141.tsv'), (244025, 46), (152329, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e816e1adeb2edc126142.tsv'), (247883, 46), (113311, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:08:32,966 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 27, 'TRBV8-1*01': 1, 'TRBVA*01': 2}


2022-12-28 18:08:37,163 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 2, 'TRBV7-5*01': 11, 'TRBV7-5*02': 19, 'TRBVA*01': 2}


2022-12-28 18:08:37,757 - assign_clone_ids.ipynb - INFO - Participant Keck0108_MC1 (GeneLocus.TCR) has 140545 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0108_MC1.tsv'), (182801, 132), (143693, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:08:38,679 - assign_clone_ids.ipynb - INFO - Participant Kim_C (GeneLocus.BCR) has 43214 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_C.parquet.


2022-12-28 18:08:40,610 - assign_clone_ids.ipynb - INFO - Participant Kim_G (GeneLocus.BCR) has 227925 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e818e1adeb2edc12614a.tsv'), (227838, 46), (95076, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e819e1adeb2edc12614b.tsv'), (248305, 46), (151426, 57), <GeneLocus.BCR: 1>)]






2022-12-28 18:08:44,889 - assign_clone_ids.ipynb - INFO - Participant Kim_G (GeneLocus.BCR) has 36666 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_G.parquet.




2022-12-28 18:08:47,013 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 19, 'TRBVA*01': 2}


2022-12-28 18:08:48,794 - assign_clone_ids.ipynb - INFO - Participant Keck0103_MC1 (GeneLocus.TCR) has 160824 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0103_MC1.tsv'), (202793, 132), (163589, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:08:49,877 - assign_clone_ids.ipynb - INFO - Participant Kim_A (GeneLocus.BCR) has 184696 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e814e1adeb2edc12613c.tsv'), (293006, 46), (78510, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e814e1adeb2edc12613d.tsv'), (238375, 46), (57835, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e815e1adeb2edc12613e.tsv'), (213216, 46), (92707, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:08:53,187 - assign_clone_ids.ipynb - INFO - Participant Keck0108_MC1 (GeneLocus.TCR) has 140333 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0108_MC1.parquet.


2022-12-28 18:08:57,163 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 7, 'TRBV7-5*02': 30, 'TRBV8-2*01': 1, 'TRBVA*01': 4}






2022-12-28 18:08:57,914 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 24}


2022-12-28 18:09:00,525 - assign_clone_ids.ipynb - INFO - Participant Kim_A (GeneLocus.BCR) has 88289 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_A.parquet.


2022-12-28 18:09:01,015 - assign_clone_ids.ipynb - INFO - Participant Keck0109_MC1 (GeneLocus.TCR) has 159032 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0109_MC1.tsv'), (205781, 132), (162320, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:06,137 - assign_clone_ids.ipynb - INFO - Participant Keck0103_MC1 (GeneLocus.TCR) has 160473 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0103_MC1.parquet.








2022-12-28 18:09:08,807 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 20, 'TRBV8-1*01': 1, 'TRBVA*01': 2}


2022-12-28 18:09:10,050 - assign_clone_ids.ipynb - INFO - Participant Keck0100_MC1 (GeneLocus.TCR) has 179335 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0100_MC1.tsv'), (222506, 132), (183598, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:18,802 - assign_clone_ids.ipynb - INFO - Participant Keck0109_MC1 (GeneLocus.TCR) has 158836 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0109_MC1.parquet.






2022-12-28 18:09:20,978 - assign_clone_ids.ipynb - INFO - Participant Keck0098_MC1 (GeneLocus.TCR) has 204651 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0098_MC1.tsv'), (256124, 132), (208578, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:28,876 - assign_clone_ids.ipynb - INFO - Participant Keck0110_MC1 (GeneLocus.TCR) has 193625 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0110_MC1.tsv'), (241507, 132), (197989, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:29,300 - assign_clone_ids.ipynb - INFO - Participant Kim_E (GeneLocus.BCR) has 342008 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e817e1adeb2edc126145.tsv'), (203462, 46), (94919, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e817e1adeb2edc126146.tsv'), (232864, 46), (131156, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5f21e818e1adeb2edc126147.tsv'), (181123, 46), (138535, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:09:30,007 - assign_clone_ids.ipynb - INFO - Participant Keck0100_MC1 (GeneLocus.TCR) has 178922 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0100_MC1.parquet.


2022-12-28 18:09:30,474 - assign_clone_ids.ipynb - INFO - Participant Keck0112_MC1 (GeneLocus.TCR) has 184619 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0112_MC1.tsv'), (230602, 132), (189308, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:32,419 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 40, 'TRBVA*01': 1}


2022-12-28 18:09:38,635 - assign_clone_ids.ipynb - INFO - Participant Kim_E (GeneLocus.BCR) has 74734 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Kim_E.parquet.






  df = pd.read_csv(



2022-12-28 18:09:43,773 - assign_clone_ids.ipynb - INFO - Participant Keck0098_MC1 (GeneLocus.TCR) has 204271 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0098_MC1.parquet.


2022-12-28 18:09:44,727 - assign_clone_ids.ipynb - INFO - Participant Keck0102_MC1 (GeneLocus.TCR) has 207245 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0102_MC1.tsv'), (259978, 132), (212087, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:09:49,209 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 12, 'TRBV7-5*02': 12, 'TRBVA*01': 1}


2022-12-28 18:09:50,120 - assign_clone_ids.ipynb - INFO - Participant Keck0110_MC1 (GeneLocus.TCR) has 193353 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0110_MC1.parquet.


2022-12-28 18:09:52,627 - assign_clone_ids.ipynb - INFO - Participant Keck0112_MC1 (GeneLocus.TCR) has 184353 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0112_MC1.parquet.


2022-12-28 18:09:57,260 - assign_clone_ids.ipynb - INFO - Participant Montague_11 (GeneLocus.BCR) has 23345 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6272329079987573226-242ac116-0001-012.tsv'), (106191, 46), (34449, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6286545421737333226-242ac116-0001-012.tsv'), (290209, 46), (77804, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6235177612877173226-242ac116-0001-012.tsv'), (18582, 46), (7716, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/expo

2022-12-28 18:09:57,467 - malid.sample_sequences - INFO - Removing Montague_11 specimen 6235177612877173226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2313.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:57,637 - malid.sample_sequences - INFO - Removing Montague_11 specimen 6255707556552053226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 8146.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:57,747 - malid.sample_sequences - INFO - Removing Montague_11 specimen 6272329079987573226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4710.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:57,921 - malid.sample_sequences - INFO - Removing Montague_11 specimen 6286545421737333226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 7867.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:09:58,602 - assign_clone_ids.ipynb - INFO - Participant Montague_1 (GeneLocus.BCR) has 18614 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5578047616589173226-242ac116-0001-012.tsv'), (135481, 46), (38779, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5594669140024693226-242ac116-0001-012.tsv'), (232099, 46), (59811, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5549400184724853226-242ac116-0001-012.tsv'), (148738, 46), (34354, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exp

2022-12-28 18:09:58,825 - malid.sample_sequences - INFO - Removing Montague_1 specimen 5549400184724853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3484.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:58,940 - malid.sample_sequences - INFO - Removing Montague_1 specimen 5563272929090933226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4053.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:59,038 - malid.sample_sequences - INFO - Removing Montague_1 specimen 5578047616589173226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4726.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:09:59,169 - malid.sample_sequences - INFO - Removing Montague_1 specimen 5594669140024693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6148.0, 'IGHA': 0.0, 'IGHD-M': 0.0}






2022-12-28 18:10:08,843 - assign_clone_ids.ipynb - INFO - Participant Keck0102_MC1 (GeneLocus.TCR) has 206879 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0102_MC1.parquet.


2022-12-28 18:10:09,689 - assign_clone_ids.ipynb - INFO - Participant Keck0106_MC1 (GeneLocus.TCR) has 217572 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0106_MC1.tsv'), (272551, 132), (223973, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:10:12,777 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 19, 'TRBVA*01': 1}


2022-12-28 18:10:13,458 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 16, 'TRBV7-5*02': 29, 'TRBVA*01': 7}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:10:21,922 - assign_clone_ids.ipynb - INFO - Participant Montague_12 (GeneLocus.BCR) has 15027 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6333961860685173226-242ac116-0001-012.tsv'), (4431, 46), (1699, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6347448057994613226-242ac116-0001-012.tsv'), (75248, 46), (16792, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6301749605965173226-242ac116-0001-012.tsv'), (213547, 46), (62444, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/export

2022-12-28 18:10:22,160 - malid.sample_sequences - INFO - Removing Montague_12 specimen 6301749605965173226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5770.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:10:22,322 - malid.sample_sequences - INFO - Removing Montague_12 specimen 6318371129400693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6503.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:10:22,356 - malid.sample_sequences - INFO - Removing Montague_12 specimen 6333961860685173226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 393.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:10:22,398 - malid.sample_sequences - INFO - Removing Montague_12 specimen 6347448057994613226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2139.0, 'IGHA': 0.0, 'IGHD-M': 0.0}










2022-12-28 18:10:31,142 - assign_clone_ids.ipynb - INFO - Participant Keck0111_MC1 (GeneLocus.TCR) has 249683 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0111_MC1.tsv'), (299633, 132), (255741, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:10:33,218 - assign_clone_ids.ipynb - INFO - Participant Keck0106_MC1 (GeneLocus.TCR) has 217323 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0106_MC1.parquet.


  df = pd.read_csv(



2022-12-28 18:10:42,607 - assign_clone_ids.ipynb - INFO - Participant Montague_19 (GeneLocus.BCR) has 9750 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6835699940203893226-242ac116-0001-012.tsv'), (316668, 46), (82142, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:10:43,263 - malid.sample_sequences - INFO - Removing Montague_19 specimen 6835699940203893226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 9652.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




  df = pd.read_csv(



2022-12-28 18:10:52,590 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 15, 'TRBV7-5*02': 23, 'TRBVA*01': 1}


2022-12-28 18:10:56,498 - assign_clone_ids.ipynb - INFO - Participant Keck0099_MC1 (GeneLocus.TCR) has 255031 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0099_MC1.tsv'), (331751, 132), (261520, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:10:59,535 - assign_clone_ids.ipynb - INFO - Participant Keck0111_MC1 (GeneLocus.TCR) has 249343 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0111_MC1.parquet.


2022-12-28 18:11:00,576 - assign_clone_ids.ipynb - INFO - Participant Keck0107_MC1 (GeneLocus.TCR) has 260990 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0107_MC1.tsv'), (324727, 132), (267715, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:11:26,075 - assign_clone_ids.ipynb - INFO - Participant Keck0099_MC1 (GeneLocus.TCR) has 254648 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0099_MC1.parquet.


  df = pd.read_csv(



2022-12-28 18:11:34,637 - assign_clone_ids.ipynb - INFO - Participant Keck0107_MC1 (GeneLocus.TCR) has 260531 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0107_MC1.parquet.


2022-12-28 18:11:35,073 - assign_clone_ids.ipynb - INFO - Participant Keck0115_MC1 (GeneLocus.TCR) has 247264 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0115_MC1.tsv'), (321447, 132), (254682, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:11:47,387 - assign_clone_ids.ipynb - INFO - Participant Montague_9 (GeneLocus.BCR) has 7486 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6148719921208693226-242ac116-0001-012.tsv'), (34573, 46), (9624, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6074502886333813226-242ac116-0001-012.tsv'), (201629, 46), (49142, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:11:47,926 - malid.sample_sequences - INFO - Removing Montague_9 specimen 6074502886333813226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6170.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:11:47,968 - malid.sample_sequences - INFO - Removing Montague_9 specimen 6148719921208693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1216.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




  df = pd.read_csv(



2022-12-28 18:11:50,014 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 17, 'TRBV7-5*02': 17, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 18:12:03,007 - assign_clone_ids.ipynb - INFO - Participant Keck0115_MC1 (GeneLocus.TCR) has 246841 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0115_MC1.parquet.






  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:12:35,638 - assign_clone_ids.ipynb - INFO - Participant Keck0117_MC1 (GeneLocus.TCR) has 261950 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0117_MC1.tsv'), (331264, 132), (269304, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:12:36,124 - assign_clone_ids.ipynb - INFO - Participant Montague_16 (GeneLocus.BCR) has 17478 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6654108722929013226-242ac116-0001-012.tsv'), (117392, 46), (29894, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6667637869911413226-242ac116-0001-012.tsv'), (189534, 46), (46831, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6625633089756533226-242ac116-0001-012.tsv'), (182314, 46), (41567, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/ex

2022-12-28 18:12:36,335 - malid.sample_sequences - INFO - Removing Montague_16 specimen 6625633089756533226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4705.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:36,425 - malid.sample_sequences - INFO - Removing Montague_16 specimen 6639763532160373226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4105.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:36,506 - malid.sample_sequences - INFO - Removing Montague_16 specimen 6654108722929013226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2994.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:36,617 - malid.sample_sequences - INFO - Removing Montague_16 specimen 6667637869911413226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5493.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




  df = pd.read_csv(



2022-12-28 18:12:51,808 - assign_clone_ids.ipynb - INFO - Participant Montague_17 (GeneLocus.BCR) has 14124 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6724073740180853226-242ac116-0001-012.tsv'), (205918, 46), (42972, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6738676628987253226-242ac116-0001-012.tsv'), (108175, 46), (26105, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6680952268529013226-242ac116-0001-012.tsv'), (14596, 46), (2333, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/expo

2022-12-28 18:12:51,967 - malid.sample_sequences - INFO - Removing Montague_17 specimen 6680952268529013226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 175.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:51,999 - malid.sample_sequences - INFO - Removing Montague_17 specimen 6696585949486453226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1014.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:52,118 - malid.sample_sequences - INFO - Removing Montague_17 specimen 6710329844833653226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5155.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:52,237 - malid.sample_sequences - INFO - Removing Montague_17 specimen 6724073740180853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4750.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:52,311 - malid.sample_sequences - INFO - Removing Montague_17 specimen 6738676628987253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2836.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:12:54,485 - assign_clone_ids.ipynb - INFO - Participant Montague_6 (GeneLocus.BCR) has 16023 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5884579432504693226-242ac116-0001-012.tsv'), (228329, 46), (68085, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5907944054594933226-242ac116-0001-012.tsv'), (285778, 46), (63909, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:12:55,022 - malid.sample_sequences - INFO - Removing Montague_6 specimen 5884579432504693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 8618.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:55,188 - malid.sample_sequences - INFO - Removing Montague_6 specimen 5907944054594933226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 7077.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:12:56,752 - assign_clone_ids.ipynb - INFO - Participant Montague_13 (GeneLocus.BCR) has 29712 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6392545214602613226-242ac116-0001-012.tsv'), (107965, 46), (25794, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6410541127572853226-242ac116-0001-012.tsv'), (212974, 46), (45956, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6361063104322933226-242ac116-0001-012.tsv'), (264083, 46), (81177, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/ex

2022-12-28 18:12:57,173 - malid.sample_sequences - INFO - Removing Montague_13 specimen 6361063104322933226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 9917.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:57,405 - malid.sample_sequences - INFO - Removing Montague_13 specimen 6374978798361973226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 10450.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:57,502 - malid.sample_sequences - INFO - Removing Montague_13 specimen 6392545214602613226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2996.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:12:57,613 - malid.sample_sequences - INFO - Removing Montague_13 specimen 6410541127572853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5587.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




  df = pd.read_csv(



2022-12-28 18:13:01,806 - assign_clone_ids.ipynb - INFO - Participant Montague_15 (GeneLocus.BCR) has 21637 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6592003495828853226-242ac116-0001-012.tsv'), (142658, 46), (43131, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6606649334308213226-242ac116-0001-012.tsv'), (188553, 46), (50587, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6543040868654453226-242ac116-0001-012.tsv'), (13116, 46), (3542, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/expo

2022-12-28 18:13:01,740 - assign_clone_ids.ipynb - INFO - Participant Montague_4 (GeneLocus.BCR) has 13944 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5828143562235253226-242ac116-0001-012.tsv'), (61608, 46), (14052, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5842102205947253226-242ac116-0001-012.tsv'), (175161, 46), (32159, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5801214117289333226-242ac116-0001-012.tsv'), (111334, 46), (23394, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/expo

2022-12-28 18:13:02,258 - malid.sample_sequences - INFO - Removing Montague_4 specimen 5801214117289333226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2574.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,283 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6486218451328373226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 421.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,328 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6500348893732213226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 458.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,356 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6514393436790133226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1328.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,358 - malid.sample_sequences - INFO - Removing Montague_4 specimen 5814829163617653226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4482.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,376 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6528180281810293226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 426.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,397 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6543040868654453226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 419.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,407 - malid.sample_sequences - INFO - Removing Montague_4 specimen 5828143562235253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2389.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,461 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6563399013637493226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2373.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,483 - malid.sample_sequences - INFO - Removing Montague_4 specimen 5842102205947253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4421.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:13:02,562 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6577873053425013226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5115.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,677 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6592003495828853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5210.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:02,804 - malid.sample_sequences - INFO - Removing Montague_15 specimen 6606649334308213226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5597.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:13:06,785 - assign_clone_ids.ipynb - INFO - Participant Keck0117_MC1 (GeneLocus.TCR) has 261564 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0117_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:13:29,310 - assign_clone_ids.ipynb - INFO - Participant 327059 (GeneLocus.BCR) has 1682884 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/briney_healthy_as_part_tables/exported.part_table.327059_1.tsv'), (4242001, 45), (2619359, 56), <GeneLocus.BCR: 1>)]


2022-12-28 18:13:34,361 - assign_clone_ids.ipynb - INFO - Participant Montague_18 (GeneLocus.BCR) has 20384 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6795112499256693226-242ac116-0001-012.tsv'), (94573, 46), (23010, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6821827195837813226-242ac116-0001-012.tsv'), (151416, 46), (35750, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6753537215831413226-242ac116-0001-012.tsv'), (210036, 46), (43789, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exp

2022-12-28 18:13:34,598 - malid.sample_sequences - INFO - Removing Montague_18 specimen 6753537215831413226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4728.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:34,689 - malid.sample_sequences - INFO - Removing Montague_18 specimen 6768183054310773226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3696.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:34,791 - malid.sample_sequences - INFO - Removing Montague_18 specimen 6781110905871733226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3937.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:34,860 - malid.sample_sequences - INFO - Removing Montague_18 specimen 6795112499256693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3080.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:13:34,948 - malid.sample_sequences - INFO - Removing Montague_18 specimen 6821827195837813226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4809.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:14:51,522 - assign_clone_ids.ipynb - INFO - Participant Montague_5 (GeneLocus.BCR) has 20301 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5856060849659253226-242ac116-0001-012.tsv'), (362992, 46), (101252, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5869504097295733226-242ac116-0001-012.tsv'), (432636, 46), (98751, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:14:52,110 - malid.sample_sequences - INFO - Removing Montague_5 specimen 5856060849659253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 9711.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:14:52,383 - malid.sample_sequences - INFO - Removing Montague_5 specimen 5869504097295733226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 10239.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:14:53,149 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV7-5*02': 31, 'TRBV8-2*01': 1, 'TRBVA*01': 6}


  df = pd.read_csv(



2022-12-28 18:15:04,754 - assign_clone_ids.ipynb - INFO - Participant Montague_14 (GeneLocus.BCR) has 31746 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6458344113577333226-242ac116-0001-012.tsv'), (217847, 46), (60398, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6472560455327093226-242ac116-0001-012.tsv'), (294342, 46), (69640, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6428064594140533226-242ac116-0001-012.tsv'), (255359, 46), (78374, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/ex

2022-12-28 18:15:05,340 - malid.sample_sequences - INFO - Removing Montague_14 specimen 6428064594140533226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 8073.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:05,620 - malid.sample_sequences - INFO - Removing Montague_14 specimen 6445072664632693226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 10274.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


  df = pd.read_csv(



2022-12-28 18:15:05,795 - malid.sample_sequences - INFO - Removing Montague_14 specimen 6458344113577333226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6463.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:05,978 - malid.sample_sequences - INFO - Removing Montague_14 specimen 6472560455327093226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6530.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:15:10,605 - assign_clone_ids.ipynb - INFO - Participant 327059 (GeneLocus.BCR) has 909580 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/327059.parquet.


2022-12-28 18:15:11,316 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 1, 'TRBVA*01': 3}




2022-12-28 18:15:17,439 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 42, 'TRBV8-1*01': 1, 'TRBVA*01': 7}


2022-12-28 18:15:20,623 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 39, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 18:15:23,367 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 4}


2022-12-28 18:15:23,386 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 52, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 18:15:23,463 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 34, 'TRBVA*01': 3}


2022-12-28 18:15:23,565 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34, 'TRBVA*01': 4}


2022-12-28 18:15:24,954 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV24/OR9-2*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 27, 'TRBV8-1*01': 2, 'TRBVA*01': 6}


2022-12-28 18:15:26,534 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 23, 'TRBV7-5*02': 22, 'TRBVA*01': 6}


2022-12-28 18:15:27,113 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 28, 'TRBVA*01': 1}




2022-12-28 18:15:28,422 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBVA*01': 2}


2022-12-28 18:15:28,707 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 37, 'TRBV7-5*02': 41, 'TRBV8-1*01': 1, 'TRBVA*01': 6}


2022-12-28 18:15:28,800 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 46, 'TRBVA*01': 5}


2022-12-28 18:15:28,976 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 15, 'TRBVA*01': 1}






2022-12-28 18:15:30,208 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV8-2*01': 1, 'TRBVA*01': 3}


2022-12-28 18:15:31,593 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 1, 'TRBVA*01': 3}


2022-12-28 18:15:32,142 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 15, 'TRBVA*01': 1}








2022-12-28 18:15:33,729 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 26, 'TRBV7-5*02': 61, 'TRBVA*01': 2}






2022-12-28 18:15:35,190 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 10, 'TRBVA*01': 2}






2022-12-28 18:15:37,416 - assign_clone_ids.ipynb - INFO - Participant P00008 (GeneLocus.TCR) has 62420 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00008.tsv'), (91222, 132), (71837, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:15:39,401 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBVA*01': 2}












2022-12-28 18:15:42,421 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 19, 'TRBVA*01': 4}








2022-12-28 18:15:44,149 - assign_clone_ids.ipynb - INFO - Participant P00010 (GeneLocus.TCR) has 84952 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00010.tsv'), (116885, 132), (87947, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:15:44,923 - assign_clone_ids.ipynb - INFO - Participant P00008 (GeneLocus.TCR) has 62329 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00008.parquet.


2022-12-28 18:15:46,625 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 36, 'TRBVA*01': 2}




2022-12-28 18:15:47,463 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 3}




  df = pd.read_csv(



2022-12-28 18:15:49,329 - assign_clone_ids.ipynb - INFO - Participant P00001 (GeneLocus.TCR) has 96054 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00001.tsv'), (130930, 132), (100283, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:15:49,853 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBV7-5*02': 59, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 8}




2022-12-28 18:15:50,127 - assign_clone_ids.ipynb - INFO - Participant P00002 (GeneLocus.TCR) has 92226 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00002.tsv'), (115913, 132), (94392, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:15:52,774 - assign_clone_ids.ipynb - INFO - Participant Keck0105_MC1 (GeneLocus.TCR) has 286698 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0105_MC1.tsv'), (363568, 132), (295556, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:15:54,535 - assign_clone_ids.ipynb - INFO - Participant P00010 (GeneLocus.TCR) has 84834 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00010.parquet.


2022-12-28 18:15:54,515 - assign_clone_ids.ipynb - INFO - Participant Montague_3 (GeneLocus.BCR) has 27237 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5718836644552053226-242ac116-0001-012.tsv'), (233695, 46), (49298, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5731506798075253226-242ac116-0001-012.tsv'), (95534, 46), (24213, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5744907096038773226-242ac116-0001-012.tsv'), (298910, 46), (72614, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/expo



2022-12-28 18:15:54,965 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5718836644552053226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4230.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:55,051 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5731506798075253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2660.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:55,270 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5744907096038773226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6565.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:55,372 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5758264444329333226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3393.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:55,441 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5771922440330613226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2433.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:15:55,600 - malid.sample_sequences - INFO - Removing Montague_3 specimen 5786783027174773226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 7407.0, 'IGHA': 0.0, 'IGHD-M': 0.0}






2022-12-28 18:15:56,701 - assign_clone_ids.ipynb - INFO - Participant P00005 (GeneLocus.TCR) has 92739 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00005.tsv'), (128270, 132), (98516, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:15:56,965 - assign_clone_ids.ipynb - INFO - Participant P00020 (GeneLocus.TCR) has 90247 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00020.tsv'), (119021, 132), (94360, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:15:57,105 - assign_clone_ids.ipynb - INFO - Participant P00011 (GeneLocus.TCR) has 92512 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00011.tsv'), (127879, 132), (97677, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:15:57,616 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 21, 'TRBVA*01': 9}














2022-12-28 18:16:00,143 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 41, 'TRBVA*01': 2}






2022-12-28 18:16:01,051 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 37, 'TRBVA*01': 2}


2022-12-28 18:16:02,626 - assign_clone_ids.ipynb - INFO - Participant P00001 (GeneLocus.TCR) has 95869 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00001.parquet.


2022-12-28 18:16:03,415 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 1, 'TRBVA*01': 2}


2022-12-28 18:16:09,093 - assign_clone_ids.ipynb - INFO - Participant P00002 (GeneLocus.TCR) has 92140 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00002.parquet.








2022-12-28 18:16:12,448 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 33, 'TRBVA*01': 1}




2022-12-28 18:16:13,066 - assign_clone_ids.ipynb - INFO - Participant P00005 (GeneLocus.TCR) has 92551 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00005.parquet.


2022-12-28 18:16:14,454 - assign_clone_ids.ipynb - INFO - Participant P00020 (GeneLocus.TCR) has 90086 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00020.parquet.


2022-12-28 18:16:15,574 - assign_clone_ids.ipynb - INFO - Participant P00018 (GeneLocus.TCR) has 124022 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00018.tsv'), (164435, 132), (130171, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:16,136 - assign_clone_ids.ipynb - INFO - Participant P00011 (GeneLocus.TCR) has 92361 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00011.parquet.






2022-12-28 18:16:16,772 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 55, 'TRBVA*01': 6}


2022-12-28 18:16:17,525 - assign_clone_ids.ipynb - INFO - Participant P00012 (GeneLocus.TCR) has 136278 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00012.tsv'), (184572, 132), (145544, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:21,068 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 46, 'TRBVA*01': 6}


2022-12-28 18:16:21,479 - assign_clone_ids.ipynb - INFO - Participant Keck0096_MC1 (GeneLocus.TCR) has 315127 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0096_MC1.tsv'), (406242, 132), (325607, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:16:24,905 - assign_clone_ids.ipynb - INFO - Participant Keck0120_MC1 (GeneLocus.TCR) has 311693 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0120_MC1.tsv'), (397170, 132), (321638, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:25,018 - assign_clone_ids.ipynb - INFO - Participant Keck0101_MC1 (GeneLocus.TCR) has 337694 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0101_MC1.tsv'), (398509, 132), (348144, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:25,502 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 36, 'TRBVA*01': 10}


  df = pd.read_csv(



2022-12-28 18:16:29,314 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 3}


2022-12-28 18:16:31,166 - assign_clone_ids.ipynb - INFO - Participant P00003 (GeneLocus.TCR) has 150896 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00003.tsv'), (198824, 132), (156009, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:31,233 - assign_clone_ids.ipynb - INFO - Participant Keck0105_MC1 (GeneLocus.TCR) has 286280 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0105_MC1.parquet.


2022-12-28 18:16:31,416 - assign_clone_ids.ipynb - INFO - Participant Keck0116_MC1 (GeneLocus.TCR) has 341956 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0116_MC1.tsv'), (438826, 132), (353827, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:16:31,997 - assign_clone_ids.ipynb - INFO - Participant P00021 (GeneLocus.TCR) has 141939 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00021.tsv'), (187248, 132), (148857, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:32,054 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 31, 'TRBVA*01': 9}


2022-12-28 18:16:32,129 - assign_clone_ids.ipynb - INFO - Participant Keck0097_MC1 (GeneLocus.TCR) has 355156 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0097_MC1.tsv'), (452076, 132), (365761, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:32,179 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 49, 'TRBVA*01': 5}


2022-12-28 18:16:33,802 - assign_clone_ids.ipynb - INFO - Participant P00007 (GeneLocus.TCR) has 161465 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00007.tsv'), (210818, 132), (168535, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:35,861 - assign_clone_ids.ipynb - INFO - Participant Keck0118_MC1 (GeneLocus.TCR) has 302714 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0118_MC1.tsv'), (397487, 132), (311873, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:16:36,853 - assign_clone_ids.ipynb - INFO - Participant P00006 (GeneLocus.TCR) has 189424 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00006.tsv'), (253381, 132), (198009, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:16:38,213 - assign_clone_ids.ipynb - INFO - Participant P00018 (GeneLocus.TCR) has 123847 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00018.parquet.








2022-12-28 18:16:39,227 - assign_clone_ids.ipynb - INFO - Participant P00004 (GeneLocus.TCR) has 181959 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00004.tsv'), (248947, 132), (189660, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:16:41,654 - assign_clone_ids.ipynb - INFO - Participant Keck0104_MC1 (GeneLocus.TCR) has 378550 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0104_MC1.tsv'), (479431, 132), (393461, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:43,471 - assign_clone_ids.ipynb - INFO - Participant P00012 (GeneLocus.TCR) has 136111 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00012.parquet.


2022-12-28 18:16:46,850 - assign_clone_ids.ipynb - INFO - Participant Montague_2 (GeneLocus.BCR) has 28654 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5609272028831093226-242ac116-0001-012.tsv'), (21084, 46), (7174, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5665063654006133226-242ac116-0001-012.tsv'), (82129, 46), (19822, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5692895042084213226-242ac116-0001-012.tsv'), (86492, 46), (20453, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exporte

2022-12-28 18:16:46,879 - assign_clone_ids.ipynb - INFO - Participant Keck0119_MC1 (GeneLocus.TCR) has 337634 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0119_MC1.tsv'), (421151, 132), (350455, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:16:47,310 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5609272028831093226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1294.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:47,462 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5624519162731893226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4560.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:47,637 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5638220108406133226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4625.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:48,032 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5651706305715573226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 10063.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:48,383 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5665063654006133226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 2121.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:48,500 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5692895042084213226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 1945.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:16:48,669 - malid.sample_sequences - INFO - Removing Montague_2 specimen 5705736994299253226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3521.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


  df = pd.read_csv(





2022-12-28 18:16:50,114 - assign_clone_ids.ipynb - INFO - Participant P00021 (GeneLocus.TCR) has 141737 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00021.parquet.






2022-12-28 18:16:53,012 - assign_clone_ids.ipynb - INFO - Participant P00013 (GeneLocus.TCR) has 185309 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00013.tsv'), (238160, 132), (195665, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(











2022-12-28 18:17:00,952 - assign_clone_ids.ipynb - INFO - Participant P00022 (GeneLocus.TCR) has 126246 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00022.tsv'), (169821, 132), (130949, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:17:02,572 - assign_clone_ids.ipynb - INFO - Participant Keck0093_MC1 (GeneLocus.TCR) has 455367 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0093_MC1.tsv'), (570729, 132), (474902, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:02,983 - assign_clone_ids.ipynb - INFO - Participant P00003 (GeneLocus.TCR) has 150627 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00003.parquet.


2022-12-28 18:17:02,983 - assign_clone_ids.ipynb - INFO - Participant Keck0113_MC1 (GeneLocus.TCR) has 450412 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0113_MC1.tsv'), (572028, 132), (467127, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:04,384 - assign_clone_ids.ipynb - INFO - Participant P00007 (GeneLocus.TCR) has 161248 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00007.parquet.


  df = pd.read_csv(



2022-12-28 18:17:05,194 - assign_clone_ids.ipynb - INFO - Participant P00006 (GeneLocus.TCR) has 189196 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00006.parquet.


  df = pd.read_csv(



2022-12-28 18:17:06,323 - assign_clone_ids.ipynb - INFO - Participant P00004 (GeneLocus.TCR) has 181762 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00004.parquet.


2022-12-28 18:17:06,629 - assign_clone_ids.ipynb - INFO - Participant Keck0096_MC1 (GeneLocus.TCR) has 314453 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0096_MC1.parquet.


2022-12-28 18:17:07,761 - assign_clone_ids.ipynb - INFO - Participant Keck0114_MC1 (GeneLocus.TCR) has 401973 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0114_MC1.tsv'), (510885, 132), (419846, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:17:10,174 - assign_clone_ids.ipynb - INFO - Participant Keck0120_MC1 (GeneLocus.TCR) has 311200 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0120_MC1.parquet.


2022-12-28 18:17:10,565 - assign_clone_ids.ipynb - INFO - Participant Keck0094_MC1 (GeneLocus.TCR) has 398684 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0094_MC1.tsv'), (522826, 132), (412825, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:14,663 - assign_clone_ids.ipynb - INFO - Participant Keck0116_MC1 (GeneLocus.TCR) has 341439 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0116_MC1.parquet.


2022-12-28 18:17:16,506 - assign_clone_ids.ipynb - INFO - Participant Keck0097_MC1 (GeneLocus.TCR) has 354441 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0097_MC1.parquet.


2022-12-28 18:17:19,538 - assign_clone_ids.ipynb - INFO - Participant P00009 (GeneLocus.TCR) has 271279 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00009.tsv'), (369167, 132), (282441, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:20,417 - assign_clone_ids.ipynb - INFO - Participant P00022 (GeneLocus.TCR) has 126030 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00022.parquet.


2022-12-28 18:17:20,927 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 16, 'TRBVA*01': 2}


2022-12-28 18:17:21,553 - assign_clone_ids.ipynb - INFO - Participant P00017 (GeneLocus.TCR) has 241159 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00017.tsv'), (327893, 132), (256213, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:22,557 - assign_clone_ids.ipynb - INFO - Participant Keck0101_MC1 (GeneLocus.TCR) has 337104 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0101_MC1.parquet.


  df = pd.read_csv(



2022-12-28 18:17:23,394 - assign_clone_ids.ipynb - INFO - Participant P00016 (GeneLocus.TCR) has 224096 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00016.tsv'), (297970, 132), (237579, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:24,498 - assign_clone_ids.ipynb - INFO - Participant Montague_8 (GeneLocus.BCR) has 33770 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6045683655777653226-242ac116-0001-012.tsv'), (125378, 46), (37748, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6059341651778933226-242ac116-0001-012.tsv'), (227739, 46), (61557, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.6009734779510133226-242ac116-0001-012.tsv'), (220537, 46), (49298, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exp

2022-12-28 18:17:24,951 - assign_clone_ids.ipynb - INFO - Participant Keck0118_MC1 (GeneLocus.TCR) has 302057 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0118_MC1.parquet.


2022-12-28 18:17:25,123 - malid.sample_sequences - INFO - Removing Montague_8 specimen 5977479575117173226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 3953.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:25,266 - malid.sample_sequences - INFO - Removing Montague_8 specimen 5994186997898613226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5515.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:25,437 - malid.sample_sequences - INFO - Removing Montague_8 specimen 6009734779510133226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 4983.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:25,579 - malid.sample_sequences - INFO - Removing Montague_8 specimen 6028546736266613226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5236.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:25,691 - malid.sample_sequences - INFO - Removing Montague_8 specimen 6045683655777653226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 5613.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:25,881 - malid.sample_sequences - INFO - Removing Montague_8 specimen 6059341651778933226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 8199.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:17:27,639 - assign_clone_ids.ipynb - INFO - Participant P00013 (GeneLocus.TCR) has 185005 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00013.parquet.


2022-12-28 18:17:27,639 - assign_clone_ids.ipynb - INFO - Participant Keck0104_MC1 (GeneLocus.TCR) has 378024 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0104_MC1.parquet.






2022-12-28 18:17:38,958 - assign_clone_ids.ipynb - INFO - Participant Keck0119_MC1 (GeneLocus.TCR) has 337169 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0119_MC1.parquet.


2022-12-28 18:17:39,530 - assign_clone_ids.ipynb - INFO - Participant P00015 (GeneLocus.TCR) has 268439 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00015.tsv'), (381116, 132), (283411, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:17:40,790 - assign_clone_ids.ipynb - INFO - Participant P00019 (GeneLocus.TCR) has 293334 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00019.tsv'), (389170, 132), (310319, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:42,393 - assign_clone_ids.ipynb - INFO - Participant P00014 (GeneLocus.TCR) has 296017 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00014.tsv'), (400752, 132), (312393, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:17:46,934 - assign_clone_ids.ipynb - INFO - Participant Montague_7 (GeneLocus.BCR) has 35056 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5962618988273013226-242ac116-0001-012.tsv'), (322663, 46), (87385, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5946684659604853226-242ac116-0001-012.tsv'), (152761, 46), (45221, 57), <GeneLocus.BCR: 1>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/covid_external_as_part_tables/exported.part_table.5928130400886133226-242ac116-0001-012.tsv'), (707337, 46), (171024, 57), <GeneLocus.BCR: 1>)]


2022-12-28 18:17:47,729 - malid.sample_sequences - INFO - Removing Montague_7 specimen 5928130400886133226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 18830.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:47,925 - malid.sample_sequences - INFO - Removing Montague_7 specimen 5946684659604853226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 6102.0, 'IGHA': 0.0, 'IGHD-M': 0.0}


2022-12-28 18:17:48,192 - malid.sample_sequences - INFO - Removing Montague_7 specimen 5962618988273013226-242ac116-0001-012 because it did not have enough clones. Clone count by isotype: {'IGHG': 9618.0, 'IGHA': 0.0, 'IGHD-M': 0.0}




2022-12-28 18:17:49,826 - assign_clone_ids.ipynb - INFO - Participant P00024 (GeneLocus.TCR) has 160587 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00024.tsv'), (213621, 132), (168202, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:17:49,835 - assign_clone_ids.ipynb - INFO - Participant P00009 (GeneLocus.TCR) has 270898 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00009.parquet.


2022-12-28 18:17:50,406 - assign_clone_ids.ipynb - INFO - Participant P00016 (GeneLocus.TCR) has 223830 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00016.parquet.


2022-12-28 18:17:50,626 - assign_clone_ids.ipynb - INFO - Participant P00017 (GeneLocus.TCR) has 240769 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00017.parquet.


2022-12-28 18:17:55,852 - assign_clone_ids.ipynb - INFO - Participant Keck0095_MC1 (GeneLocus.TCR) has 496183 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/Keck0095_MC1.tsv'), (633620, 132), (521782, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:18:08,453 - assign_clone_ids.ipynb - INFO - Participant Keck0114_MC1 (GeneLocus.TCR) has 401413 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0114_MC1.parquet.


2022-12-28 18:18:08,453 - assign_clone_ids.ipynb - INFO - Participant Keck0113_MC1 (GeneLocus.TCR) has 449574 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0113_MC1.parquet.


2022-12-28 18:18:08,453 - assign_clone_ids.ipynb - INFO - Participant P00024 (GeneLocus.TCR) has 160387 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00024.parquet.


2022-12-28 18:18:08,453 - assign_clone_ids.ipynb - INFO - Participant Keck0093_MC1 (GeneLocus.TCR) has 454679 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0093_MC1.parquet.


2022-12-28 18:18:08,454 - assign_clone_ids.ipynb - INFO - Participant Keck0094_MC1 (GeneLocus.TCR) has 397662 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0094_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:18:15,167 - assign_clone_ids.ipynb - INFO - Participant P00015 (GeneLocus.TCR) has 267947 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00015.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:18:27,243 - assign_clone_ids.ipynb - INFO - Participant P00019 (GeneLocus.TCR) has 292980 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00019.parquet.


2022-12-28 18:18:27,243 - assign_clone_ids.ipynb - INFO - Participant P00014 (GeneLocus.TCR) has 295449 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00014.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:18:49,193 - assign_clone_ids.ipynb - INFO - Participant Keck0095_MC1 (GeneLocus.TCR) has 495489 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/Keck0095_MC1.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:19:45,847 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBVA*01': 1}




2022-12-28 18:19:51,152 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 5, 'TRBV8-2*01': 1}


2022-12-28 18:19:56,132 - assign_clone_ids.ipynb - INFO - Participant P00041 (GeneLocus.TCR) has 51073 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00041.tsv'), (73409, 132), (55636, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:19:57,236 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 26, 'TRBVA*01': 4}




2022-12-28 18:19:59,737 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 25, 'TRBVA*01': 4}


2022-12-28 18:20:01,762 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 9, 'TRBV7-5*02': 35, 'TRBVA*01': 7}


2022-12-28 18:20:03,102 - assign_clone_ids.ipynb - INFO - Participant P00041 (GeneLocus.TCR) has 51001 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00041.parquet.


2022-12-28 18:20:05,780 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 24, 'TRBVA*01': 1}


2022-12-28 18:20:07,114 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 8}






2022-12-28 18:20:08,222 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBVA*01': 1}






2022-12-28 18:20:09,048 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 22}






2022-12-28 18:20:12,913 - assign_clone_ids.ipynb - INFO - Participant P00034 (GeneLocus.TCR) has 107802 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00034.tsv'), (147659, 132), (119003, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:20:13,218 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 29, 'TRBV7-5*02': 37, 'TRBVA*01': 5}


2022-12-28 18:20:13,340 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 48, 'TRBVA*01': 4}


  df = pd.read_csv(



2022-12-28 18:20:15,239 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBVA*01': 3}




2022-12-28 18:20:20,575 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 90, 'TRBVA*01': 3}








2022-12-28 18:20:23,495 - assign_clone_ids.ipynb - INFO - Participant P00049 (GeneLocus.TCR) has 75176 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00049.tsv'), (102014, 132), (77501, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:20:25,177 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 6, 'TRBVA*01': 1}


2022-12-28 18:20:26,367 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26}


2022-12-28 18:20:27,036 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 28, 'TRBVA*01': 4}


2022-12-28 18:20:28,427 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBVA*01': 3}


2022-12-28 18:20:28,708 - assign_clone_ids.ipynb - INFO - Participant P00034 (GeneLocus.TCR) has 107637 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00034.parquet.


2022-12-28 18:20:30,549 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 19, 'TRBVA*01': 5}




2022-12-28 18:20:31,482 - assign_clone_ids.ipynb - INFO - Participant P00051 (GeneLocus.TCR) has 91870 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00051.tsv'), (116526, 132), (94499, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:20:32,026 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 13, 'TRBVA*01': 2}


2022-12-28 18:20:32,872 - assign_clone_ids.ipynb - INFO - Participant P00031 (GeneLocus.TCR) has 159713 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00031.tsv'), (220883, 132), (171278, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:20:33,062 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 42, 'TRBVA*01': 3}


2022-12-28 18:20:33,287 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 46, 'TRBV8-2*01': 2, 'TRBVA*01': 4}


2022-12-28 18:20:33,504 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 2}




2022-12-28 18:20:33,894 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 9}


2022-12-28 18:20:35,658 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 10, 'TRBVA*01': 2}


2022-12-28 18:20:36,891 - assign_clone_ids.ipynb - INFO - Participant P00030 (GeneLocus.TCR) has 167225 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00030.tsv'), (230923, 132), (174434, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:20:39,153 - assign_clone_ids.ipynb - INFO - Participant P00049 (GeneLocus.TCR) has 75023 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00049.parquet.


2022-12-28 18:20:39,197 - assign_clone_ids.ipynb - INFO - Participant P00025 (GeneLocus.TCR) has 201455 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00025.tsv'), (266985, 132), (210326, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:20:39,341 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 3}




2022-12-28 18:20:40,752 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 24, 'TRBVA*01': 2}


2022-12-28 18:20:40,797 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 8, 'TRBVA*01': 6}




2022-12-28 18:20:40,990 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBV7-5*02': 26, 'TRBVA*01': 7}


2022-12-28 18:20:41,127 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 40, 'TRBVA*01': 4}




2022-12-28 18:20:41,701 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 37, 'TRBVA*01': 6}


2022-12-28 18:20:43,358 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 17, 'TRBV7-5*02': 25, 'TRBVA*01': 3}






2022-12-28 18:20:43,777 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 12, 'TRBV7-5*02': 18, 'TRBVA*01': 4}


2022-12-28 18:20:43,871 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 46, 'TRBVA*01': 5}




  df = pd.read_csv(



2022-12-28 18:20:45,033 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBV8-2*01': 30, 'TRBVA*01': 3}


2022-12-28 18:20:45,576 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 23, 'TRBVA*01': 3}


2022-12-28 18:20:45,801 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 50, 'TRBVA*01': 3}




2022-12-28 18:20:47,096 - assign_clone_ids.ipynb - INFO - Participant P00040 (GeneLocus.TCR) has 185622 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00040.tsv'), (247830, 132), (194807, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:20:47,298 - assign_clone_ids.ipynb - INFO - Participant P00046 (GeneLocus.TCR) has 112375 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00046.tsv'), (151577, 132), (117702, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:20:47,621 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 20, 'TRBV7-5*02': 29, 'TRBVA*01': 3}






2022-12-28 18:20:50,301 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 25, 'TRBVA*01': 10}




  df = pd.read_csv(



2022-12-28 18:20:51,603 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV7-5*02': 42, 'TRBV8-1*01': 1, 'TRBVA*01': 7}


2022-12-28 18:20:51,650 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 21, 'TRBVA*01': 5}




2022-12-28 18:20:51,744 - assign_clone_ids.ipynb - INFO - Participant P00051 (GeneLocus.TCR) has 91763 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00051.parquet.




2022-12-28 18:20:54,331 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


2022-12-28 18:20:55,781 - assign_clone_ids.ipynb - INFO - Participant P00031 (GeneLocus.TCR) has 159472 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00031.parquet.






































2022-12-28 18:21:01,720 - assign_clone_ids.ipynb - INFO - Participant P00063 (GeneLocus.TCR) has 122498 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00063.tsv'), (168949, 132), (137062, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:21:06,728 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 12, 'TRBVA*01': 6}


2022-12-28 18:21:06,765 - assign_clone_ids.ipynb - INFO - Participant P00045 (GeneLocus.TCR) has 133109 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00045.tsv'), (177004, 132), (139342, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:21:06,938 - assign_clone_ids.ipynb - INFO - Participant P00064 (GeneLocus.TCR) has 105029 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00064.tsv'), (142141, 132), (109558, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:21:08,631 - assign_clone_ids.ipynb - INFO - Participant P00052 (GeneLocus.TCR) has 158295 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00052.tsv'), (213144, 132), (164102, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:21:09,158 - assign_clone_ids.ipynb - INFO - Participant P00033 (GeneLocus.TCR) has 214364 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00033.tsv'), (281756, 132), (223673, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:21:09,581 - assign_clone_ids.ipynb - INFO - Participant P00025 (GeneLocus.TCR) has 201172 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00025.parquet.


2022-12-28 18:21:09,622 - assign_clone_ids.ipynb - INFO - Participant P00046 (GeneLocus.TCR) has 112214 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00046.parquet.


2022-12-28 18:21:09,626 - assign_clone_ids.ipynb - INFO - Participant P00056 (GeneLocus.TCR) has 137759 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00056.tsv'), (191537, 132), (150123, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:10,360 - assign_clone_ids.ipynb - INFO - Participant P00030 (GeneLocus.TCR) has 166972 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00030.parquet.






  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:21:13,229 - assign_clone_ids.ipynb - INFO - Participant P00057 (GeneLocus.TCR) has 152082 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00057.tsv'), (205063, 132), (158018, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:14,621 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 23, 'TRBVA*01': 4}


  df = pd.read_csv(



2022-12-28 18:21:17,614 - assign_clone_ids.ipynb - INFO - Participant P00043 (GeneLocus.TCR) has 171474 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00043.tsv'), (223749, 132), (177899, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:18,169 - assign_clone_ids.ipynb - INFO - Participant P00029 (GeneLocus.TCR) has 247230 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00029.tsv'), (332219, 132), (258727, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:18,556 - assign_clone_ids.ipynb - INFO - Participant P00026 (GeneLocus.TCR) has 287975 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00026.tsv'), (398686, 132), (303825, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:21:19,497 - assign_clone_ids.ipynb - INFO - Participant P00040 (GeneLocus.TCR) has 185327 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00040.parquet.




2022-12-28 18:21:19,778 - assign_clone_ids.ipynb - INFO - Participant P00028 (GeneLocus.TCR) has 267607 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00028.tsv'), (367411, 132), (283548, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:22,845 - assign_clone_ids.ipynb - INFO - Participant P00064 (GeneLocus.TCR) has 104895 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00064.parquet.


2022-12-28 18:21:23,049 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 1, 'TRBV8-2*01': 1}


2022-12-28 18:21:23,166 - assign_clone_ids.ipynb - INFO - Participant P00063 (GeneLocus.TCR) has 122358 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00063.parquet.






2022-12-28 18:21:24,509 - assign_clone_ids.ipynb - INFO - Participant P00054 (GeneLocus.TCR) has 164647 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00054.tsv'), (227965, 132), (174316, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:25,470 - assign_clone_ids.ipynb - INFO - Participant P00023 (GeneLocus.TCR) has 296985 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00023.tsv'), (401200, 132), (312083, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:21:28,753 - assign_clone_ids.ipynb - INFO - Participant P00044 (GeneLocus.TCR) has 223040 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00044.tsv'), (301289, 132), (237797, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:29,509 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13}


2022-12-28 18:21:29,975 - assign_clone_ids.ipynb - INFO - Participant P00052 (GeneLocus.TCR) has 158060 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00052.parquet.


2022-12-28 18:21:31,008 - assign_clone_ids.ipynb - INFO - Participant P00042 (GeneLocus.TCR) has 171906 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00042.tsv'), (245327, 132), (183577, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:32,656 - assign_clone_ids.ipynb - INFO - Participant P00050 (GeneLocus.TCR) has 236912 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00050.tsv'), (313961, 132), (248534, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:33,180 - assign_clone_ids.ipynb - INFO - Participant P00071 (GeneLocus.TCR) has 38654 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00071.tsv'), (49874, 132), (41171, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:21:33,942 - assign_clone_ids.ipynb - INFO - Participant P00053 (GeneLocus.TCR) has 181847 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00053.tsv'), (242065, 132), (191920, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:21:34,484 - assign_clone_ids.ipynb - INFO - Participant P00048 (GeneLocus.TCR) has 170329 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00048.tsv'), (242152, 132), (179700, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:34,702 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 12, 'TRBV7-5*02': 19, 'TRBVA*01': 5}


2022-12-28 18:21:35,127 - assign_clone_ids.ipynb - INFO - Participant P00039 (GeneLocus.TCR) has 235065 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00039.tsv'), (309864, 132), (246853, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:35,581 - assign_clone_ids.ipynb - INFO - Participant P00061 (GeneLocus.TCR) has 204620 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00061.tsv'), (279312, 132), (216052, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:36,636 - assign_clone_ids.ipynb - INFO - Participant P00047 (GeneLocus.TCR) has 187099 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00047.tsv'), (252638, 132), (200887, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:37,160 - assign_clone_ids.ipynb - INFO - Participant P00059 (GeneLocus.TCR) has 240213 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00059.tsv'), (319010, 132), (252527, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:37,974 - assign_clone_ids.ipynb - INFO - Participant P00056 (GeneLocus.TCR) has 137610 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00056.parquet.


2022-12-28 18:21:38,859 - assign_clone_ids.ipynb - INFO - Participant P00045 (GeneLocus.TCR) has 132991 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00045.parquet.


2022-12-28 18:21:39,601 - assign_clone_ids.ipynb - INFO - Participant P00043 (GeneLocus.TCR) has 171217 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00043.parquet.


2022-12-28 18:21:40,220 - assign_clone_ids.ipynb - INFO - Participant P00062 (GeneLocus.TCR) has 182737 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00062.tsv'), (249677, 132), (193734, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:41,516 - assign_clone_ids.ipynb - INFO - Participant P00071 (GeneLocus.TCR) has 38586 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00071.parquet.


2022-12-28 18:21:42,950 - assign_clone_ids.ipynb - INFO - Participant P00057 (GeneLocus.TCR) has 151890 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00057.parquet.


2022-12-28 18:21:43,839 - assign_clone_ids.ipynb - INFO - Participant P00055 (GeneLocus.TCR) has 252114 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00055.tsv'), (326480, 132), (266353, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:44,533 - assign_clone_ids.ipynb - INFO - Participant P00033 (GeneLocus.TCR) has 214023 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00033.parquet.




2022-12-28 18:21:45,641 - assign_clone_ids.ipynb - INFO - Participant P00035 (GeneLocus.TCR) has 314077 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00035.tsv'), (406090, 132), (330252, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:46,799 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 1, 'TRBVA*01': 3}


2022-12-28 18:21:47,138 - assign_clone_ids.ipynb - INFO - Participant P00070 (GeneLocus.TCR) has 85599 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00070.tsv'), (109623, 132), (89959, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:21:50,207 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 39, 'TRBVA*01': 2}


2022-12-28 18:21:52,075 - assign_clone_ids.ipynb - INFO - Participant P00038 (GeneLocus.TCR) has 278221 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00038.tsv'), (358006, 132), (299268, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:21:53,304 - assign_clone_ids.ipynb - INFO - Participant P00065 (GeneLocus.TCR) has 148779 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00065.tsv'), (202559, 132), (156259, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:22:00,735 - assign_clone_ids.ipynb - INFO - Participant P00070 (GeneLocus.TCR) has 85487 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00070.parquet.


2022-12-28 18:22:00,931 - assign_clone_ids.ipynb - INFO - Participant P00032 (GeneLocus.TCR) has 299938 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00032.tsv'), (403311, 132), (314798, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:01,705 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV8-2*01': 1}


  df = pd.read_csv(



2022-12-28 18:22:03,880 - assign_clone_ids.ipynb - INFO - Participant P00058 (GeneLocus.TCR) has 305373 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00058.tsv'), (412392, 132), (321730, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:03,966 - assign_clone_ids.ipynb - INFO - Participant P00036 (GeneLocus.TCR) has 242482 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00036.tsv'), (311294, 132), (252633, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:04,303 - assign_clone_ids.ipynb - INFO - Participant P00023 (GeneLocus.TCR) has 296463 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00023.parquet.




  df = pd.read_csv(



2022-12-28 18:22:07,346 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 5}


2022-12-28 18:22:07,617 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBVA*01': 1}


2022-12-28 18:22:12,530 - assign_clone_ids.ipynb - INFO - Participant P00027 (GeneLocus.TCR) has 278715 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00027.tsv'), (388474, 132), (297510, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:13,061 - assign_clone_ids.ipynb - INFO - Participant P00037 (GeneLocus.TCR) has 237882 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00037.tsv'), (330239, 132), (250631, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:14,789 - assign_clone_ids.ipynb - INFO - Participant P00076 (GeneLocus.TCR) has 53824 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00076.tsv'), (72716, 132), (56350, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:22:20,173 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 30}


  df = pd.read_csv(



2022-12-28 18:22:24,110 - assign_clone_ids.ipynb - INFO - Participant P00076 (GeneLocus.TCR) has 53744 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00076.parquet.


  df = pd.read_csv(



2022-12-28 18:22:30,732 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 7}






2022-12-28 18:22:36,465 - assign_clone_ids.ipynb - INFO - Participant P00066 (GeneLocus.TCR) has 162878 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00066.tsv'), (213017, 132), (169559, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:36,744 - assign_clone_ids.ipynb - INFO - Participant P00029 (GeneLocus.TCR) has 246839 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00029.parquet.


2022-12-28 18:22:37,075 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 5, 'TRBVA*01': 2}


2022-12-28 18:22:37,212 - assign_clone_ids.ipynb - INFO - Participant P00054 (GeneLocus.TCR) has 164469 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00054.parquet.


2022-12-28 18:22:37,621 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 16, 'TRBV8-2*01': 1, 'TRBVA*01': 1}














2022-12-28 18:22:50,752 - assign_clone_ids.ipynb - INFO - Participant P00085 (GeneLocus.TCR) has 46647 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00085.tsv'), (61254, 132), (48114, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:22:54,163 - assign_clone_ids.ipynb - INFO - Participant P00068 (GeneLocus.TCR) has 154229 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00068.tsv'), (215337, 132), (164768, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:22:55,451 - assign_clone_ids.ipynb - INFO - Participant P00074 (GeneLocus.TCR) has 162975 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00074.tsv'), (222297, 132), (170701, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:22:58,763 - assign_clone_ids.ipynb - INFO - Participant P00085 (GeneLocus.TCR) has 46550 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00085.parquet.


2022-12-28 18:22:59,131 - assign_clone_ids.ipynb - INFO - Participant P00060 (GeneLocus.TCR) has 257911 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00060.tsv'), (339097, 132), (268093, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:23:01,269 - assign_clone_ids.ipynb - INFO - Participant P00044 (GeneLocus.TCR) has 222825 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00044.parquet.


2022-12-28 18:23:02,831 - assign_clone_ids.ipynb - INFO - Participant P00042 (GeneLocus.TCR) has 171666 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00042.parquet.


2022-12-28 18:23:04,157 - assign_clone_ids.ipynb - INFO - Participant P00073 (GeneLocus.TCR) has 96084 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00073.tsv'), (130849, 132), (99882, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:23:04,503 - assign_clone_ids.ipynb - INFO - Participant P00084 (GeneLocus.TCR) has 117263 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00084.tsv'), (159569, 132), (123408, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:23:04,819 - assign_clone_ids.ipynb - INFO - Participant P00026 (GeneLocus.TCR) has 287452 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00026.parquet.


2022-12-28 18:23:06,357 - assign_clone_ids.ipynb - INFO - Participant P00048 (GeneLocus.TCR) has 170082 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00048.parquet.


2022-12-28 18:23:06,999 - assign_clone_ids.ipynb - INFO - Participant P00053 (GeneLocus.TCR) has 181596 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00053.parquet.


2022-12-28 18:23:07,395 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 1}


2022-12-28 18:23:07,751 - assign_clone_ids.ipynb - INFO - Participant P00050 (GeneLocus.TCR) has 236618 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00050.parquet.


2022-12-28 18:23:08,163 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27}


2022-12-28 18:23:08,702 - assign_clone_ids.ipynb - INFO - Participant P00059 (GeneLocus.TCR) has 239923 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00059.parquet.


2022-12-28 18:23:08,753 - assign_clone_ids.ipynb - INFO - Participant P00039 (GeneLocus.TCR) has 234752 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00039.parquet.


2022-12-28 18:23:08,981 - assign_clone_ids.ipynb - INFO - Participant P00065 (GeneLocus.TCR) has 148528 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00065.parquet.


2022-12-28 18:23:09,016 - assign_clone_ids.ipynb - INFO - Participant P00062 (GeneLocus.TCR) has 182444 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00062.parquet.


2022-12-28 18:23:09,337 - assign_clone_ids.ipynb - INFO - Participant P00061 (GeneLocus.TCR) has 204237 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00061.parquet.


2022-12-28 18:23:09,467 - assign_clone_ids.ipynb - INFO - Participant P00055 (GeneLocus.TCR) has 251805 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00055.parquet.


2022-12-28 18:23:09,501 - assign_clone_ids.ipynb - INFO - Participant P00028 (GeneLocus.TCR) has 267291 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00028.parquet.


2022-12-28 18:23:10,007 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 3, 'TRBV7-5*01': 32, 'TRBVA*01': 2}


2022-12-28 18:23:10,780 - assign_clone_ids.ipynb - INFO - Participant P00047 (GeneLocus.TCR) has 186804 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00047.parquet.






2022-12-28 18:23:12,093 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 34, 'TRBV8-2*01': 1, 'TRBVA*01': 3}


2022-12-28 18:23:13,165 - assign_clone_ids.ipynb - INFO - Participant P00035 (GeneLocus.TCR) has 313705 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00035.parquet.


2022-12-28 18:23:17,931 - assign_clone_ids.ipynb - INFO - Participant P00077 (GeneLocus.TCR) has 149671 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00077.tsv'), (199752, 132), (155699, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:23:18,197 - assign_clone_ids.ipynb - INFO - Participant P00068 (GeneLocus.TCR) has 153995 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00068.parquet.


2022-12-28 18:23:18,666 - assign_clone_ids.ipynb - INFO - Participant P00084 (GeneLocus.TCR) has 117072 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00084.parquet.


  df = pd.read_csv(



2022-12-28 18:23:19,717 - assign_clone_ids.ipynb - INFO - Participant P00072 (GeneLocus.TCR) has 139349 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00072.tsv'), (180080, 132), (145274, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:23:22,140 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22/OR9-2*01': 1, 'TRBV7-5*01': 27, 'TRBV7-5*02': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 2}




2022-12-28 18:23:25,212 - assign_clone_ids.ipynb - INFO - Participant P00074 (GeneLocus.TCR) has 162755 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00074.parquet.


2022-12-28 18:23:25,212 - assign_clone_ids.ipynb - INFO - Participant P00066 (GeneLocus.TCR) has 162606 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00066.parquet.


2022-12-28 18:23:25,216 - assign_clone_ids.ipynb - INFO - Participant P00073 (GeneLocus.TCR) has 95965 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00073.parquet.


2022-12-28 18:23:26,169 - assign_clone_ids.ipynb - INFO - Participant P00038 (GeneLocus.TCR) has 277773 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00038.parquet.


2022-12-28 18:23:34,314 - assign_clone_ids.ipynb - INFO - Participant P00067 (GeneLocus.TCR) has 240668 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00067.tsv'), (320009, 132), (254756, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:23:36,665 - assign_clone_ids.ipynb - INFO - Participant P00069 (GeneLocus.TCR) has 192961 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00069.tsv'), (254495, 132), (201088, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:23:39,043 - assign_clone_ids.ipynb - INFO - Participant P00058 (GeneLocus.TCR) has 304889 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00058.parquet.


2022-12-28 18:23:39,046 - assign_clone_ids.ipynb - INFO - Participant P00037 (GeneLocus.TCR) has 237365 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00037.parquet.


2022-12-28 18:23:39,047 - assign_clone_ids.ipynb - INFO - Participant P00036 (GeneLocus.TCR) has 242147 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00036.parquet.


2022-12-28 18:23:39,047 - assign_clone_ids.ipynb - INFO - Participant P00060 (GeneLocus.TCR) has 257559 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00060.parquet.


2022-12-28 18:23:39,048 - assign_clone_ids.ipynb - INFO - Participant P00027 (GeneLocus.TCR) has 278180 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00027.parquet.


2022-12-28 18:23:39,044 - assign_clone_ids.ipynb - INFO - Participant P00072 (GeneLocus.TCR) has 139144 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00072.parquet.


2022-12-28 18:23:39,047 - assign_clone_ids.ipynb - INFO - Participant P00077 (GeneLocus.TCR) has 149476 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00077.parquet.


2022-12-28 18:23:39,379 - assign_clone_ids.ipynb - INFO - Participant P00032 (GeneLocus.TCR) has 299533 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00032.parquet.


2022-12-28 18:23:39,678 - assign_clone_ids.ipynb - INFO - Participant P00078 (GeneLocus.TCR) has 165834 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00078.tsv'), (217012, 132), (173340, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:23:40,553 - assign_clone_ids.ipynb - INFO - Participant P00083 (GeneLocus.TCR) has 177094 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00083.tsv'), (230106, 132), (185543, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:23:45,226 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 14, 'TRBVA*01': 1}


2022-12-28 18:23:47,748 - assign_clone_ids.ipynb - INFO - Participant P00075 (GeneLocus.TCR) has 187961 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00075.tsv'), (256397, 132), (199154, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:23:48,790 - assign_clone_ids.ipynb - INFO - Participant P00079 (GeneLocus.TCR) has 200467 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00079.tsv'), (270492, 132), (209216, 144), <GeneLocus.TCR: 2>)]






  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:24:00,644 - assign_clone_ids.ipynb - INFO - Participant P00069 (GeneLocus.TCR) has 192746 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00069.parquet.


2022-12-28 18:24:00,644 - assign_clone_ids.ipynb - INFO - Participant P00078 (GeneLocus.TCR) has 165663 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00078.parquet.


2022-12-28 18:24:00,645 - assign_clone_ids.ipynb - INFO - Participant P00083 (GeneLocus.TCR) has 176861 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00083.parquet.


2022-12-28 18:24:01,852 - assign_clone_ids.ipynb - INFO - Participant P00067 (GeneLocus.TCR) has 240311 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00067.parquet.


  df = pd.read_csv(



2022-12-28 18:24:07,380 - assign_clone_ids.ipynb - INFO - Participant P00080 (GeneLocus.TCR) has 262985 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00080.tsv'), (358809, 132), (275844, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:24:10,339 - assign_clone_ids.ipynb - INFO - Participant P00087 (GeneLocus.TCR) has 149479 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00087.tsv'), (198500, 132), (156483, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:24:25,771 - assign_clone_ids.ipynb - INFO - Participant P00079 (GeneLocus.TCR) has 200172 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00079.parquet.


2022-12-28 18:24:25,772 - assign_clone_ids.ipynb - INFO - Participant P00075 (GeneLocus.TCR) has 187728 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00075.parquet.


2022-12-28 18:24:28,993 - assign_clone_ids.ipynb - INFO - Participant P00087 (GeneLocus.TCR) has 149303 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00087.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:24:38,142 - assign_clone_ids.ipynb - INFO - Participant P00080 (GeneLocus.TCR) has 262619 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00080.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:26:40,567 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 3}




2022-12-28 18:26:43,346 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 2, 'TRBVA*01': 1}


2022-12-28 18:26:43,851 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 33, 'TRBVA*01': 3}




2022-12-28 18:26:47,103 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 8, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 18:26:47,577 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 7}


2022-12-28 18:26:47,680 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16}


2022-12-28 18:26:47,842 - assign_clone_ids.ipynb - INFO - Participant P00120 (GeneLocus.TCR) has 40116 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00120.tsv'), (48161, 132), (40588, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:26:51,344 - assign_clone_ids.ipynb - INFO - Participant P00105 (GeneLocus.TCR) has 38054 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00105.tsv'), (52986, 132), (39510, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:26:52,768 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 5, 'TRBV7-5*02': 7, 'TRBVA*01': 6}


2022-12-28 18:26:52,788 - assign_clone_ids.ipynb - INFO - Participant P00120 (GeneLocus.TCR) has 40022 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00120.parquet.






2022-12-28 18:26:55,716 - assign_clone_ids.ipynb - INFO - Participant P00105 (GeneLocus.TCR) has 37995 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00105.parquet.


2022-12-28 18:26:56,475 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBVA*01': 2}




2022-12-28 18:26:57,625 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 1}


2022-12-28 18:26:57,733 - assign_clone_ids.ipynb - INFO - Participant P00117 (GeneLocus.TCR) has 56161 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00117.tsv'), (75719, 132), (58910, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:26:59,155 - assign_clone_ids.ipynb - INFO - Participant P00109 (GeneLocus.TCR) has 62112 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00109.tsv'), (77033, 132), (63573, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:27:00,843 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBVA*01': 3}






2022-12-28 18:27:02,304 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 18:27:04,028 - assign_clone_ids.ipynb - INFO - Participant P00117 (GeneLocus.TCR) has 56104 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00117.parquet.




2022-12-28 18:27:06,930 - assign_clone_ids.ipynb - INFO - Participant P00109 (GeneLocus.TCR) has 62031 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00109.parquet.


2022-12-28 18:27:07,072 - assign_clone_ids.ipynb - INFO - Participant P00097 (GeneLocus.TCR) has 78638 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00097.tsv'), (107906, 132), (83149, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:08,402 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 34, 'TRBVA*01': 6}




2022-12-28 18:27:13,437 - assign_clone_ids.ipynb - INFO - Participant P00099 (GeneLocus.TCR) has 93741 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00099.tsv'), (124432, 132), (97061, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:14,115 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 13, 'TRBVA*01': 3}


2022-12-28 18:27:14,231 - assign_clone_ids.ipynb - INFO - Participant P00101 (GeneLocus.TCR) has 103852 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00101.tsv'), (130946, 132), (106009, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:27:16,086 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 1}


2022-12-28 18:27:16,205 - assign_clone_ids.ipynb - INFO - Participant P00097 (GeneLocus.TCR) has 78477 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00097.parquet.


2022-12-28 18:27:17,120 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 8}


  df = pd.read_csv(



2022-12-28 18:27:19,528 - assign_clone_ids.ipynb - INFO - Participant P00121 (GeneLocus.TCR) has 106551 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00121.tsv'), (143394, 132), (110669, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(







2022-12-28 18:27:22,140 - assign_clone_ids.ipynb - INFO - Participant P00081 (GeneLocus.TCR) has 189959 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00081.tsv'), (254644, 132), (199992, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:27:23,593 - assign_clone_ids.ipynb - INFO - Participant P00099 (GeneLocus.TCR) has 93648 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00099.parquet.


2022-12-28 18:27:25,151 - assign_clone_ids.ipynb - INFO - Participant P00123 (GeneLocus.TCR) has 113579 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00123.tsv'), (147045, 132), (117249, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:27:25,823 - assign_clone_ids.ipynb - INFO - Participant P00101 (GeneLocus.TCR) has 103628 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00101.parquet.


2022-12-28 18:27:27,002 - assign_clone_ids.ipynb - INFO - Participant P00086 (GeneLocus.TCR) has 231170 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00086.tsv'), (284073, 132), (241349, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:28,027 - assign_clone_ids.ipynb - INFO - Participant P00114 (GeneLocus.TCR) has 115824 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00114.tsv'), (163151, 132), (120836, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:30,924 - assign_clone_ids.ipynb - INFO - Participant P00121 (GeneLocus.TCR) has 106369 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00121.parquet.


  df = pd.read_csv(



2022-12-28 18:27:34,604 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 12, 'TRBV8-2*01': 1, 'TRBVA*01': 4}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:27:37,792 - assign_clone_ids.ipynb - INFO - Participant P00123 (GeneLocus.TCR) has 113424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00123.parquet.


2022-12-28 18:27:38,847 - assign_clone_ids.ipynb - INFO - Participant P00096 (GeneLocus.TCR) has 134817 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00096.tsv'), (177542, 132), (139109, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:40,141 - assign_clone_ids.ipynb - INFO - Participant P00100 (GeneLocus.TCR) has 136356 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00100.tsv'), (180320, 132), (140647, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:27:40,574 - assign_clone_ids.ipynb - INFO - Participant P00114 (GeneLocus.TCR) has 115665 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00114.parquet.






2022-12-28 18:27:44,173 - assign_clone_ids.ipynb - INFO - Participant P00081 (GeneLocus.TCR) has 189704 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00081.parquet.


2022-12-28 18:27:44,900 - assign_clone_ids.ipynb - INFO - Participant P00111 (GeneLocus.TCR) has 150223 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00111.tsv'), (188904, 132), (156432, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:27:45,909 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 16, 'TRBV7-5*02': 17}


  df = pd.read_csv(



2022-12-28 18:27:47,054 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 10, 'TRBVA*01': 7}


2022-12-28 18:27:48,012 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 11, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 18:27:52,158 - assign_clone_ids.ipynb - INFO - Participant P00086 (GeneLocus.TCR) has 230793 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00086.parquet.


2022-12-28 18:27:55,474 - assign_clone_ids.ipynb - INFO - Participant P00096 (GeneLocus.TCR) has 134602 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00096.parquet.






2022-12-28 18:27:56,220 - assign_clone_ids.ipynb - INFO - Participant P00100 (GeneLocus.TCR) has 136148 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00100.parquet.




2022-12-28 18:27:58,137 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 39, 'TRBVA*01': 1}


2022-12-28 18:27:58,325 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 18, 'TRBVA*01': 2}




2022-12-28 18:27:59,546 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 22, 'TRBVA*01': 3}


2022-12-28 18:28:01,831 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBVA*01': 4}


2022-12-28 18:28:03,904 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 19, 'TRBV8-1*01': 1, 'TRBVA*01': 1}


2022-12-28 18:28:04,706 - assign_clone_ids.ipynb - INFO - Participant P00111 (GeneLocus.TCR) has 150034 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00111.parquet.


2022-12-28 18:28:04,849 - assign_clone_ids.ipynb - INFO - Participant P00118 (GeneLocus.TCR) has 166161 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00118.tsv'), (222089, 132), (172445, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:04,957 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 26, 'TRBVA*01': 2}


2022-12-28 18:28:05,341 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 7}


  df = pd.read_csv(



  df = pd.read_csv(











2022-12-28 18:28:09,677 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 36, 'TRBVA*01': 1}


2022-12-28 18:28:10,745 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBV7-5*02': 1, 'TRBVA*01': 11}


2022-12-28 18:28:11,002 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBV7-5*02': 37, 'TRBVA*01': 3}








  df = pd.read_csv(







  df = pd.read_csv(



2022-12-28 18:28:18,914 - assign_clone_ids.ipynb - INFO - Participant P00122 (GeneLocus.TCR) has 173594 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00122.tsv'), (229960, 132), (183826, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:28:20,357 - assign_clone_ids.ipynb - INFO - Participant P00107 (GeneLocus.TCR) has 198305 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00107.tsv'), (250491, 132), (206189, 144), <GeneLocus.TCR: 2>)]














  df = pd.read_csv(



2022-12-28 18:28:24,054 - assign_clone_ids.ipynb - INFO - Participant P00118 (GeneLocus.TCR) has 165915 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00118.parquet.


2022-12-28 18:28:24,050 - assign_clone_ids.ipynb - INFO - Participant P00090 (GeneLocus.TCR) has 181075 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00090.tsv'), (232276, 132), (191375, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:26,343 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 54, 'TRBVA*01': 4}


2022-12-28 18:28:34,015 - assign_clone_ids.ipynb - INFO - Participant P00116 (GeneLocus.TCR) has 189296 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00116.tsv'), (253871, 132), (198058, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:34,148 - assign_clone_ids.ipynb - INFO - Participant P00113 (GeneLocus.TCR) has 190416 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00113.tsv'), (255760, 132), (198311, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:34,269 - assign_clone_ids.ipynb - INFO - Participant P00088 (GeneLocus.TCR) has 193943 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00088.tsv'), (253556, 132), (200635, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:39,820 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 42, 'TRBV7-5*02': 1, 'TRBVA*01': 8}


  df = pd.read_csv(





2022-12-28 18:28:50,254 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 22, 'TRBVA*01': 6}


2022-12-28 18:28:51,097 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV8-2*01': 2, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 18:28:56,609 - assign_clone_ids.ipynb - INFO - Participant P00095 (GeneLocus.TCR) has 202347 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00095.tsv'), (262504, 132), (212372, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:28:56,720 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 2}






2022-12-28 18:29:01,077 - assign_clone_ids.ipynb - INFO - Participant P00103 (GeneLocus.TCR) has 203275 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00103.tsv'), (283026, 132), (214301, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:01,261 - assign_clone_ids.ipynb - INFO - Participant P00091 (GeneLocus.TCR) has 202780 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00091.tsv'), (267369, 132), (210540, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:29:07,389 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 24, 'TRBVA*01': 6}






2022-12-28 18:29:24,797 - assign_clone_ids.ipynb - INFO - Participant P00131 (GeneLocus.TCR) has 116503 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00131.tsv'), (162552, 132), (126326, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:28,536 - assign_clone_ids.ipynb - INFO - Participant P00132 (GeneLocus.TCR) has 128282 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00132.tsv'), (166084, 132), (135723, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:33,367 - assign_clone_ids.ipynb - INFO - Participant P00137 (GeneLocus.TCR) has 116357 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00137.tsv'), (150004, 132), (124192, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:40,671 - assign_clone_ids.ipynb - INFO - Participant P00119 (GeneLocus.TCR) has 215838 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00119.tsv'), (283487, 132), (228792, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:45,268 - assign_clone_ids.ipynb - INFO - Participant P00108 (GeneLocus.TCR) has 203153 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00108.tsv'), (272008, 132), (212981, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:29:45,496 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 1, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 18:29:51,064 - assign_clone_ids.ipynb - INFO - Participant P00125 (GeneLocus.TCR) has 211586 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00125.tsv'), (287408, 132), (221302, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:30:02,657 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 16, 'TRBV7-5*02': 30, 'TRBVA*01': 2}


2022-12-28 18:30:03,620 - assign_clone_ids.ipynb - INFO - Participant P00122 (GeneLocus.TCR) has 173240 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00122.parquet.


2022-12-28 18:30:04,005 - assign_clone_ids.ipynb - INFO - Participant P00127 (GeneLocus.TCR) has 204205 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00127.tsv'), (270005, 132), (216671, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:30:10,138 - assign_clone_ids.ipynb - INFO - Participant P00102 (GeneLocus.TCR) has 208283 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00102.tsv'), (287818, 132), (220183, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:30:10,201 - assign_clone_ids.ipynb - INFO - Participant P00090 (GeneLocus.TCR) has 180840 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00090.parquet.


2022-12-28 18:30:10,292 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9}


2022-12-28 18:30:10,450 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBVA*01': 2}


2022-12-28 18:30:10,904 - assign_clone_ids.ipynb - INFO - Participant P00107 (GeneLocus.TCR) has 198042 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00107.parquet.


2022-12-28 18:30:11,816 - assign_clone_ids.ipynb - INFO - Participant P00116 (GeneLocus.TCR) has 189030 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00116.parquet.


2022-12-28 18:30:13,146 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 23, 'TRBV8-2*01': 1, 'TRBVA*01': 4}


2022-12-28 18:30:13,900 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 5}


2022-12-28 18:30:14,503 - assign_clone_ids.ipynb - INFO - Participant P00131 (GeneLocus.TCR) has 116319 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00131.parquet.


2022-12-28 18:30:14,900 - assign_clone_ids.ipynb - INFO - Participant P00088 (GeneLocus.TCR) has 193646 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00088.parquet.


2022-12-28 18:30:15,498 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 22, 'TRBVA*01': 8}


2022-12-28 18:30:15,763 - assign_clone_ids.ipynb - INFO - Participant P00113 (GeneLocus.TCR) has 190225 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00113.parquet.


2022-12-28 18:30:15,880 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 32, 'TRBVA*01': 2}


2022-12-28 18:30:16,989 - assign_clone_ids.ipynb - INFO - Participant P00139 (GeneLocus.TCR) has 71547 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00139.tsv'), (96486, 132), (74437, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:30:17,443 - assign_clone_ids.ipynb - INFO - Participant P00132 (GeneLocus.TCR) has 128167 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00132.parquet.


2022-12-28 18:30:17,828 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 41, 'TRBVA*01': 2}


2022-12-28 18:30:17,894 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 27, 'TRBVA*01': 5}


2022-12-28 18:30:18,645 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*02': 32, 'TRBVA*01': 3}


2022-12-28 18:30:19,655 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 45, 'TRBVA*01': 2}




2022-12-28 18:30:20,477 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 37, 'TRBVA*01': 5}


2022-12-28 18:30:21,231 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 5}




2022-12-28 18:30:21,627 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 30}


2022-12-28 18:30:22,092 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 7}


2022-12-28 18:30:22,826 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 49, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 5}


2022-12-28 18:30:23,384 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 17, 'TRBV7-5*02': 33, 'TRBVA*01': 1}


2022-12-28 18:30:24,087 - assign_clone_ids.ipynb - INFO - Participant P00137 (GeneLocus.TCR) has 116244 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00137.parquet.




  df = pd.read_csv(







2022-12-28 18:30:26,744 - assign_clone_ids.ipynb - INFO - Participant P00091 (GeneLocus.TCR) has 202457 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00091.parquet.


2022-12-28 18:30:27,229 - assign_clone_ids.ipynb - INFO - Participant P00139 (GeneLocus.TCR) has 71453 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00139.parquet.


2022-12-28 18:30:27,363 - assign_clone_ids.ipynb - INFO - Participant P00095 (GeneLocus.TCR) has 202114 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00095.parquet.


2022-12-28 18:30:27,830 - assign_clone_ids.ipynb - INFO - Participant P00103 (GeneLocus.TCR) has 203009 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00103.parquet.








2022-12-28 18:30:29,248 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 42, 'TRBVA*01': 7}


2022-12-28 18:30:29,956 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 38, 'TRBV7-5*02': 42, 'TRBV8-1*01': 1, 'TRBVA*01': 8}


2022-12-28 18:30:30,763 - assign_clone_ids.ipynb - INFO - Participant P00115 (GeneLocus.TCR) has 237350 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00115.tsv'), (314299, 132), (248701, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:30:31,696 - assign_clone_ids.ipynb - INFO - Participant P00110 (GeneLocus.TCR) has 246944 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00110.tsv'), (331957, 132), (260005, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:30:33,312 - assign_clone_ids.ipynb - INFO - Participant P00119 (GeneLocus.TCR) has 215574 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00119.parquet.




2022-12-28 18:30:33,656 - assign_clone_ids.ipynb - INFO - Participant P00125 (GeneLocus.TCR) has 211294 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00125.parquet.


  df = pd.read_csv(









2022-12-28 18:30:36,969 - assign_clone_ids.ipynb - INFO - Participant P00133 (GeneLocus.TCR) has 113286 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00133.tsv'), (151580, 132), (119402, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:30:37,367 - assign_clone_ids.ipynb - INFO - Participant P00108 (GeneLocus.TCR) has 202900 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00108.parquet.




2022-12-28 18:30:37,605 - assign_clone_ids.ipynb - INFO - Participant P00127 (GeneLocus.TCR) has 203852 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00127.parquet.














2022-12-28 18:30:38,633 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 40, 'TRBVA*01': 10}












2022-12-28 18:30:41,894 - assign_clone_ids.ipynb - INFO - Participant P00130 (GeneLocus.TCR) has 136435 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00130.tsv'), (192065, 132), (142050, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:30:43,284 - assign_clone_ids.ipynb - INFO - Participant P00102 (GeneLocus.TCR) has 207991 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00102.parquet.




  df = pd.read_csv(





2022-12-28 18:30:51,072 - assign_clone_ids.ipynb - INFO - Participant P00133 (GeneLocus.TCR) has 113148 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00133.parquet.






2022-12-28 18:30:53,447 - assign_clone_ids.ipynb - INFO - Participant P00129 (GeneLocus.TCR) has 189269 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00129.tsv'), (248590, 132), (195903, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:30:56,734 - assign_clone_ids.ipynb - INFO - Participant P00138 (GeneLocus.TCR) has 196927 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00138.tsv'), (251546, 132), (203518, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:30:57,076 - assign_clone_ids.ipynb - INFO - Participant P00135 (GeneLocus.TCR) has 214904 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00135.tsv'), (271738, 132), (225177, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:30:58,369 - assign_clone_ids.ipynb - INFO - Participant P00130 (GeneLocus.TCR) has 136240 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00130.parquet.


2022-12-28 18:30:59,191 - assign_clone_ids.ipynb - INFO - Participant P00115 (GeneLocus.TCR) has 237048 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00115.parquet.


2022-12-28 18:31:01,756 - assign_clone_ids.ipynb - INFO - Participant P00089 (GeneLocus.TCR) has 264088 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00089.tsv'), (347223, 132), (276965, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:04,037 - assign_clone_ids.ipynb - INFO - Participant P00134 (GeneLocus.TCR) has 213218 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00134.tsv'), (281901, 132), (222286, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:04,946 - assign_clone_ids.ipynb - INFO - Participant P00110 (GeneLocus.TCR) has 246474 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00110.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:31:10,769 - assign_clone_ids.ipynb - INFO - Participant P00094 (GeneLocus.TCR) has 277911 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00094.tsv'), (351654, 132), (289443, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:11,633 - assign_clone_ids.ipynb - INFO - Participant P00141 (GeneLocus.TCR) has 241931 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00141.tsv'), (313889, 132), (254927, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:13,844 - assign_clone_ids.ipynb - INFO - Participant P00128 (GeneLocus.TCR) has 280955 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00128.tsv'), (386061, 132), (295533, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:31:15,762 - assign_clone_ids.ipynb - INFO - Participant P00129 (GeneLocus.TCR) has 188995 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00129.parquet.


2022-12-28 18:31:15,926 - assign_clone_ids.ipynb - INFO - Participant P00092 (GeneLocus.TCR) has 294415 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00092.tsv'), (382680, 132), (307844, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:16,396 - assign_clone_ids.ipynb - INFO - Participant P00098 (GeneLocus.TCR) has 264524 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00098.tsv'), (365939, 132), (279754, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:19,006 - assign_clone_ids.ipynb - INFO - Participant P00136 (GeneLocus.TCR) has 284814 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00136.tsv'), (378990, 132), (298618, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:19,870 - assign_clone_ids.ipynb - INFO - Participant P00104 (GeneLocus.TCR) has 289362 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00104.tsv'), (404784, 132), (305988, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:31:21,473 - assign_clone_ids.ipynb - INFO - Participant P00126 (GeneLocus.TCR) has 289023 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00126.tsv'), (385312, 132), (303031, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:21,687 - assign_clone_ids.ipynb - INFO - Participant P00138 (GeneLocus.TCR) has 196643 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00138.parquet.


2022-12-28 18:31:23,470 - assign_clone_ids.ipynb - INFO - Participant P00135 (GeneLocus.TCR) has 214620 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00135.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:31:29,485 - assign_clone_ids.ipynb - INFO - Participant P00124 (GeneLocus.TCR) has 282755 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00124.tsv'), (382088, 132), (297984, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:30,828 - assign_clone_ids.ipynb - INFO - Participant P00112 (GeneLocus.TCR) has 326199 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00112.tsv'), (424476, 132), (338930, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:31:31,236 - assign_clone_ids.ipynb - INFO - Participant P00134 (GeneLocus.TCR) has 212766 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00134.parquet.


  df = pd.read_csv(



2022-12-28 18:31:33,017 - assign_clone_ids.ipynb - INFO - Participant P00089 (GeneLocus.TCR) has 263718 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00089.parquet.


2022-12-28 18:31:37,572 - assign_clone_ids.ipynb - INFO - Participant P00140 (GeneLocus.TCR) has 287543 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00140.tsv'), (383684, 132), (300456, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:31:59,488 - assign_clone_ids.ipynb - INFO - Participant P00141 (GeneLocus.TCR) has 241589 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00141.parquet.


  df = pd.read_csv(



2022-12-28 18:32:03,190 - assign_clone_ids.ipynb - INFO - Participant P00106 (GeneLocus.TCR) has 411366 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00106.tsv'), (577153, 132), (439932, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:32:04,682 - assign_clone_ids.ipynb - INFO - Participant P00094 (GeneLocus.TCR) has 277474 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00094.parquet.


2022-12-28 18:32:04,889 - assign_clone_ids.ipynb - INFO - Participant P00093 (GeneLocus.TCR) has 410432 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00093.tsv'), (549599, 132), (433623, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:32:15,608 - assign_clone_ids.ipynb - INFO - Participant P00098 (GeneLocus.TCR) has 264215 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00098.parquet.


2022-12-28 18:32:16,178 - assign_clone_ids.ipynb - INFO - Participant P00128 (GeneLocus.TCR) has 280486 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00128.parquet.


2022-12-28 18:32:17,125 - assign_clone_ids.ipynb - INFO - Participant P00136 (GeneLocus.TCR) has 284424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00136.parquet.


2022-12-28 18:32:17,163 - assign_clone_ids.ipynb - INFO - Participant P00092 (GeneLocus.TCR) has 293754 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00092.parquet.


2022-12-28 18:32:17,296 - assign_clone_ids.ipynb - INFO - Participant P00104 (GeneLocus.TCR) has 288968 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00104.parquet.


2022-12-28 18:32:17,314 - assign_clone_ids.ipynb - INFO - Participant P00124 (GeneLocus.TCR) has 282214 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00124.parquet.


2022-12-28 18:32:17,635 - assign_clone_ids.ipynb - INFO - Participant P00126 (GeneLocus.TCR) has 288424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00126.parquet.


  df = pd.read_csv(



2022-12-28 18:32:18,622 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6}




  df = pd.read_csv(



2022-12-28 18:32:29,079 - assign_clone_ids.ipynb - INFO - Participant P00112 (GeneLocus.TCR) has 325775 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00112.parquet.


2022-12-28 18:32:31,270 - assign_clone_ids.ipynb - INFO - Participant P00140 (GeneLocus.TCR) has 287097 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00140.parquet.


2022-12-28 18:32:32,697 - assign_clone_ids.ipynb - INFO - Participant P00148 (GeneLocus.TCR) has 73329 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00148.tsv'), (99025, 132), (79257, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:32:41,210 - assign_clone_ids.ipynb - INFO - Participant P00148 (GeneLocus.TCR) has 73226 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00148.parquet.


  df = pd.read_csv(



2022-12-28 18:32:55,233 - assign_clone_ids.ipynb - INFO - Participant P00093 (GeneLocus.TCR) has 409643 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00093.parquet.


  df = pd.read_csv(



2022-12-28 18:32:55,571 - assign_clone_ids.ipynb - INFO - Participant P00106 (GeneLocus.TCR) has 410715 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00106.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:33:39,028 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 13}


  df = pd.read_csv(





2022-12-28 18:33:44,175 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBVA*01': 1}




2022-12-28 18:33:46,274 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1}






2022-12-28 18:33:51,351 - assign_clone_ids.ipynb - INFO - Participant P00177 (GeneLocus.TCR) has 27219 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00177.tsv'), (32611, 132), (27467, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:33:54,309 - assign_clone_ids.ipynb - INFO - Participant P00177 (GeneLocus.TCR) has 27164 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00177.parquet.


2022-12-28 18:33:56,846 - assign_clone_ids.ipynb - INFO - Participant P00147 (GeneLocus.TCR) has 103667 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00147.tsv'), (151008, 132), (108515, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:33:58,971 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5}


2022-12-28 18:33:59,011 - assign_clone_ids.ipynb - INFO - Participant P00157 (GeneLocus.TCR) has 86346 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00157.tsv'), (122473, 132), (95196, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:03,107 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 12}


2022-12-28 18:34:03,216 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBVA*01': 5}


2022-12-28 18:34:04,068 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 51, 'TRBVA*01': 2}


2022-12-28 18:34:04,098 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 8, 'TRBVA*01': 3}


2022-12-28 18:34:04,745 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 15, 'TRBVA*01': 1}




2022-12-28 18:34:05,068 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 2}


2022-12-28 18:34:06,182 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 12, 'TRBVA*01': 2}


2022-12-28 18:34:08,524 - assign_clone_ids.ipynb - INFO - Participant P00147 (GeneLocus.TCR) has 103560 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00147.parquet.


2022-12-28 18:34:09,263 - assign_clone_ids.ipynb - INFO - Participant P00157 (GeneLocus.TCR) has 86219 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00157.parquet.




2022-12-28 18:34:10,120 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


  df = pd.read_csv(











2022-12-28 18:34:12,517 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 26, 'TRBVA*01': 2}






2022-12-28 18:34:13,031 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 14, 'TRBVA*01': 3}






2022-12-28 18:34:13,163 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 3}










2022-12-28 18:34:17,826 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 53, 'TRBVA*01': 4}


2022-12-28 18:34:19,954 - assign_clone_ids.ipynb - INFO - Participant P00150 (GeneLocus.TCR) has 130475 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00150.tsv'), (160450, 132), (133136, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:34:21,102 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25}


2022-12-28 18:34:21,244 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBV8-1*01': 1, 'TRBVA*01': 2}






2022-12-28 18:34:26,074 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 33, 'TRBVA*01': 8}


2022-12-28 18:34:26,537 - assign_clone_ids.ipynb - INFO - Participant P00164 (GeneLocus.TCR) has 102641 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00164.tsv'), (140588, 132), (106379, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:26,628 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 15, 'TRBVA*01': 1}




2022-12-28 18:34:27,811 - assign_clone_ids.ipynb - INFO - Participant P00165 (GeneLocus.TCR) has 127142 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00165.tsv'), (164399, 132), (132234, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:30,445 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 45, 'TRBVA*01': 4}






2022-12-28 18:34:32,105 - assign_clone_ids.ipynb - INFO - Participant P00151 (GeneLocus.TCR) has 143837 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00151.tsv'), (183269, 132), (149868, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:33,557 - assign_clone_ids.ipynb - INFO - Participant P00144 (GeneLocus.TCR) has 161402 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00144.tsv'), (214210, 132), (169395, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:34,229 - assign_clone_ids.ipynb - INFO - Participant P00143 (GeneLocus.TCR) has 158804 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00143.tsv'), (201680, 132), (165495, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:34:35,201 - assign_clone_ids.ipynb - INFO - Participant P00163 (GeneLocus.TCR) has 139765 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00163.tsv'), (187801, 132), (144995, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:35,361 - assign_clone_ids.ipynb - INFO - Participant P00158 (GeneLocus.TCR) has 133686 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00158.tsv'), (186172, 132), (147835, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:36,048 - assign_clone_ids.ipynb - INFO - Participant P00150 (GeneLocus.TCR) has 130223 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00150.parquet.


2022-12-28 18:34:36,652 - assign_clone_ids.ipynb - INFO - Participant P00142 (GeneLocus.TCR) has 161517 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00142.tsv'), (220218, 132), (169134, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:34:38,933 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 42, 'TRBVA*01': 7}


2022-12-28 18:34:40,188 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 40, 'TRBVA*01': 7}




2022-12-28 18:34:41,055 - assign_clone_ids.ipynb - INFO - Participant P00164 (GeneLocus.TCR) has 102474 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00164.parquet.




2022-12-28 18:34:41,568 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16}


2022-12-28 18:34:44,280 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 18:34:44,502 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 31, 'TRBV7-5*02': 23, 'TRBVA*01': 5}


2022-12-28 18:34:44,733 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 43, 'TRBVA*01': 6}




2022-12-28 18:34:45,180 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 31, 'TRBVA*01': 1}




2022-12-28 18:34:46,901 - assign_clone_ids.ipynb - INFO - Participant P00165 (GeneLocus.TCR) has 126908 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00165.parquet.


2022-12-28 18:34:47,265 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 37, 'TRBVA*01': 5}


2022-12-28 18:34:47,464 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV5-2*01': 1, 'TRBV7-5*01': 23, 'TRBV7-5*02': 29, 'TRBVA*01': 2}


2022-12-28 18:34:47,482 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 15, 'TRBVA*01': 2}


2022-12-28 18:34:49,181 - assign_clone_ids.ipynb - INFO - Participant P00166 (GeneLocus.TCR) has 141459 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00166.tsv'), (180665, 132), (146554, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:34:56,595 - assign_clone_ids.ipynb - INFO - Participant P00154 (GeneLocus.TCR) has 165421 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00154.tsv'), (221584, 132), (172342, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:56,938 - assign_clone_ids.ipynb - INFO - Participant P00161 (GeneLocus.TCR) has 175379 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00161.tsv'), (231040, 132), (182405, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:34:58,645 - assign_clone_ids.ipynb - INFO - Participant P00158 (GeneLocus.TCR) has 133448 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00158.parquet.




  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:35:15,507 - assign_clone_ids.ipynb - INFO - Participant P00159 (GeneLocus.TCR) has 181228 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00159.tsv'), (230241, 132), (188233, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:35:15,864 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 40, 'TRBVA*01': 2}


  df = pd.read_csv(





2022-12-28 18:35:18,600 - assign_clone_ids.ipynb - INFO - Participant P00152 (GeneLocus.TCR) has 154574 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00152.tsv'), (214147, 132), (163783, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:35:28,806 - assign_clone_ids.ipynb - INFO - Participant P00168 (GeneLocus.TCR) has 155422 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00168.tsv'), (208950, 132), (163053, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:35:29,258 - assign_clone_ids.ipynb - INFO - Participant P00163 (GeneLocus.TCR) has 139555 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00163.parquet.


2022-12-28 18:35:30,836 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 53, 'TRBVA*01': 5}




2022-12-28 18:35:36,095 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV8-2*01': 1}


  df = pd.read_csv(





2022-12-28 18:35:51,233 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 10, 'TRBV7-5*02': 12, 'TRBVA*01': 1}


















2022-12-28 18:36:16,444 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 4}


2022-12-28 18:36:17,430 - assign_clone_ids.ipynb - INFO - Participant P00189 (GeneLocus.TCR) has 137027 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00189.tsv'), (172748, 132), (143352, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:17,691 - assign_clone_ids.ipynb - INFO - Participant P00143 (GeneLocus.TCR) has 158576 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00143.parquet.


2022-12-28 18:36:17,678 - assign_clone_ids.ipynb - INFO - Participant P00190 (GeneLocus.TCR) has 84852 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00190.tsv'), (112186, 132), (87053, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:17,727 - assign_clone_ids.ipynb - INFO - Participant P00144 (GeneLocus.TCR) has 161102 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00144.parquet.






2022-12-28 18:36:19,356 - assign_clone_ids.ipynb - INFO - Participant P00151 (GeneLocus.TCR) has 143646 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00151.parquet.


2022-12-28 18:36:19,893 - assign_clone_ids.ipynb - INFO - Participant P00142 (GeneLocus.TCR) has 161291 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00142.parquet.


2022-12-28 18:36:20,154 - assign_clone_ids.ipynb - INFO - Participant P00161 (GeneLocus.TCR) has 175162 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00161.parquet.


2022-12-28 18:36:21,665 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 39, 'TRBVA*01': 3}


2022-12-28 18:36:22,207 - assign_clone_ids.ipynb - INFO - Participant P00166 (GeneLocus.TCR) has 141290 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00166.parquet.




2022-12-28 18:36:22,558 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 7}




2022-12-28 18:36:22,935 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 1, 'TRBVA*01': 2}


2022-12-28 18:36:24,897 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 31, 'TRBVA*01': 3}


2022-12-28 18:36:25,199 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBVA*01': 3}


2022-12-28 18:36:25,771 - assign_clone_ids.ipynb - INFO - Participant P00178 (GeneLocus.TCR) has 99574 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00178.tsv'), (131242, 132), (103975, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:25,884 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 39, 'TRBVA*01': 4}


  df = pd.read_csv(







2022-12-28 18:36:27,085 - assign_clone_ids.ipynb - INFO - Participant P00180 (GeneLocus.TCR) has 165689 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00180.tsv'), (213924, 132), (171683, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:36:27,834 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 48, 'TRBVA*01': 3}






2022-12-28 18:36:28,974 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 55, 'TRBVA*01': 12}


  df = pd.read_csv(



2022-12-28 18:36:29,314 - assign_clone_ids.ipynb - INFO - Participant P00179 (GeneLocus.TCR) has 116626 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00179.tsv'), (158861, 132), (121828, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:30,705 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 1, 'TRBV7-5*02': 75, 'TRBVA*01': 4}


2022-12-28 18:36:30,963 - assign_clone_ids.ipynb - INFO - Participant P00190 (GeneLocus.TCR) has 84707 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00190.parquet.


2022-12-28 18:36:31,212 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 36, 'TRBVA*01': 1}


2022-12-28 18:36:31,551 - assign_clone_ids.ipynb - INFO - Participant P00152 (GeneLocus.TCR) has 154374 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00152.parquet.


2022-12-28 18:36:34,152 - assign_clone_ids.ipynb - INFO - Participant P00159 (GeneLocus.TCR) has 180934 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00159.parquet.


2022-12-28 18:36:34,503 - assign_clone_ids.ipynb - INFO - Participant P00154 (GeneLocus.TCR) has 165210 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00154.parquet.


2022-12-28 18:36:35,948 - assign_clone_ids.ipynb - INFO - Participant P00168 (GeneLocus.TCR) has 155213 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00168.parquet.




2022-12-28 18:36:37,603 - assign_clone_ids.ipynb - INFO - Participant P00153 (GeneLocus.TCR) has 222070 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00153.tsv'), (298090, 132), (235273, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:36:39,415 - assign_clone_ids.ipynb - INFO - Participant P00171 (GeneLocus.TCR) has 205308 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00171.tsv'), (272867, 132), (215426, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:36:39,948 - assign_clone_ids.ipynb - INFO - Participant P00189 (GeneLocus.TCR) has 136805 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00189.parquet.


2022-12-28 18:36:40,569 - assign_clone_ids.ipynb - INFO - Participant P00178 (GeneLocus.TCR) has 99408 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00178.parquet.


2022-12-28 18:36:40,891 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 7}




2022-12-28 18:36:41,090 - assign_clone_ids.ipynb - INFO - Participant P00162 (GeneLocus.TCR) has 238077 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00162.tsv'), (329163, 132), (253173, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:42,961 - assign_clone_ids.ipynb - INFO - Participant P00145 (GeneLocus.TCR) has 219791 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00145.tsv'), (294061, 132), (231561, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:43,078 - assign_clone_ids.ipynb - INFO - Participant P00155 (GeneLocus.TCR) has 248862 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00155.tsv'), (337781, 132), (262288, 144), <GeneLocus.TCR: 2>)]
















2022-12-28 18:36:46,070 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 49, 'TRBVA*01': 4}






2022-12-28 18:36:48,114 - assign_clone_ids.ipynb - INFO - Participant P00179 (GeneLocus.TCR) has 116459 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00179.parquet.




2022-12-28 18:36:50,016 - assign_clone_ids.ipynb - INFO - Participant P00180 (GeneLocus.TCR) has 165452 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00180.parquet.


  df = pd.read_csv(



2022-12-28 18:36:51,348 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 12, 'TRBVA*01': 2}


2022-12-28 18:36:51,523 - assign_clone_ids.ipynb - INFO - Participant P00173 (GeneLocus.TCR) has 192080 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00173.tsv'), (269261, 132), (204443, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:36:52,167 - assign_clone_ids.ipynb - INFO - Participant P00186 (GeneLocus.TCR) has 168773 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00186.tsv'), (237082, 132), (175489, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:52,435 - assign_clone_ids.ipynb - INFO - Participant P00191 (GeneLocus.TCR) has 51635 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00191.tsv'), (69515, 132), (55570, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(





2022-12-28 18:36:55,950 - assign_clone_ids.ipynb - INFO - Participant P00160 (GeneLocus.TCR) has 259654 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00160.tsv'), (359237, 132), (272071, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:36:56,513 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 28, 'TRBVA*01': 1}


2022-12-28 18:36:59,133 - assign_clone_ids.ipynb - INFO - Participant P00191 (GeneLocus.TCR) has 51583 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00191.parquet.


2022-12-28 18:37:00,160 - assign_clone_ids.ipynb - INFO - Participant P00169 (GeneLocus.TCR) has 261055 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00169.tsv'), (356448, 132), (273389, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:02,257 - assign_clone_ids.ipynb - INFO - Participant P00146 (GeneLocus.TCR) has 288133 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00146.tsv'), (385128, 132), (304011, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:03,195 - assign_clone_ids.ipynb - INFO - Participant P00149 (GeneLocus.TCR) has 267874 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00149.tsv'), (371446, 132), (286057, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:37:04,248 - assign_clone_ids.ipynb - INFO - Participant P00192 (GeneLocus.TCR) has 65080 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00192.tsv'), (89095, 132), (67712, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:37:08,077 - assign_clone_ids.ipynb - INFO - Participant P00167 (GeneLocus.TCR) has 269941 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00167.tsv'), (366918, 132), (283486, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:37:06,856 - assign_clone_ids.ipynb - INFO - Participant P00181 (GeneLocus.TCR) has 211911 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00181.tsv'), (283408, 132), (224240, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





2022-12-28 18:37:10,931 - assign_clone_ids.ipynb - INFO - Participant P00171 (GeneLocus.TCR) has 205001 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00171.parquet.


2022-12-28 18:37:11,764 - assign_clone_ids.ipynb - INFO - Participant P00192 (GeneLocus.TCR) has 65018 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00192.parquet.


2022-12-28 18:37:11,835 - assign_clone_ids.ipynb - INFO - Participant P00175 (GeneLocus.TCR) has 245476 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00175.tsv'), (318570, 132), (258144, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:11,921 - assign_clone_ids.ipynb - INFO - Participant P00183 (GeneLocus.TCR) has 211580 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00183.tsv'), (260033, 132), (218607, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:12,583 - assign_clone_ids.ipynb - INFO - Participant P00162 (GeneLocus.TCR) has 237716 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00162.parquet.






  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:37:15,600 - assign_clone_ids.ipynb - INFO - Participant P00172 (GeneLocus.TCR) has 244230 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00172.tsv'), (321110, 132), (257658, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:16,039 - assign_clone_ids.ipynb - INFO - Participant P00145 (GeneLocus.TCR) has 219539 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00145.parquet.


2022-12-28 18:37:16,039 - assign_clone_ids.ipynb - INFO - Participant P00153 (GeneLocus.TCR) has 221745 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00153.parquet.


2022-12-28 18:37:16,507 - assign_clone_ids.ipynb - INFO - Participant P00186 (GeneLocus.TCR) has 168479 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00186.parquet.


2022-12-28 18:37:17,335 - assign_clone_ids.ipynb - INFO - Participant P00173 (GeneLocus.TCR) has 191812 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00173.parquet.


2022-12-28 18:37:17,694 - assign_clone_ids.ipynb - INFO - Participant P00174 (GeneLocus.TCR) has 218451 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00174.tsv'), (303100, 132), (231360, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:18,530 - assign_clone_ids.ipynb - INFO - Participant P00156 (GeneLocus.TCR) has 320095 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00156.tsv'), (433600, 132), (337974, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:18,557 - assign_clone_ids.ipynb - INFO - Participant P00182 (GeneLocus.TCR) has 238393 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00182.tsv'), (345161, 132), (255023, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:37:19,876 - assign_clone_ids.ipynb - INFO - Participant P00184 (GeneLocus.TCR) has 229927 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00184.tsv'), (317161, 132), (242974, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:37:21,364 - assign_clone_ids.ipynb - INFO - Participant P00155 (GeneLocus.TCR) has 248528 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00155.parquet.


  df = pd.read_csv(





2022-12-28 18:37:22,946 - assign_clone_ids.ipynb - INFO - Participant P00207 (GeneLocus.TCR) has 966 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00207.tsv'), (1491, 132), (1045, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:37:23,249 - assign_clone_ids.ipynb - INFO - Participant P00207 (GeneLocus.TCR) has 964 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00207.parquet.


2022-12-28 18:37:26,343 - assign_clone_ids.ipynb - INFO - Participant P00176 (GeneLocus.TCR) has 250073 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00176.tsv'), (328079, 132), (259425, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:37:41,027 - assign_clone_ids.ipynb - INFO - Participant P00185 (GeneLocus.TCR) has 283949 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00185.tsv'), (365542, 132), (298878, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:38:08,092 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


  df = pd.read_csv(







2022-12-28 18:38:16,031 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 1}


2022-12-28 18:38:17,822 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 1}


2022-12-28 18:38:19,941 - assign_clone_ids.ipynb - INFO - Participant P00160 (GeneLocus.TCR) has 259159 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00160.parquet.




2022-12-28 18:38:21,165 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*02': 8, 'TRBVA*01': 1}


2022-12-28 18:38:21,265 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 2}


2022-12-28 18:38:22,700 - assign_clone_ids.ipynb - INFO - Participant P00203 (GeneLocus.TCR) has 59556 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00203.tsv'), (80661, 132), (62123, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:38:30,466 - assign_clone_ids.ipynb - INFO - Participant P00170 (GeneLocus.TCR) has 319962 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00170.tsv'), (438271, 132), (335342, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:38:32,822 - assign_clone_ids.ipynb - INFO - Participant P00203 (GeneLocus.TCR) has 59458 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00203.parquet.


2022-12-28 18:38:32,864 - assign_clone_ids.ipynb - INFO - Participant P00213 (GeneLocus.TCR) has 69032 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00213.tsv'), (89178, 132), (72786, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:38:38,102 - assign_clone_ids.ipynb - INFO - Participant P00205 (GeneLocus.TCR) has 102925 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00205.tsv'), (126848, 132), (104809, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:38:41,926 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 32}


2022-12-28 18:38:48,612 - assign_clone_ids.ipynb - INFO - Participant P00204 (GeneLocus.TCR) has 76201 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00204.tsv'), (99532, 132), (78910, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(





2022-12-28 18:38:57,717 - assign_clone_ids.ipynb - INFO - Participant P00205 (GeneLocus.TCR) has 102768 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00205.parquet.


2022-12-28 18:38:57,956 - assign_clone_ids.ipynb - INFO - Participant P00213 (GeneLocus.TCR) has 68872 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00213.parquet.


2022-12-28 18:39:09,083 - assign_clone_ids.ipynb - INFO - Participant P00183 (GeneLocus.TCR) has 211095 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00183.parquet.


2022-12-28 18:39:09,615 - assign_clone_ids.ipynb - INFO - Participant P00181 (GeneLocus.TCR) has 211617 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00181.parquet.


2022-12-28 18:39:13,286 - assign_clone_ids.ipynb - INFO - Participant P00175 (GeneLocus.TCR) has 244965 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00175.parquet.


2022-12-28 18:39:13,317 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 5, 'TRBVA*01': 1}


2022-12-28 18:39:13,735 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBVA*01': 2}


2022-12-28 18:39:13,951 - assign_clone_ids.ipynb - INFO - Participant P00172 (GeneLocus.TCR) has 243873 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00172.parquet.


2022-12-28 18:39:15,863 - assign_clone_ids.ipynb - INFO - Participant P00169 (GeneLocus.TCR) has 260704 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00169.parquet.


2022-12-28 18:39:17,467 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4}


2022-12-28 18:39:18,247 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBVA*01': 3}


2022-12-28 18:39:18,614 - assign_clone_ids.ipynb - INFO - Participant P00146 (GeneLocus.TCR) has 287603 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00146.parquet.


2022-12-28 18:39:18,621 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 18:39:18,686 - assign_clone_ids.ipynb - INFO - Participant P00149 (GeneLocus.TCR) has 267552 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00149.parquet.


2022-12-28 18:39:19,491 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 12, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 18:39:19,572 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 20, 'TRBVA*01': 4}




2022-12-28 18:39:19,975 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBV7-5*02': 18, 'TRBVA*01': 6}








2022-12-28 18:39:20,442 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBVA*01': 3}


2022-12-28 18:39:20,828 - assign_clone_ids.ipynb - INFO - Participant P00174 (GeneLocus.TCR) has 218216 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00174.parquet.


2022-12-28 18:39:20,852 - assign_clone_ids.ipynb - INFO - Participant P00204 (GeneLocus.TCR) has 76093 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00204.parquet.


2022-12-28 18:39:20,874 - assign_clone_ids.ipynb - INFO - Participant P00184 (GeneLocus.TCR) has 229495 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00184.parquet.


2022-12-28 18:39:20,920 - assign_clone_ids.ipynb - INFO - Participant P00182 (GeneLocus.TCR) has 237940 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00182.parquet.


2022-12-28 18:39:20,943 - assign_clone_ids.ipynb - INFO - Participant P00167 (GeneLocus.TCR) has 269402 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00167.parquet.


2022-12-28 18:39:22,191 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 37, 'TRBVA*01': 3}


2022-12-28 18:39:23,510 - assign_clone_ids.ipynb - INFO - Participant P00185 (GeneLocus.TCR) has 283598 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00185.parquet.


2022-12-28 18:39:23,518 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 26, 'TRBVA*01': 7}


2022-12-28 18:39:23,671 - assign_clone_ids.ipynb - INFO - Participant P00200 (GeneLocus.TCR) has 116006 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00200.tsv'), (153432, 132), (121390, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:39:23,870 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 22, 'TRBVA*01': 8}


2022-12-28 18:39:25,371 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBV7-5*02': 30, 'TRBVA*01': 7}


2022-12-28 18:39:25,461 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 27, 'TRBVA*01': 2}


2022-12-28 18:39:25,760 - assign_clone_ids.ipynb - INFO - Participant P00176 (GeneLocus.TCR) has 249719 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00176.parquet.


2022-12-28 18:39:25,761 - assign_clone_ids.ipynb - INFO - Participant P00156 (GeneLocus.TCR) has 319609 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00156.parquet.




2022-12-28 18:39:26,835 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 16, 'TRBV7-5*02': 16, 'TRBVA*01': 2}




2022-12-28 18:39:26,837 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 31, 'TRBVA*01': 2}








2022-12-28 18:39:28,841 - assign_clone_ids.ipynb - INFO - Participant P00214 (GeneLocus.TCR) has 204945 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00214.tsv'), (267327, 132), (212055, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:39:31,361 - assign_clone_ids.ipynb - INFO - Participant P00187 (GeneLocus.TCR) has 255684 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00187.tsv'), (328377, 132), (267048, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:39:34,033 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 45, 'TRBV7-5*02': 1}


2022-12-28 18:39:33,555 - assign_clone_ids.ipynb - INFO - Participant P00210 (GeneLocus.TCR) has 85604 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00210.tsv'), (139757, 132), (97766, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:39:32,221 - assign_clone_ids.ipynb - INFO - Participant P00206 (GeneLocus.TCR) has 72293 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00206.tsv'), (111657, 132), (84297, 144), <GeneLocus.TCR: 2>)]


















2022-12-28 18:39:39,839 - assign_clone_ids.ipynb - INFO - Participant P00188 (GeneLocus.TCR) has 307465 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00188.tsv'), (392885, 132), (322011, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:39:38,806 - assign_clone_ids.ipynb - INFO - Participant P00200 (GeneLocus.TCR) has 115869 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00200.parquet.






2022-12-28 18:39:40,520 - assign_clone_ids.ipynb - INFO - Participant P00216 (GeneLocus.TCR) has 106009 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00216.tsv'), (140519, 132), (114351, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:39:42,591 - assign_clone_ids.ipynb - INFO - Participant P00206 (GeneLocus.TCR) has 72184 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00206.parquet.




2022-12-28 18:39:44,889 - assign_clone_ids.ipynb - INFO - Participant P00210 (GeneLocus.TCR) has 85500 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00210.parquet.


2022-12-28 18:39:46,262 - assign_clone_ids.ipynb - INFO - Participant P00208 (GeneLocus.TCR) has 132258 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00208.tsv'), (179356, 132), (137171, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:39:47,035 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 42, 'TRBVA*01': 2}


2022-12-28 18:39:48,705 - assign_clone_ids.ipynb - INFO - Participant P00199 (GeneLocus.TCR) has 147964 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00199.tsv'), (194692, 132), (153369, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:39:48,164 - assign_clone_ids.ipynb - INFO - Participant P00196 (GeneLocus.TCR) has 146547 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00196.tsv'), (192476, 132), (154650, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:39:50,544 - assign_clone_ids.ipynb - INFO - Participant P00170 (GeneLocus.TCR) has 319419 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00170.parquet.


2022-12-28 18:39:53,403 - assign_clone_ids.ipynb - INFO - Participant P00214 (GeneLocus.TCR) has 204528 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00214.parquet.




2022-12-28 18:39:56,207 - assign_clone_ids.ipynb - INFO - Participant P00209 (GeneLocus.TCR) has 206773 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00209.tsv'), (260592, 132), (215764, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:39:59,817 - assign_clone_ids.ipynb - INFO - Participant P00195 (GeneLocus.TCR) has 183478 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00195.tsv'), (248151, 132), (194890, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:40:02,528 - assign_clone_ids.ipynb - INFO - Participant P00187 (GeneLocus.TCR) has 255305 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00187.parquet.


2022-12-28 18:40:02,528 - assign_clone_ids.ipynb - INFO - Participant P00208 (GeneLocus.TCR) has 132007 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00208.parquet.


2022-12-28 18:40:02,534 - assign_clone_ids.ipynb - INFO - Participant P00216 (GeneLocus.TCR) has 105734 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00216.parquet.


2022-12-28 18:40:04,157 - assign_clone_ids.ipynb - INFO - Participant P00212 (GeneLocus.TCR) has 235799 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00212.tsv'), (307025, 132), (245116, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:05,835 - assign_clone_ids.ipynb - INFO - Participant P00197 (GeneLocus.TCR) has 234565 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00197.tsv'), (307875, 132), (246750, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:40:09,273 - assign_clone_ids.ipynb - INFO - Participant P00194 (GeneLocus.TCR) has 225307 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00194.tsv'), (296782, 132), (235648, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:10,434 - assign_clone_ids.ipynb - INFO - Participant P00201 (GeneLocus.TCR) has 229684 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00201.tsv'), (303127, 132), (240649, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:11,197 - assign_clone_ids.ipynb - INFO - Participant P00196 (GeneLocus.TCR) has 146352 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00196.parquet.


2022-12-28 18:40:11,197 - assign_clone_ids.ipynb - INFO - Participant P00199 (GeneLocus.TCR) has 147755 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00199.parquet.


2022-12-28 18:40:11,302 - assign_clone_ids.ipynb - INFO - Participant P00202 (GeneLocus.TCR) has 234669 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00202.tsv'), (308350, 132), (246878, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:11,624 - assign_clone_ids.ipynb - INFO - Participant P00193 (GeneLocus.TCR) has 238317 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00193.tsv'), (309357, 132), (251290, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:12,644 - assign_clone_ids.ipynb - INFO - Participant P00211 (GeneLocus.TCR) has 249703 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00211.tsv'), (328348, 132), (263300, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:15,871 - assign_clone_ids.ipynb - INFO - Participant P00198 (GeneLocus.TCR) has 254966 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00198.tsv'), (348148, 132), (270884, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:17,823 - assign_clone_ids.ipynb - INFO - Participant P00188 (GeneLocus.TCR) has 307015 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00188.parquet.


2022-12-28 18:40:21,557 - assign_clone_ids.ipynb - INFO - Participant P00215 (GeneLocus.TCR) has 253532 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00215.tsv'), (334478, 132), (266844, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:40:21,142 - assign_clone_ids.ipynb - INFO - Participant P00195 (GeneLocus.TCR) has 183095 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00195.parquet.


2022-12-28 18:40:23,781 - assign_clone_ids.ipynb - INFO - Participant P00209 (GeneLocus.TCR) has 206497 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00209.parquet.


  df = pd.read_csv(



2022-12-28 18:40:29,547 - assign_clone_ids.ipynb - INFO - Participant P00217 (GeneLocus.TCR) has 233959 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00217.tsv'), (316044, 132), (246916, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:40:44,699 - assign_clone_ids.ipynb - INFO - Participant P00197 (GeneLocus.TCR) has 234212 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00197.parquet.


2022-12-28 18:40:44,699 - assign_clone_ids.ipynb - INFO - Participant P00212 (GeneLocus.TCR) has 235400 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00212.parquet.


2022-12-28 18:40:44,700 - assign_clone_ids.ipynb - INFO - Participant P00193 (GeneLocus.TCR) has 238097 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00193.parquet.


2022-12-28 18:40:44,700 - assign_clone_ids.ipynb - INFO - Participant P00202 (GeneLocus.TCR) has 234416 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00202.parquet.


2022-12-28 18:40:44,700 - assign_clone_ids.ipynb - INFO - Participant P00194 (GeneLocus.TCR) has 225032 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00194.parquet.


2022-12-28 18:40:44,701 - assign_clone_ids.ipynb - INFO - Participant P00211 (GeneLocus.TCR) has 249389 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00211.parquet.


2022-12-28 18:40:44,701 - assign_clone_ids.ipynb - INFO - Participant P00198 (GeneLocus.TCR) has 254607 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00198.parquet.


2022-12-28 18:40:44,700 - assign_clone_ids.ipynb - INFO - Participant P00201 (GeneLocus.TCR) has 229293 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00201.parquet.


  df = pd.read_csv(



2022-12-28 18:40:51,865 - assign_clone_ids.ipynb - INFO - Participant P00215 (GeneLocus.TCR) has 253164 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00215.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:41:58,619 - assign_clone_ids.ipynb - INFO - Participant P00217 (GeneLocus.TCR) has 233517 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00217.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:45:17,325 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 24, 'TRBVA*01': 5}


2022-12-28 18:45:19,751 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7}


2022-12-28 18:45:20,893 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 13, 'TRBV7-5*02': 11, 'TRBVA*01': 2}


2022-12-28 18:45:21,863 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 1}






2022-12-28 18:45:24,610 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 9}




2022-12-28 18:45:25,820 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 33, 'TRBVA*01': 1}


2022-12-28 18:45:26,802 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 18, 'TRBVA*01': 3}








2022-12-28 18:45:31,235 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 5, 'TRBVA*01': 4}


2022-12-28 18:45:31,840 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 14, 'TRBV8-1*01': 1, 'TRBVA*01': 5}


2022-12-28 18:45:32,236 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 39, 'TRBVA*01': 4}


2022-12-28 18:45:32,540 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 32, 'TRBVA*01': 3}












2022-12-28 18:45:36,401 - assign_clone_ids.ipynb - INFO - Participant P00237 (GeneLocus.TCR) has 80160 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00237.tsv'), (115848, 132), (90578, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:45:43,028 - assign_clone_ids.ipynb - INFO - Participant P00226 (GeneLocus.TCR) has 122010 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00226.tsv'), (162938, 132), (127705, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:45:45,722 - assign_clone_ids.ipynb - INFO - Participant P00237 (GeneLocus.TCR) has 80041 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00237.parquet.


2022-12-28 18:45:47,055 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 2, 'TRBVA*01': 1}


2022-12-28 18:45:48,698 - assign_clone_ids.ipynb - INFO - Participant P00222 (GeneLocus.TCR) has 161989 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00222.tsv'), (216945, 132), (167270, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:45:49,038 - assign_clone_ids.ipynb - INFO - Participant P00241 (GeneLocus.TCR) has 148664 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00241.tsv'), (192491, 132), (154648, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:45:49,400 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34, 'TRBV7-5*02': 49, 'TRBVA*01': 6}




2022-12-28 18:45:51,860 - assign_clone_ids.ipynb - INFO - Participant P00221 (GeneLocus.TCR) has 131758 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00221.tsv'), (178940, 132), (139222, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:45:55,889 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 9, 'TRBVA*01': 2}


2022-12-28 18:45:58,046 - assign_clone_ids.ipynb - INFO - Participant P00218 (GeneLocus.TCR) has 172560 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00218.tsv'), (226251, 132), (178913, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:45:58,166 - assign_clone_ids.ipynb - INFO - Participant P00226 (GeneLocus.TCR) has 121756 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00226.parquet.


2022-12-28 18:45:58,444 - assign_clone_ids.ipynb - INFO - Participant P00223 (GeneLocus.TCR) has 181737 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00223.tsv'), (221156, 132), (189802, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:01,896 - assign_clone_ids.ipynb - INFO - Participant P00247 (GeneLocus.TCR) has 83591 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00247.tsv'), (113509, 132), (88200, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:46:05,760 - assign_clone_ids.ipynb - INFO - Participant P00241 (GeneLocus.TCR) has 148489 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00241.parquet.


2022-12-28 18:46:05,844 - assign_clone_ids.ipynb - INFO - Participant P00227 (GeneLocus.TCR) has 189482 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00227.tsv'), (242383, 132), (198851, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:06,662 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 50, 'TRBVA*01': 2}


2022-12-28 18:46:07,835 - assign_clone_ids.ipynb - INFO - Participant P00225 (GeneLocus.TCR) has 181842 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00225.tsv'), (244558, 132), (191516, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:08,123 - assign_clone_ids.ipynb - INFO - Participant P00232 (GeneLocus.TCR) has 197334 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00232.tsv'), (264848, 132), (207244, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:08,342 - assign_clone_ids.ipynb - INFO - Participant P00221 (GeneLocus.TCR) has 131493 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00221.parquet.


2022-12-28 18:46:09,270 - assign_clone_ids.ipynb - INFO - Participant P00222 (GeneLocus.TCR) has 161737 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00222.parquet.


  df = pd.read_csv(



2022-12-28 18:46:11,232 - assign_clone_ids.ipynb - INFO - Participant P00247 (GeneLocus.TCR) has 83481 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00247.parquet.


2022-12-28 18:46:13,253 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 30, 'TRBVA*01': 2}


2022-12-28 18:46:15,650 - assign_clone_ids.ipynb - INFO - Participant P00228 (GeneLocus.TCR) has 222978 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00228.tsv'), (291130, 132), (236001, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:19,383 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBVA*01': 1}




2022-12-28 18:46:25,441 - assign_clone_ids.ipynb - INFO - Participant P00248 (GeneLocus.TCR) has 122762 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00248.tsv'), (162365, 132), (127418, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:26,256 - assign_clone_ids.ipynb - INFO - Participant P00218 (GeneLocus.TCR) has 172340 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00218.parquet.


2022-12-28 18:46:26,779 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBV8-1*01': 1, 'TRBVA*01': 2}


2022-12-28 18:46:27,168 - assign_clone_ids.ipynb - INFO - Participant P00223 (GeneLocus.TCR) has 181498 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00223.parquet.


2022-12-28 18:46:27,573 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 22, 'TRBVA*01': 3}








2022-12-28 18:46:30,762 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 23, 'TRBVA*01': 5}




  df = pd.read_csv(









2022-12-28 18:46:39,297 - assign_clone_ids.ipynb - INFO - Participant P00225 (GeneLocus.TCR) has 181621 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00225.parquet.




2022-12-28 18:46:39,296 - assign_clone_ids.ipynb - INFO - Participant P00227 (GeneLocus.TCR) has 189131 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00227.parquet.


2022-12-28 18:46:39,296 - assign_clone_ids.ipynb - INFO - Participant P00232 (GeneLocus.TCR) has 197065 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00232.parquet.




2022-12-28 18:46:40,281 - assign_clone_ids.ipynb - INFO - Participant P00248 (GeneLocus.TCR) has 122602 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00248.parquet.


2022-12-28 18:46:42,173 - assign_clone_ids.ipynb - INFO - Participant P00256 (GeneLocus.TCR) has 144374 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00256.tsv'), (194270, 132), (150338, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:43,612 - assign_clone_ids.ipynb - INFO - Participant P00220 (GeneLocus.TCR) has 261875 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00220.tsv'), (372361, 132), (278225, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:47,446 - assign_clone_ids.ipynb - INFO - Participant P00251 (GeneLocus.TCR) has 148880 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00251.tsv'), (200053, 132), (154820, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:46:51,150 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 21, 'TRBVA*01': 3}


  df = pd.read_csv(



2022-12-28 18:46:52,495 - assign_clone_ids.ipynb - INFO - Participant P00228 (GeneLocus.TCR) has 222670 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00228.parquet.


  df = pd.read_csv(



2022-12-28 18:46:55,419 - assign_clone_ids.ipynb - INFO - Participant P00243 (GeneLocus.TCR) has 158727 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00243.tsv'), (209458, 132), (163659, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:46:58,137 - assign_clone_ids.ipynb - INFO - Participant P00256 (GeneLocus.TCR) has 144148 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00256.parquet.


2022-12-28 18:46:58,599 - assign_clone_ids.ipynb - INFO - Participant P00246 (GeneLocus.TCR) has 177852 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00246.tsv'), (235113, 132), (186784, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:47:01,611 - assign_clone_ids.ipynb - INFO - Participant P00250 (GeneLocus.TCR) has 175429 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00250.tsv'), (222981, 132), (180848, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:47:03,474 - assign_clone_ids.ipynb - INFO - Participant P00251 (GeneLocus.TCR) has 148687 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00251.parquet.


  df = pd.read_csv(



2022-12-28 18:47:11,275 - assign_clone_ids.ipynb - INFO - Participant P00219 (GeneLocus.TCR) has 311707 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00219.tsv'), (457563, 132), (345332, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:47:12,875 - assign_clone_ids.ipynb - INFO - Participant P00220 (GeneLocus.TCR) has 261399 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00220.parquet.


2022-12-28 18:47:13,185 - assign_clone_ids.ipynb - INFO - Participant P00243 (GeneLocus.TCR) has 158456 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00243.parquet.


2022-12-28 18:47:15,002 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 31, 'TRBVA*01': 3}


  df = pd.read_csv(



2022-12-28 18:47:23,373 - assign_clone_ids.ipynb - INFO - Participant P00250 (GeneLocus.TCR) has 175253 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00250.parquet.


2022-12-28 18:47:23,373 - assign_clone_ids.ipynb - INFO - Participant P00246 (GeneLocus.TCR) has 177555 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00246.parquet.


2022-12-28 18:47:24,939 - assign_clone_ids.ipynb - INFO - Participant P00253 (GeneLocus.TCR) has 195032 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00253.tsv'), (254979, 132), (201428, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:47:44,606 - assign_clone_ids.ipynb - INFO - Participant P00219 (GeneLocus.TCR) has 311225 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00219.parquet.


2022-12-28 18:47:45,493 - assign_clone_ids.ipynb - INFO - Participant P00253 (GeneLocus.TCR) has 194756 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00253.parquet.


  df = pd.read_csv(



2022-12-28 18:47:46,888 - assign_clone_ids.ipynb - INFO - Participant P00240 (GeneLocus.TCR) has 192276 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00240.tsv'), (260711, 132), (205756, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:48:08,260 - assign_clone_ids.ipynb - INFO - Participant P00240 (GeneLocus.TCR) has 192004 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00240.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:49:06,930 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 42, 'TRBVA*01': 1}


2022-12-28 18:49:14,775 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34, 'TRBVA*01': 3}


2022-12-28 18:49:15,441 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV7-5*02': 17}


2022-12-28 18:49:16,182 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBVA*01': 2}






2022-12-28 18:49:28,884 - assign_clone_ids.ipynb - INFO - Participant P00277 (GeneLocus.TCR) has 55900 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00277.tsv'), (78015, 132), (58002, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:49:38,078 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 25, 'TRBVA*01': 4}


2022-12-28 18:49:38,306 - assign_clone_ids.ipynb - INFO - Participant P00277 (GeneLocus.TCR) has 55799 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00277.parquet.


  df = pd.read_csv(







2022-12-28 18:49:59,974 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 4, 'TRBV7-5*02': 11}






2022-12-28 18:50:14,293 - assign_clone_ids.ipynb - INFO - Participant P00257 (GeneLocus.TCR) has 211685 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00257.tsv'), (292178, 132), (222623, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:50:26,879 - assign_clone_ids.ipynb - INFO - Participant P00280 (GeneLocus.TCR) has 110075 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00280.tsv'), (140457, 132), (114872, 144), <GeneLocus.TCR: 2>)]










2022-12-28 18:50:33,510 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 2, 'TRBV7-5*02': 5, 'TRBVA*01': 1}


2022-12-28 18:50:33,766 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7}


2022-12-28 18:50:35,523 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 2}


2022-12-28 18:50:37,907 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13}


2022-12-28 18:50:38,274 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 3}


2022-12-28 18:50:38,909 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 24, 'TRBV7-5*02': 15, 'TRBVA*01': 11}




2022-12-28 18:50:40,456 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 15, 'TRBVA*01': 6}


2022-12-28 18:50:41,025 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 2, 'TRBV7-5*01': 29, 'TRBVA*01': 3}


2022-12-28 18:50:41,129 - assign_clone_ids.ipynb - INFO - Participant P00262 (GeneLocus.TCR) has 156928 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00262.tsv'), (203158, 132), (164119, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:50:41,271 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 8, 'TRBVA*01': 2}


2022-12-28 18:50:41,905 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 16, 'TRBV7-5*02': 26, 'TRBVA*01': 7}


2022-12-28 18:50:42,520 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBV7-5*02': 33, 'TRBVA*01': 5}


2022-12-28 18:50:42,607 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 6, 'TRBV7-5*02': 30, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


2022-12-28 18:50:42,633 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 49, 'TRBVA*01': 5}


2022-12-28 18:50:42,799 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 26, 'TRBV8-2*01': 1}


2022-12-28 18:50:42,892 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 29, 'TRBVA*01': 1}


2022-12-28 18:50:43,215 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 6}


2022-12-28 18:50:43,217 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 39, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 18:50:43,422 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBVA*01': 3}


2022-12-28 18:50:45,106 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 29, 'TRBVA*01': 6}


2022-12-28 18:50:45,202 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 3}


2022-12-28 18:50:45,228 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 5}




2022-12-28 18:50:45,711 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 25, 'TRBVA*01': 3}




2022-12-28 18:50:46,330 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBV7-5*02': 40, 'TRBVA*01': 5}




2022-12-28 18:50:46,534 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 29, 'TRBV7-5*02': 30, 'TRBVA*01': 7}




2022-12-28 18:50:47,261 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 35, 'TRBVA*01': 4}


2022-12-28 18:50:48,083 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 18:50:48,552 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 39, 'TRBVA*01': 9}




2022-12-28 18:50:49,619 - assign_clone_ids.ipynb - INFO - Participant P00280 (GeneLocus.TCR) has 109894 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00280.parquet.


2022-12-28 18:50:50,191 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 34, 'TRBVA*01': 7}


2022-12-28 18:50:51,201 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 33, 'TRBVA*01': 5}




2022-12-28 18:50:51,893 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBV7-5*02': 32, 'TRBVA*01': 9}






2022-12-28 18:50:52,534 - assign_clone_ids.ipynb - INFO - Participant P00257 (GeneLocus.TCR) has 211365 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00257.parquet.


2022-12-28 18:50:52,694 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 39, 'TRBVA*01': 2}


2022-12-28 18:50:53,061 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 65, 'TRBVA*01': 4}


2022-12-28 18:50:53,736 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 51, 'TRBV8-1*01': 1, 'TRBVA*01': 8}


2022-12-28 18:50:54,298 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 20, 'TRBVA*01': 7}




2022-12-28 18:50:54,405 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 62, 'TRBVA*01': 11}














  df = pd.read_csv(











2022-12-28 18:51:01,255 - assign_clone_ids.ipynb - INFO - Participant P00260 (GeneLocus.TCR) has 106860 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00260.tsv'), (137881, 132), (110229, 144), <GeneLocus.TCR: 2>)]




















2022-12-28 18:51:05,315 - assign_clone_ids.ipynb - INFO - Participant P00262 (GeneLocus.TCR) has 156625 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00262.parquet.


2022-12-28 18:51:06,575 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 3}




2022-12-28 18:51:07,203 - assign_clone_ids.ipynb - INFO - Participant P00264 (GeneLocus.TCR) has 116492 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00264.tsv'), (149371, 132), (122301, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 18:51:08,059 - assign_clone_ids.ipynb - INFO - Participant P00269 (GeneLocus.TCR) has 99453 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00269.tsv'), (128987, 132), (103221, 144), <GeneLocus.TCR: 2>)]
















2022-12-28 18:51:13,791 - assign_clone_ids.ipynb - INFO - Participant P00268 (GeneLocus.TCR) has 165557 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00268.tsv'), (216659, 132), (173243, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:51:14,494 - assign_clone_ids.ipynb - INFO - Participant P00242 (GeneLocus.TCR) has 214737 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00242.tsv'), (281470, 132), (227985, 144), <GeneLocus.TCR: 2>)]
















2022-12-28 18:51:17,658 - assign_clone_ids.ipynb - INFO - Participant P00235 (GeneLocus.TCR) has 214313 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00235.tsv'), (300437, 132), (227958, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:51:18,898 - assign_clone_ids.ipynb - INFO - Participant P00266 (GeneLocus.TCR) has 150639 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00266.tsv'), (189509, 132), (154853, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:51:20,961 - assign_clone_ids.ipynb - INFO - Participant P00273 (GeneLocus.TCR) has 182825 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00273.tsv'), (249302, 132), (190982, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:51:23,598 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 3}




  df = pd.read_csv(



2022-12-28 18:51:34,002 - assign_clone_ids.ipynb - INFO - Participant P00281 (GeneLocus.TCR) has 83170 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00281.tsv'), (113822, 132), (90196, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:51:36,385 - assign_clone_ids.ipynb - INFO - Participant P00267 (GeneLocus.TCR) has 197867 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00267.tsv'), (268971, 132), (207181, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:51:45,003 - assign_clone_ids.ipynb - INFO - Participant P00281 (GeneLocus.TCR) has 82995 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00281.parquet.




2022-12-28 18:52:00,016 - assign_clone_ids.ipynb - INFO - Participant P00282 (GeneLocus.TCR) has 137735 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00282.tsv'), (174767, 132), (142378, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:52:00,909 - assign_clone_ids.ipynb - INFO - Participant P00260 (GeneLocus.TCR) has 106725 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00260.parquet.


  df = pd.read_csv(



2022-12-28 18:52:11,739 - assign_clone_ids.ipynb - INFO - Participant P00270 (GeneLocus.TCR) has 145769 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00270.tsv'), (193130, 132), (154255, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:11,746 - assign_clone_ids.ipynb - INFO - Participant P00275 (GeneLocus.TCR) has 146174 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00275.tsv'), (190172, 132), (154137, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





2022-12-28 18:52:16,122 - assign_clone_ids.ipynb - INFO - Participant P00258 (GeneLocus.TCR) has 192710 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00258.tsv'), (252105, 132), (201395, 144), <GeneLocus.TCR: 2>)]








2022-12-28 18:52:21,701 - assign_clone_ids.ipynb - INFO - Participant P00282 (GeneLocus.TCR) has 137500 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00282.parquet.


2022-12-28 18:52:21,897 - assign_clone_ids.ipynb - INFO - Participant P00264 (GeneLocus.TCR) has 116360 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00264.parquet.


2022-12-28 18:52:22,066 - assign_clone_ids.ipynb - INFO - Participant P00269 (GeneLocus.TCR) has 99344 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00269.parquet.


2022-12-28 18:52:22,146 - assign_clone_ids.ipynb - INFO - Participant P00271 (GeneLocus.TCR) has 209434 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00271.tsv'), (269604, 132), (220812, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:27,903 - assign_clone_ids.ipynb - INFO - Participant P00268 (GeneLocus.TCR) has 165318 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00268.parquet.


2022-12-28 18:52:29,309 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 8, 'TRBVA*01': 1}


2022-12-28 18:52:31,887 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 18, 'TRBVA*01': 4}


2022-12-28 18:52:32,464 - assign_clone_ids.ipynb - INFO - Participant P00278 (GeneLocus.TCR) has 173973 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00278.tsv'), (240727, 132), (182769, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:33,269 - assign_clone_ids.ipynb - INFO - Participant P00242 (GeneLocus.TCR) has 214489 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00242.parquet.


2022-12-28 18:52:34,509 - assign_clone_ids.ipynb - INFO - Participant P00266 (GeneLocus.TCR) has 150450 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00266.parquet.






2022-12-28 18:52:38,307 - assign_clone_ids.ipynb - INFO - Participant P00261 (GeneLocus.TCR) has 200216 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00261.tsv'), (273542, 132), (211287, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:52:39,533 - assign_clone_ids.ipynb - INFO - Participant P00231 (GeneLocus.TCR) has 219516 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00231.tsv'), (293062, 132), (229789, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:44,073 - assign_clone_ids.ipynb - INFO - Participant P00275 (GeneLocus.TCR) has 145957 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00275.parquet.


2022-12-28 18:52:44,019 - assign_clone_ids.ipynb - INFO - Participant P00238 (GeneLocus.TCR) has 280503 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00238.tsv'), (374593, 132), (297127, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:44,226 - assign_clone_ids.ipynb - INFO - Participant P00270 (GeneLocus.TCR) has 145567 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00270.parquet.


  df = pd.read_csv(



2022-12-28 18:52:46,294 - assign_clone_ids.ipynb - INFO - Participant P00252 (GeneLocus.TCR) has 270222 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00252.tsv'), (370692, 132), (285673, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:46,729 - assign_clone_ids.ipynb - INFO - Participant P00273 (GeneLocus.TCR) has 182483 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00273.parquet.


2022-12-28 18:52:47,135 - assign_clone_ids.ipynb - INFO - Participant P00235 (GeneLocus.TCR) has 214001 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00235.parquet.


2022-12-28 18:52:47,992 - assign_clone_ids.ipynb - INFO - Participant P00263 (GeneLocus.TCR) has 276485 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00263.tsv'), (367269, 132), (287871, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:48,302 - assign_clone_ids.ipynb - INFO - Participant P00267 (GeneLocus.TCR) has 197613 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00267.parquet.




2022-12-28 18:52:48,550 - assign_clone_ids.ipynb - INFO - Participant P00265 (GeneLocus.TCR) has 262827 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00265.tsv'), (356561, 132), (276878, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:48,683 - assign_clone_ids.ipynb - INFO - Participant P00274 (GeneLocus.TCR) has 250399 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00274.tsv'), (316871, 132), (262140, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:52:52,043 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9}


2022-12-28 18:52:52,065 - assign_clone_ids.ipynb - INFO - Participant P00244 (GeneLocus.TCR) has 240871 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00244.tsv'), (313198, 132), (251646, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:52,132 - assign_clone_ids.ipynb - INFO - Participant P00276 (GeneLocus.TCR) has 237495 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00276.tsv'), (334778, 132), (258984, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:52,266 - assign_clone_ids.ipynb - INFO - Participant P00239 (GeneLocus.TCR) has 244766 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00239.tsv'), (319854, 132), (262086, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:52:56,764 - assign_clone_ids.ipynb - INFO - Participant P00285 (GeneLocus.TCR) has 105577 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00285.tsv'), (136982, 132), (109870, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:56,858 - assign_clone_ids.ipynb - INFO - Participant P00258 (GeneLocus.TCR) has 192482 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00258.parquet.


2022-12-28 18:52:56,859 - assign_clone_ids.ipynb - INFO - Participant P00259 (GeneLocus.TCR) has 293967 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00259.tsv'), (391810, 132), (310946, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:57,232 - assign_clone_ids.ipynb - INFO - Participant P00254 (GeneLocus.TCR) has 301739 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00254.tsv'), (398658, 132), (321901, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:57,575 - assign_clone_ids.ipynb - INFO - Participant P00249 (GeneLocus.TCR) has 263807 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00249.tsv'), (352257, 132), (279885, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:52:58,018 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 53, 'TRBVA*01': 4}


  df = pd.read_csv(



2022-12-28 18:53:02,303 - assign_clone_ids.ipynb - INFO - Participant P00236 (GeneLocus.TCR) has 300924 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00236.tsv'), (387346, 132), (314732, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:53:05,590 - assign_clone_ids.ipynb - INFO - Participant P00255 (GeneLocus.TCR) has 311357 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00255.tsv'), (413794, 132), (326224, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:53:05,711 - assign_clone_ids.ipynb - INFO - Participant P00289 (GeneLocus.TCR) has 61842 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00289.tsv'), (82690, 132), (65580, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:53:07,989 - assign_clone_ids.ipynb - INFO - Participant P00272 (GeneLocus.TCR) has 246729 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00272.tsv'), (330224, 132), (260169, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:53:08,217 - assign_clone_ids.ipynb - INFO - Participant P00279 (GeneLocus.TCR) has 287540 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00279.tsv'), (372585, 132), (300210, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:53:08,410 - assign_clone_ids.ipynb - INFO - Participant P00234 (GeneLocus.TCR) has 324901 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00234.tsv'), (417062, 132), (344413, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:53:15,921 - assign_clone_ids.ipynb - INFO - Participant P00289 (GeneLocus.TCR) has 61752 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00289.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 18:53:33,105 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 8}


2022-12-28 18:53:33,448 - assign_clone_ids.ipynb - INFO - Participant P00278 (GeneLocus.TCR) has 173707 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00278.parquet.


  df = pd.read_csv(



  df = pd.read_csv(







  df = pd.read_csv(



2022-12-28 18:53:54,845 - assign_clone_ids.ipynb - INFO - Participant P00224 (GeneLocus.TCR) has 321608 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00224.tsv'), (436371, 132), (341299, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:53:56,091 - assign_clone_ids.ipynb - INFO - Participant P00296 (GeneLocus.TCR) has 72487 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00296.tsv'), (90535, 132), (73703, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:54:04,755 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 54, 'TRBVA*01': 6}


2022-12-28 18:54:15,469 - assign_clone_ids.ipynb - INFO - Participant P00229 (GeneLocus.TCR) has 308185 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00229.tsv'), (399434, 132), (324555, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:54:16,568 - assign_clone_ids.ipynb - INFO - Participant P00296 (GeneLocus.TCR) has 72340 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00296.parquet.




  df = pd.read_csv(



2022-12-28 18:54:39,128 - assign_clone_ids.ipynb - INFO - Participant P00233 (GeneLocus.TCR) has 339685 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00233.tsv'), (464834, 132), (356153, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:54:41,826 - assign_clone_ids.ipynb - INFO - Participant P00271 (GeneLocus.TCR) has 209210 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00271.parquet.


2022-12-28 18:54:58,126 - assign_clone_ids.ipynb - INFO - Participant P00230 (GeneLocus.TCR) has 308269 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00230.tsv'), (411488, 132), (325326, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(







2022-12-28 18:55:12,511 - assign_clone_ids.ipynb - INFO - Participant P00285 (GeneLocus.TCR) has 105445 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00285.parquet.


2022-12-28 18:55:23,255 - assign_clone_ids.ipynb - INFO - Participant P00288 (GeneLocus.TCR) has 262398 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00288.tsv'), (352389, 132), (276827, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:56:04,681 - assign_clone_ids.ipynb - INFO - Participant P00288 (GeneLocus.TCR) has 261965 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00288.parquet.


  df = pd.read_csv(



2022-12-28 18:56:25,880 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBVA*01': 1}




2022-12-28 18:56:43,793 - assign_clone_ids.ipynb - INFO - Participant P00303 (GeneLocus.TCR) has 60211 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00303.tsv'), (74991, 132), (61529, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:56:53,118 - assign_clone_ids.ipynb - INFO - Participant P00303 (GeneLocus.TCR) has 60138 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00303.parquet.


  df = pd.read_csv(



2022-12-28 18:57:48,537 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 4}


2022-12-28 18:57:51,828 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBV7-5*02': 29, 'TRBV8-1*01': 1, 'TRBVA*01': 4}






2022-12-28 18:58:37,563 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 46, 'TRBVA*01': 1}


2022-12-28 18:58:43,402 - assign_clone_ids.ipynb - INFO - Participant P00231 (GeneLocus.TCR) has 219230 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00231.parquet.


2022-12-28 18:58:50,165 - assign_clone_ids.ipynb - INFO - Participant P00284 (GeneLocus.TCR) has 236663 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00284.tsv'), (297601, 132), (245738, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:58:53,114 - assign_clone_ids.ipynb - INFO - Participant P00304 (GeneLocus.TCR) has 266732 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00304.tsv'), (368726, 132), (283732, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:58:56,264 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 21, 'TRBVA*01': 7}


2022-12-28 18:58:58,464 - assign_clone_ids.ipynb - INFO - Participant P00261 (GeneLocus.TCR) has 199972 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00261.parquet.


2022-12-28 18:58:59,766 - assign_clone_ids.ipynb - INFO - Participant P00265 (GeneLocus.TCR) has 262452 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00265.parquet.


2022-12-28 18:59:00,191 - assign_clone_ids.ipynb - INFO - Participant P00245 (GeneLocus.TCR) has 391404 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00245.tsv'), (532927, 132), (415180, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 18:59:02,811 - assign_clone_ids.ipynb - INFO - Participant P00238 (GeneLocus.TCR) has 280123 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00238.parquet.


2022-12-28 18:59:03,182 - assign_clone_ids.ipynb - INFO - Participant P00239 (GeneLocus.TCR) has 244498 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00239.parquet.


2022-12-28 18:59:04,466 - assign_clone_ids.ipynb - INFO - Participant P00244 (GeneLocus.TCR) has 240513 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00244.parquet.


2022-12-28 18:59:04,637 - assign_clone_ids.ipynb - INFO - Participant P00276 (GeneLocus.TCR) has 237202 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00276.parquet.




2022-12-28 18:59:05,410 - assign_clone_ids.ipynb - INFO - Participant P00274 (GeneLocus.TCR) has 250019 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00274.parquet.


2022-12-28 18:59:06,272 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 32, 'TRBVA*01': 1}


2022-12-28 18:59:06,432 - assign_clone_ids.ipynb - INFO - Participant P00272 (GeneLocus.TCR) has 246461 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00272.parquet.


2022-12-28 18:59:06,660 - assign_clone_ids.ipynb - INFO - Participant P00252 (GeneLocus.TCR) has 269912 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00252.parquet.


2022-12-28 18:59:07,105 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 48}


2022-12-28 18:59:07,287 - assign_clone_ids.ipynb - INFO - Participant P00263 (GeneLocus.TCR) has 275983 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00263.parquet.


2022-12-28 18:59:07,581 - assign_clone_ids.ipynb - INFO - Participant P00259 (GeneLocus.TCR) has 293599 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00259.parquet.


2022-12-28 18:59:07,868 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 34, 'TRBVA*01': 1}


2022-12-28 18:59:08,338 - assign_clone_ids.ipynb - INFO - Participant P00254 (GeneLocus.TCR) has 301135 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00254.parquet.


2022-12-28 18:59:08,802 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV8-2*01': 2, 'TRBVA*01': 3}


2022-12-28 18:59:09,273 - assign_clone_ids.ipynb - INFO - Participant P00255 (GeneLocus.TCR) has 310761 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00255.parquet.


2022-12-28 18:59:09,168 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 12, 'TRBVA*01': 2}




2022-12-28 18:59:09,436 - assign_clone_ids.ipynb - INFO - Participant P00249 (GeneLocus.TCR) has 263444 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00249.parquet.


2022-12-28 18:59:09,455 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 41, 'TRBVA*01': 3}




2022-12-28 18:59:09,671 - assign_clone_ids.ipynb - INFO - Participant P00279 (GeneLocus.TCR) has 287028 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00279.parquet.


2022-12-28 18:59:09,677 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBVA*01': 3}


2022-12-28 18:59:09,944 - assign_clone_ids.ipynb - INFO - Participant P00236 (GeneLocus.TCR) has 300319 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00236.parquet.


2022-12-28 18:59:11,423 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 37, 'TRBVA*01': 5}


2022-12-28 18:59:12,111 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 38, 'TRBVA*01': 6}




2022-12-28 18:59:12,734 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 35, 'TRBVA*01': 10}


2022-12-28 18:59:13,065 - assign_clone_ids.ipynb - INFO - Participant P00224 (GeneLocus.TCR) has 321125 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00224.parquet.


2022-12-28 18:59:14,274 - assign_clone_ids.ipynb - INFO - Participant P00234 (GeneLocus.TCR) has 324480 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00234.parquet.


2022-12-28 18:59:14,818 - assign_clone_ids.ipynb - INFO - Participant P00229 (GeneLocus.TCR) has 307765 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00229.parquet.




2022-12-28 18:59:18,593 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 2, 'TRBV7-5*02': 93, 'TRBVA*01': 12}








2022-12-28 18:59:19,050 - assign_clone_ids.ipynb - INFO - Participant P00233 (GeneLocus.TCR) has 339133 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00233.parquet.






2022-12-28 18:59:18,899 - assign_clone_ids.ipynb - INFO - Participant P00230 (GeneLocus.TCR) has 307888 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00230.parquet.














2022-12-28 18:59:24,497 - assign_clone_ids.ipynb - INFO - Participant P00299 (GeneLocus.TCR) has 140638 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00299.tsv'), (175561, 132), (143893, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:31,933 - assign_clone_ids.ipynb - INFO - Participant P00298 (GeneLocus.TCR) has 174090 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00298.tsv'), (235991, 132), (182044, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:59:24,984 - assign_clone_ids.ipynb - INFO - Participant P00283 (GeneLocus.TCR) has 288466 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00283.tsv'), (388624, 132), (302566, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:31,000 - assign_clone_ids.ipynb - INFO - Participant P00284 (GeneLocus.TCR) has 236356 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00284.parquet.


  df = pd.read_csv(



2022-12-28 18:59:31,402 - assign_clone_ids.ipynb - INFO - Participant P00304 (GeneLocus.TCR) has 266397 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00304.parquet.




2022-12-28 18:59:35,570 - assign_clone_ids.ipynb - INFO - Participant P00291 (GeneLocus.TCR) has 172401 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00291.tsv'), (227167, 132), (180300, 144), <GeneLocus.TCR: 2>)]






2022-12-28 18:59:39,203 - assign_clone_ids.ipynb - INFO - Participant P00294 (GeneLocus.TCR) has 171154 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00294.tsv'), (238901, 132), (182484, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





2022-12-28 18:59:50,061 - assign_clone_ids.ipynb - INFO - Participant P00300 (GeneLocus.TCR) has 213250 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00300.tsv'), (268852, 132), (223711, 144), <GeneLocus.TCR: 2>)]




2022-12-28 18:59:52,167 - assign_clone_ids.ipynb - INFO - Participant P00298 (GeneLocus.TCR) has 173901 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00298.parquet.


2022-12-28 18:59:46,654 - assign_clone_ids.ipynb - INFO - Participant P00301 (GeneLocus.TCR) has 197175 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00301.tsv'), (256572, 132), (205487, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:52,698 - assign_clone_ids.ipynb - INFO - Participant P00292 (GeneLocus.TCR) has 221182 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00292.tsv'), (297259, 132), (233114, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:48,622 - assign_clone_ids.ipynb - INFO - Participant P00245 (GeneLocus.TCR) has 390634 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00245.parquet.


2022-12-28 18:59:48,378 - assign_clone_ids.ipynb - INFO - Participant P00299 (GeneLocus.TCR) has 140376 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00299.parquet.


  df = pd.read_csv(



2022-12-28 18:59:57,220 - assign_clone_ids.ipynb - INFO - Participant P00290 (GeneLocus.TCR) has 190965 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00290.tsv'), (255953, 132), (199791, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:57,479 - assign_clone_ids.ipynb - INFO - Participant P00287 (GeneLocus.TCR) has 188900 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00287.tsv'), (254463, 132), (199398, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:58,392 - assign_clone_ids.ipynb - INFO - Participant P00297 (GeneLocus.TCR) has 217849 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00297.tsv'), (269034, 132), (228146, 144), <GeneLocus.TCR: 2>)]


2022-12-28 18:59:58,439 - assign_clone_ids.ipynb - INFO - Participant P00291 (GeneLocus.TCR) has 172199 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00291.parquet.


2022-12-28 19:00:00,822 - assign_clone_ids.ipynb - INFO - Participant P00294 (GeneLocus.TCR) has 170941 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00294.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:00:04,645 - assign_clone_ids.ipynb - INFO - Participant P00283 (GeneLocus.TCR) has 288055 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00283.parquet.


  df = pd.read_csv(



2022-12-28 19:00:06,163 - assign_clone_ids.ipynb - INFO - Participant P00295 (GeneLocus.TCR) has 228291 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00295.tsv'), (307462, 132), (247485, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:00:06,135 - assign_clone_ids.ipynb - INFO - Participant P00302 (GeneLocus.TCR) has 222417 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00302.tsv'), (298518, 132), (234904, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:00:10,669 - assign_clone_ids.ipynb - INFO - Participant P00293 (GeneLocus.TCR) has 255152 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00293.tsv'), (332843, 132), (268661, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:00:16,771 - assign_clone_ids.ipynb - INFO - Participant P00301 (GeneLocus.TCR) has 196901 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00301.parquet.


2022-12-28 19:00:19,147 - assign_clone_ids.ipynb - INFO - Participant P00300 (GeneLocus.TCR) has 213008 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00300.parquet.


2022-12-28 19:00:19,273 - assign_clone_ids.ipynb - INFO - Participant P00287 (GeneLocus.TCR) has 188632 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00287.parquet.


2022-12-28 19:00:20,054 - assign_clone_ids.ipynb - INFO - Participant P00290 (GeneLocus.TCR) has 190749 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00290.parquet.


2022-12-28 19:00:25,298 - assign_clone_ids.ipynb - INFO - Participant P00297 (GeneLocus.TCR) has 217563 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00297.parquet.


2022-12-28 19:00:25,298 - assign_clone_ids.ipynb - INFO - Participant P00292 (GeneLocus.TCR) has 220940 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00292.parquet.


2022-12-28 19:00:31,894 - assign_clone_ids.ipynb - INFO - Participant P00302 (GeneLocus.TCR) has 222138 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00302.parquet.


2022-12-28 19:00:32,614 - assign_clone_ids.ipynb - INFO - Participant P00295 (GeneLocus.TCR) has 227968 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00295.parquet.


2022-12-28 19:00:34,489 - assign_clone_ids.ipynb - INFO - Participant P00286 (GeneLocus.TCR) has 329221 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00286.tsv'), (453790, 132), (349714, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:00:40,204 - assign_clone_ids.ipynb - INFO - Participant P00293 (GeneLocus.TCR) has 254756 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00293.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:01:10,159 - assign_clone_ids.ipynb - INFO - Participant P00286 (GeneLocus.TCR) has 328765 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00286.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:02:56,177 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBVA*01': 1}




2022-12-28 19:03:00,276 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 10}


2022-12-28 19:03:03,392 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8}


2022-12-28 19:03:03,579 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 5}






2022-12-28 19:03:04,773 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 5, 'TRBVA*01': 5}




2022-12-28 19:03:08,301 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 1}








2022-12-28 19:03:09,010 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11}






2022-12-28 19:03:10,820 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 19}


2022-12-28 19:03:11,016 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 7}


2022-12-28 19:03:12,060 - assign_clone_ids.ipynb - INFO - Participant P00311 (GeneLocus.TCR) has 74832 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00311.tsv'), (94664, 132), (77137, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:13,800 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 19:03:14,332 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 19, 'TRBVA*01': 3}


2022-12-28 19:03:14,472 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 4}








2022-12-28 19:03:14,698 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 19, 'TRBVA*01': 3}


2022-12-28 19:03:16,590 - assign_clone_ids.ipynb - INFO - Participant P00340 (GeneLocus.TCR) has 82589 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00340.tsv'), (108054, 132), (85869, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:16,623 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 7, 'TRBVA*01': 1}












2022-12-28 19:03:21,255 - assign_clone_ids.ipynb - INFO - Participant P00327 (GeneLocus.TCR) has 93031 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00327.tsv'), (122074, 132), (97343, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:22,626 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 29}


2022-12-28 19:03:22,631 - assign_clone_ids.ipynb - INFO - Participant P00312 (GeneLocus.TCR) has 101150 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00312.tsv'), (125386, 132), (103558, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:23,307 - assign_clone_ids.ipynb - INFO - Participant P00311 (GeneLocus.TCR) has 74709 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00311.parquet.






2022-12-28 19:03:25,890 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 12, 'TRBVA*01': 3}


2022-12-28 19:03:26,612 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 12, 'TRBV7-5*02': 14, 'TRBVA*01': 2}


2022-12-28 19:03:26,779 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 19, 'TRBV8-1*01': 1, 'TRBVA*01': 3}






2022-12-28 19:03:28,502 - assign_clone_ids.ipynb - INFO - Participant P00328 (GeneLocus.TCR) has 89559 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00328.tsv'), (121594, 132), (97673, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:28,926 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 24, 'TRBV8-1*01': 1, 'TRBVA*01': 1}


2022-12-28 19:03:29,362 - assign_clone_ids.ipynb - INFO - Participant P00326 (GeneLocus.TCR) has 108814 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00326.tsv'), (140531, 132), (114107, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:30,385 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 27, 'TRBVA*01': 1}


2022-12-28 19:03:30,939 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 7}


2022-12-28 19:03:31,803 - assign_clone_ids.ipynb - INFO - Participant P00340 (GeneLocus.TCR) has 82356 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00340.parquet.




2022-12-28 19:03:32,471 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 31, 'TRBVA*01': 6}


2022-12-28 19:03:33,570 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34}


2022-12-28 19:03:33,644 - assign_clone_ids.ipynb - INFO - Participant P00323 (GeneLocus.TCR) has 88291 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00323.tsv'), (135721, 132), (104721, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:34,630 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 28, 'TRBVA*01': 3}


2022-12-28 19:03:34,851 - assign_clone_ids.ipynb - INFO - Participant P00312 (GeneLocus.TCR) has 100960 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00312.parquet.


2022-12-28 19:03:35,851 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBVA*01': 1}


2022-12-28 19:03:36,089 - assign_clone_ids.ipynb - INFO - Participant P00327 (GeneLocus.TCR) has 92926 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00327.parquet.


2022-12-28 19:03:36,378 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 5}


2022-12-28 19:03:36,953 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 6}


2022-12-28 19:03:37,694 - assign_clone_ids.ipynb - INFO - Participant P00343 (GeneLocus.TCR) has 113284 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00343.tsv'), (146635, 132), (116358, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:40,437 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 36}


2022-12-28 19:03:41,250 - assign_clone_ids.ipynb - INFO - Participant P00334 (GeneLocus.TCR) has 118560 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00334.tsv'), (156167, 132), (121967, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:03:41,671 - assign_clone_ids.ipynb - INFO - Participant P00324 (GeneLocus.TCR) has 116147 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00324.tsv'), (152498, 132), (119697, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





  df = pd.read_csv(





2022-12-28 19:03:42,306 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 22}


2022-12-28 19:03:42,986 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 51, 'TRBVA*01': 7}














2022-12-28 19:03:44,024 - assign_clone_ids.ipynb - INFO - Participant P00328 (GeneLocus.TCR) has 89390 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00328.parquet.


2022-12-28 19:03:44,415 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBV7-5*02': 24, 'TRBVA*01': 4}


2022-12-28 19:03:44,570 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 40, 'TRBVA*01': 9}




2022-12-28 19:03:45,105 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 87, 'TRBVA*01': 2}


2022-12-28 19:03:45,474 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 14, 'TRBV8-2*01': 1, 'TRBVA*01': 2}




2022-12-28 19:03:45,746 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 17, 'TRBVA*01': 3}


2022-12-28 19:03:46,266 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 27, 'TRBVA*01': 8}


2022-12-28 19:03:46,545 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 63, 'TRBVA*01': 6}


2022-12-28 19:03:47,136 - assign_clone_ids.ipynb - INFO - Participant P00307 (GeneLocus.TCR) has 130019 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00307.tsv'), (157499, 132), (133355, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:03:47,726 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 51, 'TRBVA*01': 8}












2022-12-28 19:03:48,484 - assign_clone_ids.ipynb - INFO - Participant P00326 (GeneLocus.TCR) has 108678 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00326.parquet.




2022-12-28 19:03:50,154 - assign_clone_ids.ipynb - INFO - Participant P00308 (GeneLocus.TCR) has 151524 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00308.tsv'), (190250, 132), (159918, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:03:50,959 - assign_clone_ids.ipynb - INFO - Participant P00323 (GeneLocus.TCR) has 88157 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00323.parquet.






2022-12-28 19:03:51,949 - assign_clone_ids.ipynb - INFO - Participant P00309 (GeneLocus.TCR) has 136738 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00309.tsv'), (173540, 132), (141088, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:03:53,188 - assign_clone_ids.ipynb - INFO - Participant P00338 (GeneLocus.TCR) has 133931 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00338.tsv'), (167813, 132), (139696, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(









  df = pd.read_csv(



2022-12-28 19:03:57,315 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 43, 'TRBVA*01': 8}


2022-12-28 19:03:57,701 - assign_clone_ids.ipynb - INFO - Participant P00343 (GeneLocus.TCR) has 113143 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00343.parquet.








2022-12-28 19:04:00,419 - assign_clone_ids.ipynb - INFO - Participant P00334 (GeneLocus.TCR) has 118382 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00334.parquet.












2022-12-28 19:04:01,758 - assign_clone_ids.ipynb - INFO - Participant P00306 (GeneLocus.TCR) has 144802 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00306.tsv'), (193972, 132), (151030, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:04:02,312 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBVA*01': 1}










2022-12-28 19:04:05,724 - assign_clone_ids.ipynb - INFO - Participant P00324 (GeneLocus.TCR) has 116027 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00324.parquet.




2022-12-28 19:04:06,752 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 34, 'TRBV7-5*02': 59, 'TRBVA*01': 8}




2022-12-28 19:04:09,018 - assign_clone_ids.ipynb - INFO - Participant P00307 (GeneLocus.TCR) has 129759 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00307.parquet.


2022-12-28 19:04:09,720 - assign_clone_ids.ipynb - INFO - Participant P00322 (GeneLocus.TCR) has 181850 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00322.tsv'), (244725, 132), (189193, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:04:10,862 - assign_clone_ids.ipynb - INFO - Participant P00315 (GeneLocus.TCR) has 186853 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00315.tsv'), (254735, 132), (200885, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:11,467 - assign_clone_ids.ipynb - INFO - Participant P00308 (GeneLocus.TCR) has 151310 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00308.parquet.


2022-12-28 19:04:13,174 - assign_clone_ids.ipynb - INFO - Participant P00341 (GeneLocus.TCR) has 166134 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00341.tsv'), (217313, 132), (172315, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:04:15,968 - assign_clone_ids.ipynb - INFO - Participant P00309 (GeneLocus.TCR) has 136530 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00309.parquet.


2022-12-28 19:04:17,214 - assign_clone_ids.ipynb - INFO - Participant P00305 (GeneLocus.TCR) has 177465 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00305.tsv'), (221324, 132), (184912, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:17,756 - assign_clone_ids.ipynb - INFO - Participant P00338 (GeneLocus.TCR) has 133738 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00338.parquet.


2022-12-28 19:04:18,233 - assign_clone_ids.ipynb - INFO - Participant P00335 (GeneLocus.TCR) has 189183 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00335.tsv'), (263457, 132), (198410, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:18,713 - assign_clone_ids.ipynb - INFO - Participant P00329 (GeneLocus.TCR) has 198545 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00329.tsv'), (269169, 132), (206417, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:19,699 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 2, 'TRBV7-5*01': 13, 'TRBV7-5*02': 24, 'TRBVA*01': 3}






2022-12-28 19:04:21,579 - assign_clone_ids.ipynb - INFO - Participant P00336 (GeneLocus.TCR) has 189303 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00336.tsv'), (252583, 132), (197869, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:21,910 - assign_clone_ids.ipynb - INFO - Participant P00337 (GeneLocus.TCR) has 175137 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00337.tsv'), (231000, 132), (183107, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:23,145 - assign_clone_ids.ipynb - INFO - Participant P00313 (GeneLocus.TCR) has 208041 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00313.tsv'), (266817, 132), (219563, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:23,750 - assign_clone_ids.ipynb - INFO - Participant P00321 (GeneLocus.TCR) has 217076 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00321.tsv'), (292468, 132), (225286, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:24,097 - assign_clone_ids.ipynb - INFO - Participant P00314 (GeneLocus.TCR) has 203241 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00314.tsv'), (258550, 132), (211426, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:24,592 - assign_clone_ids.ipynb - INFO - Participant P00339 (GeneLocus.TCR) has 194890 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00339.tsv'), (249924, 132), (204123, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:25,868 - assign_clone_ids.ipynb - INFO - Participant P00306 (GeneLocus.TCR) has 144629 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00306.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:04:26,869 - assign_clone_ids.ipynb - INFO - Participant P00346 (GeneLocus.TCR) has 93204 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00346.tsv'), (123510, 132), (97719, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(







2022-12-28 19:04:31,519 - assign_clone_ids.ipynb - INFO - Participant P00320 (GeneLocus.TCR) has 199607 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00320.tsv'), (269583, 132), (210922, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:04:32,582 - assign_clone_ids.ipynb - INFO - Participant P00322 (GeneLocus.TCR) has 181629 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00322.parquet.


2022-12-28 19:04:32,800 - assign_clone_ids.ipynb - INFO - Participant P00325 (GeneLocus.TCR) has 230721 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00325.tsv'), (308025, 132), (243339, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:33,267 - assign_clone_ids.ipynb - INFO - Participant P00344 (GeneLocus.TCR) has 238607 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00344.tsv'), (314634, 132), (249956, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:04:35,243 - assign_clone_ids.ipynb - INFO - Participant P00341 (GeneLocus.TCR) has 165923 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00341.parquet.


2022-12-28 19:04:35,243 - assign_clone_ids.ipynb - INFO - Participant P00315 (GeneLocus.TCR) has 186655 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00315.parquet.


2022-12-28 19:04:35,840 - assign_clone_ids.ipynb - INFO - Participant P00330 (GeneLocus.TCR) has 242946 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00330.tsv'), (323776, 132), (254982, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:36,485 - assign_clone_ids.ipynb - INFO - Participant P00319 (GeneLocus.TCR) has 223540 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00319.tsv'), (308041, 132), (237836, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:37,514 - assign_clone_ids.ipynb - INFO - Participant P00318 (GeneLocus.TCR) has 244138 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00318.tsv'), (331007, 132), (254516, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:38,721 - assign_clone_ids.ipynb - INFO - Participant P00317 (GeneLocus.TCR) has 256083 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00317.tsv'), (347656, 132), (269965, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:04:49,447 - assign_clone_ids.ipynb - INFO - Participant P00331 (GeneLocus.TCR) has 244506 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00331.tsv'), (332102, 132), (256310, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:04:49,979 - assign_clone_ids.ipynb - INFO - Participant P00342 (GeneLocus.TCR) has 229774 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00342.tsv'), (313221, 132), (246032, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:49,994 - assign_clone_ids.ipynb - INFO - Participant P00310 (GeneLocus.TCR) has 244090 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00310.tsv'), (315402, 132), (258726, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:04:50,756 - assign_clone_ids.ipynb - INFO - Participant P00346 (GeneLocus.TCR) has 93084 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00346.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:05:02,781 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV8-2*01': 1, 'TRBVA*01': 1}




2022-12-28 19:05:07,717 - assign_clone_ids.ipynb - INFO - Participant P00333 (GeneLocus.TCR) has 267740 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00333.tsv'), (366036, 132), (279946, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:05:08,382 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 15, 'TRBVA*01': 5}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:05:19,497 - assign_clone_ids.ipynb - INFO - Participant P00305 (GeneLocus.TCR) has 177193 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00305.parquet.


2022-12-28 19:05:24,235 - assign_clone_ids.ipynb - INFO - Participant P00358 (GeneLocus.TCR) has 54592 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00358.tsv'), (66179, 132), (55637, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:05:27,131 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 34, 'TRBVA*01': 3}


2022-12-28 19:05:27,915 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 24, 'TRBVA*01': 2}


  df = pd.read_csv(



2022-12-28 19:05:30,870 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 24, 'TRBVA*01': 4}








2022-12-28 19:05:42,115 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 58, 'TRBVA*01': 7}




2022-12-28 19:05:45,601 - assign_clone_ids.ipynb - INFO - Participant P00358 (GeneLocus.TCR) has 54533 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00358.parquet.


  df = pd.read_csv(



2022-12-28 19:05:59,901 - assign_clone_ids.ipynb - INFO - Participant P00347 (GeneLocus.TCR) has 196442 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00347.tsv'), (261847, 132), (205565, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:06:02,387 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 41, 'TRBVA*01': 1}










2022-12-28 19:06:22,349 - assign_clone_ids.ipynb - INFO - Participant P00359 (GeneLocus.TCR) has 100304 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00359.tsv'), (137079, 132), (105540, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:06:23,200 - assign_clone_ids.ipynb - INFO - Participant P00335 (GeneLocus.TCR) has 188865 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00335.parquet.


2022-12-28 19:06:28,189 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 2}


2022-12-28 19:06:34,748 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBVA*01': 1}


2022-12-28 19:06:38,280 - assign_clone_ids.ipynb - INFO - Participant P00353 (GeneLocus.TCR) has 155283 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00353.tsv'), (208047, 132), (161694, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:06:42,723 - assign_clone_ids.ipynb - INFO - Participant P00337 (GeneLocus.TCR) has 174875 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00337.parquet.


2022-12-28 19:06:43,040 - assign_clone_ids.ipynb - INFO - Participant P00329 (GeneLocus.TCR) has 198219 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00329.parquet.




2022-12-28 19:06:45,104 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 11, 'TRBV7-5*02': 18, 'TRBVA*01': 2}


2022-12-28 19:06:45,191 - assign_clone_ids.ipynb - INFO - Participant P00313 (GeneLocus.TCR) has 207798 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00313.parquet.


  df = pd.read_csv(



2022-12-28 19:06:45,495 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBVA*01': 2}




2022-12-28 19:06:46,221 - assign_clone_ids.ipynb - INFO - Participant P00336 (GeneLocus.TCR) has 188903 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00336.parquet.




2022-12-28 19:06:46,575 - assign_clone_ids.ipynb - INFO - Participant P00321 (GeneLocus.TCR) has 216739 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00321.parquet.


2022-12-28 19:06:46,768 - assign_clone_ids.ipynb - INFO - Participant P00361 (GeneLocus.TCR) has 145590 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00361.tsv'), (193477, 132), (150455, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:06:46,818 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 3}


2022-12-28 19:06:47,329 - assign_clone_ids.ipynb - INFO - Participant P00314 (GeneLocus.TCR) has 203049 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00314.parquet.


2022-12-28 19:06:48,055 - assign_clone_ids.ipynb - INFO - Participant P00320 (GeneLocus.TCR) has 199252 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00320.parquet.


2022-12-28 19:06:48,737 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 1}


2022-12-28 19:06:49,235 - assign_clone_ids.ipynb - INFO - Participant P00339 (GeneLocus.TCR) has 194571 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00339.parquet.


2022-12-28 19:06:49,386 - assign_clone_ids.ipynb - INFO - Participant P00325 (GeneLocus.TCR) has 230401 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00325.parquet.


2022-12-28 19:06:49,412 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 2, 'TRBV7-5*01': 4, 'TRBV7-5*02': 27, 'TRBVA*01': 4}


2022-12-28 19:06:49,669 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 25}


2022-12-28 19:06:49,822 - assign_clone_ids.ipynb - INFO - Participant P00330 (GeneLocus.TCR) has 242576 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00330.parquet.


2022-12-28 19:06:50,017 - assign_clone_ids.ipynb - INFO - Participant P00344 (GeneLocus.TCR) has 238282 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00344.parquet.


2022-12-28 19:06:50,640 - assign_clone_ids.ipynb - INFO - Participant P00318 (GeneLocus.TCR) has 243529 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00318.parquet.


  df = pd.read_csv(



2022-12-28 19:06:51,410 - assign_clone_ids.ipynb - INFO - Participant P00359 (GeneLocus.TCR) has 100172 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00359.parquet.


2022-12-28 19:06:51,813 - assign_clone_ids.ipynb - INFO - Participant P00331 (GeneLocus.TCR) has 244205 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00331.parquet.


2022-12-28 19:06:52,598 - assign_clone_ids.ipynb - INFO - Participant P00317 (GeneLocus.TCR) has 255719 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00317.parquet.


2022-12-28 19:06:52,787 - assign_clone_ids.ipynb - INFO - Participant P00319 (GeneLocus.TCR) has 223222 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00319.parquet.




2022-12-28 19:06:53,799 - assign_clone_ids.ipynb - INFO - Participant P00310 (GeneLocus.TCR) has 243738 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00310.parquet.




2022-12-28 19:06:54,557 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 48, 'TRBVA*01': 6}


2022-12-28 19:06:55,554 - assign_clone_ids.ipynb - INFO - Participant P00351 (GeneLocus.TCR) has 300118 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00351.tsv'), (400348, 132), (316170, 144), <GeneLocus.TCR: 2>)]












2022-12-28 19:06:59,753 - assign_clone_ids.ipynb - INFO - Participant P00345 (GeneLocus.TCR) has 183697 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00345.tsv'), (239800, 132), (192811, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:00,630 - assign_clone_ids.ipynb - INFO - Participant P00360 (GeneLocus.TCR) has 83951 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00360.tsv'), (108578, 132), (88360, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:07:02,641 - assign_clone_ids.ipynb - INFO - Participant P00356 (GeneLocus.TCR) has 212847 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00356.tsv'), (285523, 132), (225944, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:01,141 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 43, 'TRBVA*01': 3}


2022-12-28 19:07:02,933 - assign_clone_ids.ipynb - INFO - Participant P00353 (GeneLocus.TCR) has 155036 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00353.parquet.




2022-12-28 19:07:03,686 - assign_clone_ids.ipynb - INFO - Participant P00347 (GeneLocus.TCR) has 196219 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00347.parquet.


2022-12-28 19:07:03,727 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 47, 'TRBVA*01': 2}




2022-12-28 19:07:07,364 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 10, 'TRBV7-5*02': 25, 'TRBVA*01': 4}


2022-12-28 19:07:08,606 - assign_clone_ids.ipynb - INFO - Participant P00365 (GeneLocus.TCR) has 99593 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00365.tsv'), (135385, 132), (102650, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:09,724 - assign_clone_ids.ipynb - INFO - Participant P00316 (GeneLocus.TCR) has 342899 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00316.tsv'), (468136, 132), (360839, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:09,765 - assign_clone_ids.ipynb - INFO - Participant P00364 (GeneLocus.TCR) has 101620 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00364.tsv'), (133100, 132), (104609, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:10,397 - assign_clone_ids.ipynb - INFO - Participant P00361 (GeneLocus.TCR) has 145314 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00361.parquet.


2022-12-28 19:07:10,397 - assign_clone_ids.ipynb - INFO - Participant P00342 (GeneLocus.TCR) has 229411 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00342.parquet.


2022-12-28 19:07:10,398 - assign_clone_ids.ipynb - INFO - Participant P00333 (GeneLocus.TCR) has 267305 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00333.parquet.


2022-12-28 19:07:10,992 - assign_clone_ids.ipynb - INFO - Participant P00350 (GeneLocus.TCR) has 203748 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00350.tsv'), (264654, 132), (215064, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:07:15,288 - assign_clone_ids.ipynb - INFO - Participant P00332 (GeneLocus.TCR) has 357332 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00332.tsv'), (507636, 132), (380344, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:07:16,923 - assign_clone_ids.ipynb - INFO - Participant P00355 (GeneLocus.TCR) has 154223 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00355.tsv'), (217401, 132), (161690, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:18,473 - assign_clone_ids.ipynb - INFO - Participant P00363 (GeneLocus.TCR) has 136523 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00363.tsv'), (182697, 132), (143149, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:19,616 - assign_clone_ids.ipynb - INFO - Participant P00360 (GeneLocus.TCR) has 83782 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00360.parquet.






2022-12-28 19:07:23,857 - assign_clone_ids.ipynb - INFO - Participant P00364 (GeneLocus.TCR) has 101483 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00364.parquet.




2022-12-28 19:07:23,857 - assign_clone_ids.ipynb - INFO - Participant P00365 (GeneLocus.TCR) has 99448 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00365.parquet.




2022-12-28 19:07:24,896 - assign_clone_ids.ipynb - INFO - Participant P00345 (GeneLocus.TCR) has 183498 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00345.parquet.


2022-12-28 19:07:27,890 - assign_clone_ids.ipynb - INFO - Participant P00352 (GeneLocus.TCR) has 171321 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00352.tsv'), (239081, 132), (178352, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:30,099 - assign_clone_ids.ipynb - INFO - Participant P00348 (GeneLocus.TCR) has 206982 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00348.tsv'), (269476, 132), (217008, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:07:30,880 - assign_clone_ids.ipynb - INFO - Participant P00356 (GeneLocus.TCR) has 212594 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00356.parquet.


  df = pd.read_csv(



2022-12-28 19:07:34,475 - assign_clone_ids.ipynb - INFO - Participant P00351 (GeneLocus.TCR) has 299730 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00351.parquet.


2022-12-28 19:07:35,798 - assign_clone_ids.ipynb - INFO - Participant P00357 (GeneLocus.TCR) has 208880 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00357.tsv'), (283643, 132), (216910, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:07:38,397 - assign_clone_ids.ipynb - INFO - Participant P00350 (GeneLocus.TCR) has 203466 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00350.parquet.


2022-12-28 19:07:38,397 - assign_clone_ids.ipynb - INFO - Participant P00355 (GeneLocus.TCR) has 154005 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00355.parquet.


2022-12-28 19:07:38,397 - assign_clone_ids.ipynb - INFO - Participant P00363 (GeneLocus.TCR) has 136371 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00363.parquet.


2022-12-28 19:07:45,283 - assign_clone_ids.ipynb - INFO - Participant P00362 (GeneLocus.TCR) has 222283 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00362.tsv'), (305076, 132), (232708, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:07:56,147 - assign_clone_ids.ipynb - INFO - Participant P00316 (GeneLocus.TCR) has 342424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00316.parquet.


2022-12-28 19:07:56,146 - assign_clone_ids.ipynb - INFO - Participant P00352 (GeneLocus.TCR) has 171050 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00352.parquet.


2022-12-28 19:07:56,146 - assign_clone_ids.ipynb - INFO - Participant P00348 (GeneLocus.TCR) has 206708 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00348.parquet.


  df = pd.read_csv(



2022-12-28 19:07:57,600 - assign_clone_ids.ipynb - INFO - Participant P00332 (GeneLocus.TCR) has 356576 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00332.parquet.


2022-12-28 19:07:57,935 - assign_clone_ids.ipynb - INFO - Participant P00357 (GeneLocus.TCR) has 208584 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00357.parquet.


2022-12-28 19:07:57,971 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 1}


  df = pd.read_csv(



2022-12-28 19:07:59,198 - assign_clone_ids.ipynb - INFO - Participant P00354 (GeneLocus.TCR) has 276933 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00354.tsv'), (351286, 132), (289559, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:08:06,157 - assign_clone_ids.ipynb - INFO - Participant P00367 (GeneLocus.TCR) has 52202 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00367.tsv'), (71608, 132), (55865, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:08:08,560 - assign_clone_ids.ipynb - INFO - Participant P00349 (GeneLocus.TCR) has 313234 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00349.tsv'), (419621, 132), (329068, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:08:41,426 - assign_clone_ids.ipynb - INFO - Participant P00362 (GeneLocus.TCR) has 221898 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00362.parquet.


2022-12-28 19:08:41,427 - assign_clone_ids.ipynb - INFO - Participant P00367 (GeneLocus.TCR) has 52099 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00367.parquet.


2022-12-28 19:08:41,427 - assign_clone_ids.ipynb - INFO - Participant P00354 (GeneLocus.TCR) has 276584 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00354.parquet.


2022-12-28 19:08:41,862 - assign_clone_ids.ipynb - INFO - Participant P00349 (GeneLocus.TCR) has 312691 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00349.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:10:33,314 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4}






2022-12-28 19:10:38,171 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 3}




2022-12-28 19:10:42,114 - assign_clone_ids.ipynb - INFO - Participant P00400 (GeneLocus.TCR) has 51217 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00400.tsv'), (67910, 132), (53601, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:10:42,366 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 13, 'TRBVA*01': 2}


2022-12-28 19:10:44,057 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 3}


2022-12-28 19:10:46,985 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5}


2022-12-28 19:10:46,880 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15}






2022-12-28 19:10:47,733 - assign_clone_ids.ipynb - INFO - Participant P00400 (GeneLocus.TCR) has 51145 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00400.parquet.


2022-12-28 19:10:48,192 - assign_clone_ids.ipynb - INFO - Participant P00405 (GeneLocus.TCR) has 51164 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00405.tsv'), (68711, 132), (55575, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:10:49,347 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 9}






2022-12-28 19:10:53,995 - assign_clone_ids.ipynb - INFO - Participant P00405 (GeneLocus.TCR) has 51085 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00405.parquet.


2022-12-28 19:10:54,197 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 12, 'TRBVA*01': 3}




2022-12-28 19:10:57,430 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBV7-5*02': 1, 'TRBVA*01': 9}


2022-12-28 19:11:00,533 - assign_clone_ids.ipynb - INFO - Participant P00378 (GeneLocus.TCR) has 100013 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00378.tsv'), (133380, 132), (104900, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:00,987 - assign_clone_ids.ipynb - INFO - Participant P00387 (GeneLocus.TCR) has 99617 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00387.tsv'), (126625, 132), (102018, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(







2022-12-28 19:11:04,962 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 12, 'TRBVA*01': 7}


2022-12-28 19:11:05,506 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 4, 'TRBV7-5*01': 17, 'TRBV8-2*01': 1, 'TRBVA*01': 2}




2022-12-28 19:11:07,125 - assign_clone_ids.ipynb - INFO - Participant P00381 (GeneLocus.TCR) has 111439 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00381.tsv'), (138529, 132), (113832, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:07,369 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 22, 'TRBVA*01': 1}


2022-12-28 19:11:07,473 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28}


2022-12-28 19:11:08,529 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 17, 'TRBVA*01': 1}


2022-12-28 19:11:08,837 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBVA*01': 4}


2022-12-28 19:11:09,363 - assign_clone_ids.ipynb - INFO - Participant P00371 (GeneLocus.TCR) has 130777 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00371.tsv'), (164613, 132), (134836, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:09,468 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 3}


2022-12-28 19:11:09,646 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 19, 'TRBVA*01': 3}


2022-12-28 19:11:11,358 - assign_clone_ids.ipynb - INFO - Participant P00386 (GeneLocus.TCR) has 118755 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00386.tsv'), (159409, 132), (124735, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:12,387 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 25, 'TRBVA*01': 1}


2022-12-28 19:11:13,645 - assign_clone_ids.ipynb - INFO - Participant P00387 (GeneLocus.TCR) has 99463 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00387.parquet.


2022-12-28 19:11:13,702 - assign_clone_ids.ipynb - INFO - Participant P00378 (GeneLocus.TCR) has 99829 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00378.parquet.




  df = pd.read_csv(



2022-12-28 19:11:16,558 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 46, 'TRBVA*01': 7}






2022-12-28 19:11:16,956 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 25, 'TRBVA*01': 5}


2022-12-28 19:11:17,025 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBVA*01': 3}














2022-12-28 19:11:19,342 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 20, 'TRBVA*01': 6}


  df = pd.read_csv(







2022-12-28 19:11:20,711 - assign_clone_ids.ipynb - INFO - Participant P00381 (GeneLocus.TCR) has 111231 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00381.parquet.


2022-12-28 19:11:24,507 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 26}


2022-12-28 19:11:24,547 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 2}


2022-12-28 19:11:24,588 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBVA*01': 6}


2022-12-28 19:11:25,472 - assign_clone_ids.ipynb - INFO - Participant P00374 (GeneLocus.TCR) has 142402 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00374.tsv'), (183800, 132), (149682, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:26,381 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBVA*01': 4}




  df = pd.read_csv(



2022-12-28 19:11:33,127 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28}


2022-12-28 19:11:38,689 - assign_clone_ids.ipynb - INFO - Participant P00410 (GeneLocus.TCR) has 29985 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00410.tsv'), (44427, 132), (32937, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:39,941 - assign_clone_ids.ipynb - INFO - Participant P00370 (GeneLocus.TCR) has 151057 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00370.tsv'), (209894, 132), (158710, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 19:11:43,868 - assign_clone_ids.ipynb - INFO - Participant P00410 (GeneLocus.TCR) has 29956 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00410.parquet.


2022-12-28 19:11:45,851 - assign_clone_ids.ipynb - INFO - Participant P00385 (GeneLocus.TCR) has 171736 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00385.tsv'), (225658, 132), (178794, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:11:46,683 - assign_clone_ids.ipynb - INFO - Participant P00394 (GeneLocus.TCR) has 161741 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00394.tsv'), (209923, 132), (168556, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





2022-12-28 19:12:04,377 - assign_clone_ids.ipynb - INFO - Participant P00393 (GeneLocus.TCR) has 178095 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00393.tsv'), (222891, 132), (184346, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:12:13,051 - assign_clone_ids.ipynb - INFO - Participant P00394 (GeneLocus.TCR) has 161502 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00394.parquet.




  df = pd.read_csv(







2022-12-28 19:12:40,135 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 5, 'TRBV8-1*01': 1, 'TRBVA*01': 1}












2022-12-28 19:12:58,025 - assign_clone_ids.ipynb - INFO - Participant P00386 (GeneLocus.TCR) has 118547 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00386.parquet.






2022-12-28 19:13:01,596 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBVA*01': 4}


2022-12-28 19:13:01,944 - assign_clone_ids.ipynb - INFO - Participant P00371 (GeneLocus.TCR) has 130590 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00371.parquet.


2022-12-28 19:13:03,181 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 40, 'TRBVA*01': 3}


2022-12-28 19:13:03,935 - assign_clone_ids.ipynb - INFO - Participant P00385 (GeneLocus.TCR) has 171480 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00385.parquet.


2022-12-28 19:13:04,219 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 2, 'TRBV7-5*01': 2, 'TRBV7-5*02': 70, 'TRBVA*01': 6}




2022-12-28 19:13:05,485 - assign_clone_ids.ipynb - INFO - Participant P00370 (GeneLocus.TCR) has 150867 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00370.parquet.




2022-12-28 19:13:06,113 - assign_clone_ids.ipynb - INFO - Participant P00374 (GeneLocus.TCR) has 142184 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00374.parquet.






2022-12-28 19:13:07,723 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 33, 'TRBVA*01': 7}


2022-12-28 19:13:08,195 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 7}


  df = pd.read_csv(



2022-12-28 19:13:11,169 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 6}


2022-12-28 19:13:11,472 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 15, 'TRBVA*01': 1}


2022-12-28 19:13:12,214 - assign_clone_ids.ipynb - INFO - Participant P00413 (GeneLocus.TCR) has 120220 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00413.tsv'), (160241, 132), (126740, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:13:12,663 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBV7-5*02': 40, 'TRBVA*01': 5}


2022-12-28 19:13:13,198 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 42, 'TRBVA*01': 5}


2022-12-28 19:13:13,279 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV7-5*02': 32, 'TRBVA*01': 3}




2022-12-28 19:13:13,541 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 47, 'TRBV7-5*02': 42, 'TRBVA*01': 6}


2022-12-28 19:13:15,016 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 57, 'TRBVA*01': 4}


2022-12-28 19:13:16,579 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 55, 'TRBVA*01': 4}


2022-12-28 19:13:16,786 - assign_clone_ids.ipynb - INFO - Participant P00377 (GeneLocus.TCR) has 179205 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00377.tsv'), (220373, 132), (189174, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:13:17,431 - assign_clone_ids.ipynb - INFO - Participant P00401 (GeneLocus.TCR) has 145643 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00401.tsv'), (196573, 132), (151661, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:13:18,284 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 48, 'TRBV8-1*01': 1, 'TRBVA*01': 3}




2022-12-28 19:13:18,626 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 5, 'TRBV7-5*02': 88, 'TRBVA*01': 4}


2022-12-28 19:13:19,868 - assign_clone_ids.ipynb - INFO - Participant P00373 (GeneLocus.TCR) has 169093 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00373.tsv'), (219029, 132), (175450, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:13:20,330 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 47, 'TRBVA*01': 6}


2022-12-28 19:13:21,766 - assign_clone_ids.ipynb - INFO - Participant P00403 (GeneLocus.TCR) has 144989 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00403.tsv'), (188206, 132), (150668, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:13:22,137 - assign_clone_ids.ipynb - INFO - Participant P00383 (GeneLocus.TCR) has 164660 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00383.tsv'), (224634, 132), (176468, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:13:22,853 - assign_clone_ids.ipynb - INFO - Participant P00393 (GeneLocus.TCR) has 177865 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00393.parquet.




2022-12-28 19:13:23,920 - assign_clone_ids.ipynb - INFO - Participant P00369 (GeneLocus.TCR) has 175275 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00369.tsv'), (234618, 132), (183350, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(









2022-12-28 19:13:28,338 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 2, 'TRBV7-5*01': 52, 'TRBVA*01': 6}


2022-12-28 19:13:29,384 - assign_clone_ids.ipynb - INFO - Participant P00372 (GeneLocus.TCR) has 190267 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00372.tsv'), (258600, 132), (200224, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:13:30,918 - assign_clone_ids.ipynb - INFO - Participant P00368 (GeneLocus.TCR) has 204633 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00368.tsv'), (265983, 132), (212398, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:13:32,714 - assign_clone_ids.ipynb - INFO - Participant P00413 (GeneLocus.TCR) has 120025 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00413.parquet.


  df = pd.read_csv(









2022-12-28 19:13:35,683 - assign_clone_ids.ipynb - INFO - Participant P00390 (GeneLocus.TCR) has 185819 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00390.tsv'), (245727, 132), (192863, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:13:36,051 - assign_clone_ids.ipynb - INFO - Participant P00380 (GeneLocus.TCR) has 210542 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00380.tsv'), (279923, 132), (219734, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(











2022-12-28 19:13:38,338 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 1}


2022-12-28 19:13:38,446 - assign_clone_ids.ipynb - INFO - Participant P00407 (GeneLocus.TCR) has 126298 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00407.tsv'), (168125, 132), (133000, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:13:42,094 - assign_clone_ids.ipynb - INFO - Participant P00395 (GeneLocus.TCR) has 187290 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00395.tsv'), (259311, 132), (196646, 144), <GeneLocus.TCR: 2>)]






  df = pd.read_csv(



2022-12-28 19:13:50,317 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 30, 'TRBVA*01': 3}


2022-12-28 19:13:51,907 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 13, 'TRBV8-2*01': 1}




2022-12-28 19:13:53,679 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 2}


















2022-12-28 19:13:59,788 - assign_clone_ids.ipynb - INFO - Participant P00398 (GeneLocus.TCR) has 189700 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00398.tsv'), (256303, 132), (202416, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:14:00,559 - assign_clone_ids.ipynb - INFO - Participant P00382 (GeneLocus.TCR) has 218014 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00382.tsv'), (280699, 132), (227144, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:00,935 - assign_clone_ids.ipynb - INFO - Participant P00401 (GeneLocus.TCR) has 145483 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00401.parquet.


2022-12-28 19:14:02,731 - assign_clone_ids.ipynb - INFO - Participant P00366 (GeneLocus.TCR) has 259274 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00366.tsv'), (319534, 132), (272296, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:03,162 - assign_clone_ids.ipynb - INFO - Participant P00377 (GeneLocus.TCR) has 179004 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00377.parquet.


2022-12-28 19:14:04,307 - assign_clone_ids.ipynb - INFO - Participant P00373 (GeneLocus.TCR) has 168863 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00373.parquet.


2022-12-28 19:14:04,585 - assign_clone_ids.ipynb - INFO - Participant P00369 (GeneLocus.TCR) has 175095 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00369.parquet.


2022-12-28 19:14:05,843 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 25, 'TRBVA*01': 3}


2022-12-28 19:14:05,964 - assign_clone_ids.ipynb - INFO - Participant P00403 (GeneLocus.TCR) has 144849 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00403.parquet.


2022-12-28 19:14:06,256 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 43, 'TRBVA*01': 3}


2022-12-28 19:14:06,289 - assign_clone_ids.ipynb - INFO - Participant P00372 (GeneLocus.TCR) has 190003 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00372.parquet.


2022-12-28 19:14:06,714 - assign_clone_ids.ipynb - INFO - Participant P00383 (GeneLocus.TCR) has 164442 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00383.parquet.






2022-12-28 19:14:08,659 - assign_clone_ids.ipynb - INFO - Participant P00414 (GeneLocus.TCR) has 130150 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00414.tsv'), (172757, 132), (135618, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:09,423 - assign_clone_ids.ipynb - INFO - Participant P00368 (GeneLocus.TCR) has 204312 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00368.parquet.




2022-12-28 19:14:09,630 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 4}


2022-12-28 19:14:11,013 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 16, 'TRBVA*01': 1}


2022-12-28 19:14:11,545 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 41, 'TRBVA*01': 3}






2022-12-28 19:14:15,480 - assign_clone_ids.ipynb - INFO - Participant P00396 (GeneLocus.TCR) has 212709 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00396.tsv'), (306288, 132), (223993, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:15,541 - assign_clone_ids.ipynb - INFO - Participant P00419 (GeneLocus.TCR) has 95530 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00419.tsv'), (129089, 132), (102826, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:16,354 - assign_clone_ids.ipynb - INFO - Participant P00409 (GeneLocus.TCR) has 171295 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00409.tsv'), (215782, 132), (178288, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:16,699 - assign_clone_ids.ipynb - INFO - Participant P00407 (GeneLocus.TCR) has 126065 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00407.parquet.






2022-12-28 19:14:17,505 - assign_clone_ids.ipynb - INFO - Participant P00379 (GeneLocus.TCR) has 266134 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00379.tsv'), (348622, 132), (276083, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:20,287 - assign_clone_ids.ipynb - INFO - Participant P00390 (GeneLocus.TCR) has 185561 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00390.parquet.






2022-12-28 19:14:20,775 - assign_clone_ids.ipynb - INFO - Participant P00408 (GeneLocus.TCR) has 188552 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00408.tsv'), (241738, 132), (195301, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:14:22,991 - assign_clone_ids.ipynb - INFO - Participant P00388 (GeneLocus.TCR) has 254317 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00388.tsv'), (337718, 132), (268624, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:14:26,601 - assign_clone_ids.ipynb - INFO - Participant P00395 (GeneLocus.TCR) has 186985 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00395.parquet.


2022-12-28 19:14:27,514 - assign_clone_ids.ipynb - INFO - Participant P00391 (GeneLocus.TCR) has 247668 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00391.tsv'), (321208, 132), (263253, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:14:28,567 - assign_clone_ids.ipynb - INFO - Participant P00419 (GeneLocus.TCR) has 95315 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00419.parquet.




2022-12-28 19:14:29,246 - assign_clone_ids.ipynb - INFO - Participant P00414 (GeneLocus.TCR) has 129970 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00414.parquet.


2022-12-28 19:14:30,382 - assign_clone_ids.ipynb - INFO - Participant P00380 (GeneLocus.TCR) has 210153 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00380.parquet.


2022-12-28 19:14:30,867 - assign_clone_ids.ipynb - INFO - Participant P00397 (GeneLocus.TCR) has 309696 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00397.tsv'), (411263, 132), (324028, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:32,275 - assign_clone_ids.ipynb - INFO - Participant P00384 (GeneLocus.TCR) has 326999 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00384.tsv'), (429841, 132), (344003, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:32,408 - assign_clone_ids.ipynb - INFO - Participant P00398 (GeneLocus.TCR) has 189392 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00398.parquet.


2022-12-28 19:14:33,650 - assign_clone_ids.ipynb - INFO - Participant P00416 (GeneLocus.TCR) has 187500 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00416.tsv'), (260005, 132), (197523, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:35,683 - assign_clone_ids.ipynb - INFO - Participant P00382 (GeneLocus.TCR) has 217620 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00382.parquet.


2022-12-28 19:14:36,568 - assign_clone_ids.ipynb - INFO - Participant P00389 (GeneLocus.TCR) has 309080 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00389.tsv'), (419423, 132), (328592, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:38,872 - assign_clone_ids.ipynb - INFO - Participant P00366 (GeneLocus.TCR) has 259007 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00366.parquet.


2022-12-28 19:14:40,856 - assign_clone_ids.ipynb - INFO - Participant P00411 (GeneLocus.TCR) has 222785 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00411.tsv'), (292759, 132), (232487, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:41,109 - assign_clone_ids.ipynb - INFO - Participant P00418 (GeneLocus.TCR) has 170737 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00418.tsv'), (232317, 132), (178713, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:41,826 - assign_clone_ids.ipynb - INFO - Participant P00376 (GeneLocus.TCR) has 293023 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00376.tsv'), (392571, 132), (313964, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:42,063 - assign_clone_ids.ipynb - INFO - Participant P00409 (GeneLocus.TCR) has 171075 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00409.parquet.


2022-12-28 19:14:42,922 - assign_clone_ids.ipynb - INFO - Participant P00404 (GeneLocus.TCR) has 270050 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00404.tsv'), (345357, 132), (281847, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:14:43,582 - assign_clone_ids.ipynb - INFO - Participant P00392 (GeneLocus.TCR) has 287130 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00392.tsv'), (393591, 132), (301942, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:44,779 - assign_clone_ids.ipynb - INFO - Participant P00396 (GeneLocus.TCR) has 212420 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00396.parquet.


  df = pd.read_csv(



2022-12-28 19:14:46,515 - assign_clone_ids.ipynb - INFO - Participant P00375 (GeneLocus.TCR) has 319710 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00375.tsv'), (445305, 132), (339080, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:14:47,567 - assign_clone_ids.ipynb - INFO - Participant P00399 (GeneLocus.TCR) has 301311 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00399.tsv'), (421226, 132), (324116, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:47,185 - assign_clone_ids.ipynb - INFO - Participant P00420 (GeneLocus.TCR) has 163342 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00420.tsv'), (209243, 132), (171105, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:47,153 - assign_clone_ids.ipynb - INFO - Participant P00415 (GeneLocus.TCR) has 179502 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00415.tsv'), (238982, 132), (186970, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:47,443 - assign_clone_ids.ipynb - INFO - Participant P00408 (GeneLocus.TCR) has 188263 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00408.parquet.


2022-12-28 19:14:47,664 - assign_clone_ids.ipynb - INFO - Participant P00406 (GeneLocus.TCR) has 301475 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00406.tsv'), (398700, 132), (317101, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:52,112 - assign_clone_ids.ipynb - INFO - Participant P00379 (GeneLocus.TCR) has 265727 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00379.parquet.


2022-12-28 19:14:52,167 - assign_clone_ids.ipynb - INFO - Participant P00417 (GeneLocus.TCR) has 212116 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00417.tsv'), (282102, 132), (223745, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:14:56,821 - assign_clone_ids.ipynb - INFO - Participant P00388 (GeneLocus.TCR) has 253923 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00388.parquet.


2022-12-28 19:14:57,303 - assign_clone_ids.ipynb - INFO - Participant P00416 (GeneLocus.TCR) has 187144 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00416.parquet.


  df = pd.read_csv(



2022-12-28 19:14:58,807 - assign_clone_ids.ipynb - INFO - Participant P00391 (GeneLocus.TCR) has 247280 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00391.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:15:02,613 - assign_clone_ids.ipynb - INFO - Participant P00412 (GeneLocus.TCR) has 248766 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00412.tsv'), (335996, 132), (260372, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:15:05,730 - assign_clone_ids.ipynb - INFO - Participant P00418 (GeneLocus.TCR) has 170441 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00418.parquet.


2022-12-28 19:15:06,083 - assign_clone_ids.ipynb - INFO - Participant P00420 (GeneLocus.TCR) has 163152 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00420.parquet.


2022-12-28 19:15:07,661 - assign_clone_ids.ipynb - INFO - Participant P00397 (GeneLocus.TCR) has 309137 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00397.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:15:14,627 - assign_clone_ids.ipynb - INFO - Participant P00411 (GeneLocus.TCR) has 222489 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00411.parquet.


2022-12-28 19:15:14,625 - assign_clone_ids.ipynb - INFO - Participant P00384 (GeneLocus.TCR) has 326498 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00384.parquet.


2022-12-28 19:15:14,625 - assign_clone_ids.ipynb - INFO - Participant P00415 (GeneLocus.TCR) has 179246 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00415.parquet.


2022-12-28 19:15:17,140 - assign_clone_ids.ipynb - INFO - Participant P00402 (GeneLocus.TCR) has 423863 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00402.tsv'), (567627, 132), (454535, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:15:18,391 - assign_clone_ids.ipynb - INFO - Participant P00404 (GeneLocus.TCR) has 269516 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00404.parquet.


2022-12-28 19:15:18,390 - assign_clone_ids.ipynb - INFO - Participant P00389 (GeneLocus.TCR) has 308574 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00389.parquet.


2022-12-28 19:15:18,390 - assign_clone_ids.ipynb - INFO - Participant P00417 (GeneLocus.TCR) has 211861 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00417.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:15:26,870 - assign_clone_ids.ipynb - INFO - Participant P00392 (GeneLocus.TCR) has 286673 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00392.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:15:37,721 - assign_clone_ids.ipynb - INFO - Participant P00376 (GeneLocus.TCR) has 292564 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00376.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(





2022-12-28 19:15:45,766 - assign_clone_ids.ipynb - INFO - Participant P00441 (GeneLocus.TCR) has 6526 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00441.tsv'), (7613, 132), (6575, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:15:50,238 - assign_clone_ids.ipynb - INFO - Participant P00406 (GeneLocus.TCR) has 301153 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00406.parquet.


2022-12-28 19:15:50,238 - assign_clone_ids.ipynb - INFO - Participant P00399 (GeneLocus.TCR) has 300878 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00399.parquet.


2022-12-28 19:15:50,239 - assign_clone_ids.ipynb - INFO - Participant P00375 (GeneLocus.TCR) has 319213 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00375.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:15:54,849 - assign_clone_ids.ipynb - INFO - Participant P00412 (GeneLocus.TCR) has 248429 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00412.parquet.


2022-12-28 19:16:00,084 - assign_clone_ids.ipynb - INFO - Participant P00441 (GeneLocus.TCR) has 6506 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00441.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:16:26,011 - assign_clone_ids.ipynb - INFO - Participant P00402 (GeneLocus.TCR) has 423392 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00402.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:17:23,340 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 6}


2022-12-28 19:17:25,593 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 4}






2022-12-28 19:17:28,302 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 7, 'TRBVA*01': 2}


2022-12-28 19:17:29,434 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 8, 'TRBVA*01': 1}


2022-12-28 19:17:31,647 - assign_clone_ids.ipynb - INFO - Participant P00459 (GeneLocus.TCR) has 33314 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00459.tsv'), (42434, 132), (35266, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:17:32,348 - assign_clone_ids.ipynb - INFO - Participant P00446 (GeneLocus.TCR) has 48868 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00446.tsv'), (64229, 132), (50356, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:17:32,362 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 5, 'TRBVA*01': 1}


2022-12-28 19:17:32,694 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 9, 'TRBVA*01': 2}






2022-12-28 19:17:35,303 - assign_clone_ids.ipynb - INFO - Participant P00459 (GeneLocus.TCR) has 33266 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00459.parquet.










2022-12-28 19:17:37,386 - assign_clone_ids.ipynb - INFO - Participant P00446 (GeneLocus.TCR) has 48818 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00446.parquet.


2022-12-28 19:17:39,947 - assign_clone_ids.ipynb - INFO - Participant P00431 (GeneLocus.TCR) has 79323 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00431.tsv'), (101931, 132), (80847, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:17:43,107 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 4, 'TRBV7-5*02': 18}


2022-12-28 19:17:43,257 - assign_clone_ids.ipynb - INFO - Participant P00453 (GeneLocus.TCR) has 69169 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00453.tsv'), (91367, 132), (70956, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:17:44,418 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10}


2022-12-28 19:17:44,913 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBVA*01': 1}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:17:46,097 - assign_clone_ids.ipynb - INFO - Participant P00429 (GeneLocus.TCR) has 86991 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00429.tsv'), (118881, 132), (95141, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:17:47,176 - assign_clone_ids.ipynb - INFO - Participant P00444 (GeneLocus.TCR) has 68989 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00444.tsv'), (95617, 132), (72914, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:17:47,250 - assign_clone_ids.ipynb - INFO - Participant P00431 (GeneLocus.TCR) has 79175 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00431.parquet.


2022-12-28 19:17:47,780 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 14, 'TRBVA*01': 5}




2022-12-28 19:17:49,506 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 12, 'TRBVA*01': 2}


2022-12-28 19:17:50,050 - assign_clone_ids.ipynb - INFO - Participant P00453 (GeneLocus.TCR) has 69093 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00453.parquet.


2022-12-28 19:17:51,205 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 18, 'TRBVA*01': 2}








2022-12-28 19:17:54,642 - assign_clone_ids.ipynb - INFO - Participant P00444 (GeneLocus.TCR) has 68886 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00444.parquet.




  df = pd.read_csv(







2022-12-28 19:17:55,568 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 24, 'TRBVA*01': 1}


2022-12-28 19:17:55,609 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBVA*01': 1}


2022-12-28 19:17:56,355 - assign_clone_ids.ipynb - INFO - Participant P00429 (GeneLocus.TCR) has 86867 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00429.parquet.


2022-12-28 19:17:57,326 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 17, 'TRBVA*01': 5}


2022-12-28 19:17:58,157 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 16, 'TRBVA*01': 3}




2022-12-28 19:18:03,182 - assign_clone_ids.ipynb - INFO - Participant P00438 (GeneLocus.TCR) has 109404 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00438.tsv'), (145049, 132), (113744, 144), <GeneLocus.TCR: 2>)]






  df = pd.read_csv(









2022-12-28 19:18:04,500 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 22, 'TRBVA*01': 1}


2022-12-28 19:18:05,438 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV8-2*01': 1, 'TRBVA*01': 3}




2022-12-28 19:18:08,332 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 2}


2022-12-28 19:18:08,348 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 19, 'TRBVA*01': 4}


2022-12-28 19:18:08,424 - assign_clone_ids.ipynb - INFO - Participant P00435 (GeneLocus.TCR) has 120236 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00435.tsv'), (158632, 132), (127585, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:18:10,132 - assign_clone_ids.ipynb - INFO - Participant P00450 (GeneLocus.TCR) has 121274 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00450.tsv'), (157766, 132), (125271, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:18:11,117 - assign_clone_ids.ipynb - INFO - Participant P00426 (GeneLocus.TCR) has 144057 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00426.tsv'), (185447, 132), (150883, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:18:11,835 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBVA*01': 1}


  df = pd.read_csv(





2022-12-28 19:18:14,155 - assign_clone_ids.ipynb - INFO - Participant P00436 (GeneLocus.TCR) has 141149 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00436.tsv'), (169843, 132), (145806, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:18:15,855 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 6}


2022-12-28 19:18:17,272 - assign_clone_ids.ipynb - INFO - Participant P00438 (GeneLocus.TCR) has 109250 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00438.parquet.


2022-12-28 19:18:18,162 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 9, 'TRBVA*01': 1}






2022-12-28 19:18:19,774 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 9, 'TRBVA*01': 1}


2022-12-28 19:18:21,091 - assign_clone_ids.ipynb - INFO - Participant P00461 (GeneLocus.TCR) has 118977 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00461.tsv'), (157433, 132), (122547, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





2022-12-28 19:18:22,218 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 6, 'TRBVA*01': 1}




2022-12-28 19:18:27,690 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 13, 'TRBVA*01': 3}


2022-12-28 19:18:28,982 - assign_clone_ids.ipynb - INFO - Participant P00463 (GeneLocus.TCR) has 58464 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00463.tsv'), (71147, 132), (59304, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 19:18:32,600 - assign_clone_ids.ipynb - INFO - Participant P00464 (GeneLocus.TCR) has 65736 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00464.tsv'), (80751, 132), (66955, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:18:34,108 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBV7-5*02': 17}






2022-12-28 19:18:39,945 - assign_clone_ids.ipynb - INFO - Participant P00448 (GeneLocus.TCR) has 147279 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00448.tsv'), (195490, 132), (154573, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:18:40,208 - assign_clone_ids.ipynb - INFO - Participant P00463 (GeneLocus.TCR) has 58382 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00463.parquet.


2022-12-28 19:18:43,681 - assign_clone_ids.ipynb - INFO - Participant P00464 (GeneLocus.TCR) has 65648 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00464.parquet.


  df = pd.read_csv(



2022-12-28 19:18:53,205 - assign_clone_ids.ipynb - INFO - Participant P00428 (GeneLocus.TCR) has 156298 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00428.tsv'), (208485, 132), (162483, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:19:02,119 - assign_clone_ids.ipynb - INFO - Participant P00449 (GeneLocus.TCR) has 146721 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00449.tsv'), (188419, 132), (150915, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:02,247 - assign_clone_ids.ipynb - INFO - Participant P00424 (GeneLocus.TCR) has 167436 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00424.tsv'), (205898, 132), (172854, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:19:04,171 - assign_clone_ids.ipynb - INFO - Participant P00462 (GeneLocus.TCR) has 88567 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00462.tsv'), (114345, 132), (90549, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:04,653 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 4}


2022-12-28 19:19:05,944 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBV8-1*01': 1, 'TRBVA*01': 4}


2022-12-28 19:19:06,637 - assign_clone_ids.ipynb - INFO - Participant P00435 (GeneLocus.TCR) has 119988 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00435.parquet.


2022-12-28 19:19:06,771 - assign_clone_ids.ipynb - INFO - Participant P00450 (GeneLocus.TCR) has 121133 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00450.parquet.


2022-12-28 19:19:09,202 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 8}


2022-12-28 19:19:10,570 - assign_clone_ids.ipynb - INFO - Participant P00426 (GeneLocus.TCR) has 143875 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00426.parquet.


2022-12-28 19:19:10,573 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBVA*01': 2}




2022-12-28 19:19:11,113 - assign_clone_ids.ipynb - INFO - Participant P00461 (GeneLocus.TCR) has 118785 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00461.parquet.


2022-12-28 19:19:11,276 - assign_clone_ids.ipynb - INFO - Participant P00436 (GeneLocus.TCR) has 140992 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00436.parquet.


2022-12-28 19:19:11,805 - assign_clone_ids.ipynb - INFO - Participant P00456 (GeneLocus.TCR) has 164171 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00456.tsv'), (213901, 132), (172234, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:11,853 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 33, 'TRBVA*01': 7}


2022-12-28 19:19:12,305 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBVA*01': 1}


2022-12-28 19:19:12,448 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 32, 'TRBVA*01': 1}


2022-12-28 19:19:12,649 - assign_clone_ids.ipynb - INFO - Participant P00465 (GeneLocus.TCR) has 122720 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00465.tsv'), (156291, 132), (125697, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:19:13,389 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 31, 'TRBVA*01': 3}


2022-12-28 19:19:13,875 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 15}


2022-12-28 19:19:14,237 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 26, 'TRBVA*01': 3}


2022-12-28 19:19:15,049 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 22, 'TRBVA*01': 9}






2022-12-28 19:19:16,031 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBVA*01': 5}




2022-12-28 19:19:16,910 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 51, 'TRBV8-1*01': 1, 'TRBVA*01': 4}


2022-12-28 19:19:16,992 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 2}


2022-12-28 19:19:17,466 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 30, 'TRBVA*01': 6}


2022-12-28 19:19:17,776 - assign_clone_ids.ipynb - INFO - Participant P00447 (GeneLocus.TCR) has 183369 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00447.tsv'), (236589, 132), (190826, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:19:17,937 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 11}




  df = pd.read_csv(





2022-12-28 19:19:20,350 - assign_clone_ids.ipynb - INFO - Participant P00462 (GeneLocus.TCR) has 88434 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00462.parquet.


2022-12-28 19:19:21,125 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 18, 'TRBV7-5*02': 37, 'TRBVA*01': 6}






2022-12-28 19:19:23,719 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 51, 'TRBVA*01': 3}








2022-12-28 19:19:24,743 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 3, 'TRBV7-5*02': 3, 'TRBVA*01': 1}








2022-12-28 19:19:25,474 - assign_clone_ids.ipynb - INFO - Participant P00451 (GeneLocus.TCR) has 189495 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00451.tsv'), (243802, 132), (197966, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:19:25,764 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 47, 'TRBVA*01': 6}




2022-12-28 19:19:26,933 - assign_clone_ids.ipynb - INFO - Participant P00448 (GeneLocus.TCR) has 147122 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00448.parquet.


  df = pd.read_csv(



2022-12-28 19:19:27,116 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 10}






2022-12-28 19:19:28,595 - assign_clone_ids.ipynb - INFO - Participant P00449 (GeneLocus.TCR) has 146509 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00449.parquet.












2022-12-28 19:19:31,243 - assign_clone_ids.ipynb - INFO - Participant P00427 (GeneLocus.TCR) has 196047 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00427.tsv'), (261861, 132), (206701, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:32,941 - assign_clone_ids.ipynb - INFO - Participant P00465 (GeneLocus.TCR) has 122516 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00465.parquet.


2022-12-28 19:19:33,374 - assign_clone_ids.ipynb - INFO - Participant P00428 (GeneLocus.TCR) has 156031 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00428.parquet.


2022-12-28 19:19:34,007 - assign_clone_ids.ipynb - INFO - Participant P00424 (GeneLocus.TCR) has 167216 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00424.parquet.


2022-12-28 19:19:34,241 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 2}
















2022-12-28 19:19:37,245 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBV7-5*02': 29, 'TRBVA*01': 7}






2022-12-28 19:19:37,936 - assign_clone_ids.ipynb - INFO - Participant P00470 (GeneLocus.TCR) has 135033 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00470.tsv'), (171226, 132), (139811, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:38,666 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 30, 'TRBV7-5*02': 48, 'TRBVA*01': 10}




  df = pd.read_csv(



2022-12-28 19:19:39,825 - assign_clone_ids.ipynb - INFO - Participant P00456 (GeneLocus.TCR) has 163945 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00456.parquet.


2022-12-28 19:19:39,861 - assign_clone_ids.ipynb - INFO - Participant P00475 (GeneLocus.TCR) has 45902 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00475.tsv'), (61501, 132), (47912, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:19:40,146 - assign_clone_ids.ipynb - INFO - Participant P00469 (GeneLocus.TCR) has 111121 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00469.tsv'), (149111, 132), (115389, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:19:41,337 - assign_clone_ids.ipynb - INFO - Participant P00471 (GeneLocus.TCR) has 61198 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00471.tsv'), (79493, 132), (64645, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:42,491 - assign_clone_ids.ipynb - INFO - Participant P00457 (GeneLocus.TCR) has 176711 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00457.tsv'), (218885, 132), (181870, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:44,197 - assign_clone_ids.ipynb - INFO - Participant P00434 (GeneLocus.TCR) has 206463 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00434.tsv'), (274530, 132), (220845, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:45,455 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 43, 'TRBVA*01': 4}


2022-12-28 19:19:45,768 - assign_clone_ids.ipynb - INFO - Participant P00447 (GeneLocus.TCR) has 183037 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00447.parquet.


  df = pd.read_csv(



2022-12-28 19:19:46,984 - assign_clone_ids.ipynb - INFO - Participant P00468 (GeneLocus.TCR) has 146438 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00468.tsv'), (190461, 132), (152314, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:47,352 - assign_clone_ids.ipynb - INFO - Participant P00475 (GeneLocus.TCR) has 45847 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00475.parquet.


2022-12-28 19:19:48,046 - assign_clone_ids.ipynb - INFO - Participant P00439 (GeneLocus.TCR) has 213541 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00439.tsv'), (289962, 132), (222875, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:48,531 - assign_clone_ids.ipynb - INFO - Participant P00471 (GeneLocus.TCR) has 61116 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00471.parquet.




  df = pd.read_csv(



2022-12-28 19:19:50,802 - assign_clone_ids.ipynb - INFO - Participant P00466 (GeneLocus.TCR) has 163559 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00466.tsv'), (217581, 132), (170126, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:52,888 - assign_clone_ids.ipynb - INFO - Participant P00455 (GeneLocus.TCR) has 167180 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00455.tsv'), (224982, 132), (174547, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:53,164 - assign_clone_ids.ipynb - INFO - Participant P00470 (GeneLocus.TCR) has 134930 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00470.parquet.


2022-12-28 19:19:53,552 - assign_clone_ids.ipynb - INFO - Participant P00451 (GeneLocus.TCR) has 189236 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00451.parquet.


2022-12-28 19:19:53,937 - assign_clone_ids.ipynb - INFO - Participant P00469 (GeneLocus.TCR) has 110958 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00469.parquet.


  df = pd.read_csv(





  df = pd.read_csv(





2022-12-28 19:19:56,402 - assign_clone_ids.ipynb - INFO - Participant P00460 (GeneLocus.TCR) has 211939 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00460.tsv'), (277621, 132), (220716, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:57,351 - assign_clone_ids.ipynb - INFO - Participant P00472 (GeneLocus.TCR) has 89949 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00472.tsv'), (117637, 132), (94064, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:57,516 - assign_clone_ids.ipynb - INFO - Participant P00442 (GeneLocus.TCR) has 232536 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00442.tsv'), (313253, 132), (242927, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:19:57,995 - assign_clone_ids.ipynb - INFO - Participant P00452 (GeneLocus.TCR) has 205555 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00452.tsv'), (274934, 132), (216308, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:20:04,029 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBVA*01': 1}


2022-12-28 19:20:07,369 - assign_clone_ids.ipynb - INFO - Participant P00440 (GeneLocus.TCR) has 219502 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00440.tsv'), (296943, 132), (231823, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(





  df = pd.read_csv(







  df = pd.read_csv(



2022-12-28 19:20:15,535 - assign_clone_ids.ipynb - INFO - Participant P00445 (GeneLocus.TCR) has 252005 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00445.tsv'), (333829, 132), (263106, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:15,836 - assign_clone_ids.ipynb - INFO - Participant P00427 (GeneLocus.TCR) has 195828 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00427.parquet.






  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:20:19,991 - assign_clone_ids.ipynb - INFO - Participant P00432 (GeneLocus.TCR) has 226857 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00432.tsv'), (307231, 132), (242692, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:21,285 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBVA*01': 3}


2022-12-28 19:20:21,803 - assign_clone_ids.ipynb - INFO - Participant P00466 (GeneLocus.TCR) has 163307 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00466.parquet.


2022-12-28 19:20:22,610 - assign_clone_ids.ipynb - INFO - Participant P00468 (GeneLocus.TCR) has 146240 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00468.parquet.


2022-12-28 19:20:22,729 - assign_clone_ids.ipynb - INFO - Participant P00477 (GeneLocus.TCR) has 67955 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00477.tsv'), (90421, 132), (72180, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:20:24,811 - assign_clone_ids.ipynb - INFO - Participant P00457 (GeneLocus.TCR) has 176529 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00457.parquet.


2022-12-28 19:20:25,094 - assign_clone_ids.ipynb - INFO - Participant P00455 (GeneLocus.TCR) has 166980 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00455.parquet.


2022-12-28 19:20:26,215 - assign_clone_ids.ipynb - INFO - Participant P00472 (GeneLocus.TCR) has 89840 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00472.parquet.


2022-12-28 19:20:26,808 - assign_clone_ids.ipynb - INFO - Participant P00434 (GeneLocus.TCR) has 206179 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00434.parquet.


2022-12-28 19:20:27,143 - assign_clone_ids.ipynb - INFO - Participant P00439 (GeneLocus.TCR) has 213249 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00439.parquet.


2022-12-28 19:20:29,873 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 10, 'TRBVA*01': 3}


2022-12-28 19:20:30,161 - assign_clone_ids.ipynb - INFO - Participant P00422 (GeneLocus.TCR) has 241477 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00422.tsv'), (322553, 132), (251069, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:30,895 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBVA*01': 3}




2022-12-28 19:20:31,831 - assign_clone_ids.ipynb - INFO - Participant P00433 (GeneLocus.TCR) has 260978 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00433.tsv'), (348114, 132), (272883, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:32,074 - assign_clone_ids.ipynb - INFO - Participant P00477 (GeneLocus.TCR) has 67815 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00477.parquet.






2022-12-28 19:20:34,166 - assign_clone_ids.ipynb - INFO - Participant P00430 (GeneLocus.TCR) has 244697 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00430.tsv'), (335351, 132), (257160, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:34,759 - assign_clone_ids.ipynb - INFO - Participant P00443 (GeneLocus.TCR) has 247402 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00443.tsv'), (329795, 132), (258227, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 19:20:38,956 - assign_clone_ids.ipynb - INFO - Participant P00437 (GeneLocus.TCR) has 288516 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00437.tsv'), (378817, 132), (302137, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:20:39,412 - assign_clone_ids.ipynb - INFO - Participant P00425 (GeneLocus.TCR) has 261682 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00425.tsv'), (334020, 132), (273120, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:40,961 - assign_clone_ids.ipynb - INFO - Participant P00488 (GeneLocus.TCR) has 56381 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00488.tsv'), (74873, 132), (57892, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:41,911 - assign_clone_ids.ipynb - INFO - Participant P00460 (GeneLocus.TCR) has 211573 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00460.parquet.


2022-12-28 19:20:43,606 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9}


  df = pd.read_csv(



2022-12-28 19:20:44,868 - assign_clone_ids.ipynb - INFO - Participant P00423 (GeneLocus.TCR) has 277830 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00423.tsv'), (381538, 132), (290120, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:45,686 - assign_clone_ids.ipynb - INFO - Participant P00442 (GeneLocus.TCR) has 232052 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00442.parquet.


2022-12-28 19:20:47,113 - assign_clone_ids.ipynb - INFO - Participant P00488 (GeneLocus.TCR) has 56333 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00488.parquet.


2022-12-28 19:20:47,451 - assign_clone_ids.ipynb - INFO - Participant P00452 (GeneLocus.TCR) has 205260 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00452.parquet.


2022-12-28 19:20:48,739 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 28, 'TRBVA*01': 1}






2022-12-28 19:20:49,355 - assign_clone_ids.ipynb - INFO - Participant P00480 (GeneLocus.TCR) has 95396 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00480.tsv'), (125323, 132), (98758, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:20:51,560 - assign_clone_ids.ipynb - INFO - Participant P00440 (GeneLocus.TCR) has 219222 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00440.parquet.


2022-12-28 19:20:52,150 - assign_clone_ids.ipynb - INFO - Participant P00467 (GeneLocus.TCR) has 224603 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00467.tsv'), (284751, 132), (233435, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:55,255 - assign_clone_ids.ipynb - INFO - Participant P00432 (GeneLocus.TCR) has 226571 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00432.parquet.


2022-12-28 19:20:55,155 - assign_clone_ids.ipynb - INFO - Participant P00458 (GeneLocus.TCR) has 341403 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00458.tsv'), (440852, 132), (360393, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:20:55,255 - assign_clone_ids.ipynb - INFO - Participant P00445 (GeneLocus.TCR) has 251614 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00445.parquet.




2022-12-28 19:20:55,425 - assign_clone_ids.ipynb - INFO - Participant P00473 (GeneLocus.TCR) has 157286 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00473.tsv'), (199191, 132), (163500, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:20:59,978 - assign_clone_ids.ipynb - INFO - Participant P00422 (GeneLocus.TCR) has 241097 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00422.parquet.


2022-12-28 19:21:00,680 - assign_clone_ids.ipynb - INFO - Participant P00480 (GeneLocus.TCR) has 95276 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00480.parquet.


2022-12-28 19:21:01,222 - assign_clone_ids.ipynb - INFO - Participant P00483 (GeneLocus.TCR) has 80705 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00483.tsv'), (114149, 132), (84871, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:21:06,684 - assign_clone_ids.ipynb - INFO - Participant P00433 (GeneLocus.TCR) has 260547 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00433.parquet.


2022-12-28 19:21:06,685 - assign_clone_ids.ipynb - INFO - Participant P00443 (GeneLocus.TCR) has 247038 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00443.parquet.


2022-12-28 19:21:08,640 - assign_clone_ids.ipynb - INFO - Participant P00430 (GeneLocus.TCR) has 244399 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00430.parquet.


  df = pd.read_csv(



2022-12-28 19:21:11,374 - assign_clone_ids.ipynb - INFO - Participant P00454 (GeneLocus.TCR) has 349706 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00454.tsv'), (473343, 132), (372766, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:21:11,969 - assign_clone_ids.ipynb - INFO - Participant P00478 (GeneLocus.TCR) has 128137 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00478.tsv'), (173166, 132), (133406, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:21:15,711 - assign_clone_ids.ipynb - INFO - Participant P00483 (GeneLocus.TCR) has 80562 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00483.parquet.


2022-12-28 19:21:15,711 - assign_clone_ids.ipynb - INFO - Participant P00425 (GeneLocus.TCR) has 261239 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00425.parquet.


2022-12-28 19:21:15,711 - assign_clone_ids.ipynb - INFO - Participant P00437 (GeneLocus.TCR) has 288192 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00437.parquet.


2022-12-28 19:21:18,098 - assign_clone_ids.ipynb - INFO - Participant P00421 (GeneLocus.TCR) has 406818 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00421.tsv'), (543802, 132), (432513, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:21:19,980 - assign_clone_ids.ipynb - INFO - Participant P00473 (GeneLocus.TCR) has 157043 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00473.parquet.


2022-12-28 19:21:20,689 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1}




2022-12-28 19:21:24,303 - assign_clone_ids.ipynb - INFO - Participant P00496 (GeneLocus.TCR) has 22661 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00496.tsv'), (28411, 132), (23138, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:21:30,581 - assign_clone_ids.ipynb - INFO - Participant P00467 (GeneLocus.TCR) has 224316 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00467.parquet.


2022-12-28 19:21:30,582 - assign_clone_ids.ipynb - INFO - Participant P00423 (GeneLocus.TCR) has 277390 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00423.parquet.


  df = pd.read_csv(



2022-12-28 19:21:34,794 - assign_clone_ids.ipynb - INFO - Participant P00496 (GeneLocus.TCR) has 22609 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00496.parquet.


2022-12-28 19:21:34,794 - assign_clone_ids.ipynb - INFO - Participant P00478 (GeneLocus.TCR) has 127945 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00478.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:21:43,920 - assign_clone_ids.ipynb - INFO - Participant P00458 (GeneLocus.TCR) has 340909 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00458.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:21:51,035 - assign_clone_ids.ipynb - INFO - Participant P00454 (GeneLocus.TCR) has 349124 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00454.parquet.


  df = pd.read_csv(



2022-12-28 19:22:01,787 - assign_clone_ids.ipynb - INFO - Participant P00421 (GeneLocus.TCR) has 406314 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00421.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:22:54,323 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 6, 'TRBV7-5*02': 13, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 19:22:58,579 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 4}


2022-12-28 19:23:00,247 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 5, 'TRBV8-2*01': 1}








2022-12-28 19:23:06,236 - assign_clone_ids.ipynb - INFO - Participant P00506 (GeneLocus.TCR) has 29490 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00506.tsv'), (39179, 132), (30294, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:23:08,668 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 3}


2022-12-28 19:23:10,273 - assign_clone_ids.ipynb - INFO - Participant P00506 (GeneLocus.TCR) has 29449 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00506.parquet.






2022-12-28 19:23:11,389 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 27, 'TRBVA*01': 2}


2022-12-28 19:23:12,839 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBVA*01': 1}


2022-12-28 19:23:12,975 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5}


2022-12-28 19:23:13,838 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV8-2*01': 1, 'TRBVA*01': 3}






2022-12-28 19:23:16,292 - assign_clone_ids.ipynb - INFO - Participant P00481 (GeneLocus.TCR) has 122111 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00481.tsv'), (161165, 132), (128393, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:19,134 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBVA*01': 1}




2022-12-28 19:23:19,194 - assign_clone_ids.ipynb - INFO - Participant P00515 (GeneLocus.TCR) has 51487 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00515.tsv'), (68796, 132), (53330, 144), <GeneLocus.TCR: 2>)]










2022-12-28 19:23:19,793 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 3}


2022-12-28 19:23:22,761 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 4, 'TRBV7-5*02': 7, 'TRBVA*01': 1}


  df = pd.read_csv(



2022-12-28 19:23:23,753 - assign_clone_ids.ipynb - INFO - Participant P00507 (GeneLocus.TCR) has 62421 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00507.tsv'), (85830, 132), (65646, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:23:25,599 - assign_clone_ids.ipynb - INFO - Participant P00515 (GeneLocus.TCR) has 51424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00515.parquet.


2022-12-28 19:23:26,524 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 20}


2022-12-28 19:23:27,358 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 1}


2022-12-28 19:23:27,797 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 5}


2022-12-28 19:23:27,970 - assign_clone_ids.ipynb - INFO - Participant P00479 (GeneLocus.TCR) has 166778 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00479.tsv'), (215249, 132), (171616, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:23:30,502 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 17, 'TRBV7-5*02': 19, 'TRBVA*01': 1}


2022-12-28 19:23:30,779 - assign_clone_ids.ipynb - INFO - Participant P00481 (GeneLocus.TCR) has 121918 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00481.parquet.


2022-12-28 19:23:30,945 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 2, 'TRBV7-5*01': 16, 'TRBV7-5*02': 14, 'TRBV8-1*01': 1, 'TRBVA*01': 2}


2022-12-28 19:23:31,639 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 1}


2022-12-28 19:23:31,740 - assign_clone_ids.ipynb - INFO - Participant P00507 (GeneLocus.TCR) has 62338 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00507.parquet.




2022-12-28 19:23:33,507 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 6, 'TRBVA*01': 3}


2022-12-28 19:23:33,534 - assign_clone_ids.ipynb - INFO - Participant P00493 (GeneLocus.TCR) has 78072 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00493.tsv'), (142940, 132), (100848, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:35,404 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 32, 'TRBVA*01': 4}


2022-12-28 19:23:36,482 - assign_clone_ids.ipynb - INFO - Participant P00489 (GeneLocus.TCR) has 128501 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00489.tsv'), (161732, 132), (131920, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:36,958 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 37, 'TRBVA*01': 3}




2022-12-28 19:23:37,748 - assign_clone_ids.ipynb - INFO - Participant P00519 (GeneLocus.TCR) has 96722 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00519.tsv'), (120096, 132), (98994, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:23:38,366 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 29, 'TRBVA*01': 3}






2022-12-28 19:23:40,949 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 41, 'TRBV8-1*01': 1, 'TRBVA*01': 4}












2022-12-28 19:23:42,226 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 3}








2022-12-28 19:23:43,972 - assign_clone_ids.ipynb - INFO - Participant P00482 (GeneLocus.TCR) has 148809 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00482.tsv'), (212190, 132), (158038, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:44,293 - assign_clone_ids.ipynb - INFO - Participant P00493 (GeneLocus.TCR) has 77944 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00493.parquet.


2022-12-28 19:23:44,944 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 37, 'TRBVA*01': 3}




  df = pd.read_csv(





2022-12-28 19:23:48,495 - assign_clone_ids.ipynb - INFO - Participant P00503 (GeneLocus.TCR) has 112577 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00503.tsv'), (133565, 132), (115745, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:23:49,607 - assign_clone_ids.ipynb - INFO - Participant P00479 (GeneLocus.TCR) has 166401 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00479.parquet.


2022-12-28 19:23:50,030 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 5}


2022-12-28 19:23:50,409 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 53, 'TRBVA*01': 5}


  df = pd.read_csv(



2022-12-28 19:23:51,290 - assign_clone_ids.ipynb - INFO - Participant P00501 (GeneLocus.TCR) has 104787 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00501.tsv'), (145688, 132), (109007, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:53,232 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 1}




2022-12-28 19:23:54,074 - assign_clone_ids.ipynb - INFO - Participant P00519 (GeneLocus.TCR) has 96533 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00519.parquet.




  df = pd.read_csv(



2022-12-28 19:23:54,665 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 12, 'TRBVA*01': 8}




2022-12-28 19:23:56,804 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBV7-5*02': 1, 'TRBVA*01': 2}


2022-12-28 19:23:57,488 - assign_clone_ids.ipynb - INFO - Participant P00491 (GeneLocus.TCR) has 150554 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00491.tsv'), (211833, 132), (157585, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:23:58,933 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 19, 'TRBVA*01': 2}






2022-12-28 19:23:59,498 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 8}


2022-12-28 19:24:00,410 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 58, 'TRBV8-2*01': 1, 'TRBVA*01': 10}


2022-12-28 19:24:02,929 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 44, 'TRBVA*01': 6}


  df = pd.read_csv(



2022-12-28 19:24:03,249 - assign_clone_ids.ipynb - INFO - Participant P00489 (GeneLocus.TCR) has 128263 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00489.parquet.






2022-12-28 19:24:04,425 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 41, 'TRBVA*01': 3}




2022-12-28 19:24:06,067 - assign_clone_ids.ipynb - INFO - Participant P00514 (GeneLocus.TCR) has 125337 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00514.tsv'), (165093, 132), (133514, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:06,208 - assign_clone_ids.ipynb - INFO - Participant P00485 (GeneLocus.TCR) has 188675 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00485.tsv'), (235098, 132), (196243, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:06,364 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 44, 'TRBVA*01': 4}


  df = pd.read_csv(







2022-12-28 19:24:07,123 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 51, 'TRBVA*01': 6}


2022-12-28 19:24:08,524 - assign_clone_ids.ipynb - INFO - Participant P00509 (GeneLocus.TCR) has 131301 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00509.tsv'), (182790, 132), (137596, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:08,720 - assign_clone_ids.ipynb - INFO - Participant P00486 (GeneLocus.TCR) has 200838 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00486.tsv'), (263814, 132), (212610, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:09,879 - assign_clone_ids.ipynb - INFO - Participant P00501 (GeneLocus.TCR) has 104643 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00501.parquet.


2022-12-28 19:24:10,168 - assign_clone_ids.ipynb - INFO - Participant P00482 (GeneLocus.TCR) has 148614 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00482.parquet.


2022-12-28 19:24:10,331 - assign_clone_ids.ipynb - INFO - Participant P00494 (GeneLocus.TCR) has 131926 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00494.tsv'), (170996, 132), (135630, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:24:11,488 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 32, 'TRBVA*01': 1}


2022-12-28 19:24:11,655 - assign_clone_ids.ipynb - INFO - Participant P00503 (GeneLocus.TCR) has 112438 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00503.parquet.




2022-12-28 19:24:13,992 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBV7-5*02': 44}












2022-12-28 19:24:15,952 - assign_clone_ids.ipynb - INFO - Participant P00508 (GeneLocus.TCR) has 151741 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00508.tsv'), (198755, 132), (157874, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:24:16,397 - assign_clone_ids.ipynb - INFO - Participant P00484 (GeneLocus.TCR) has 196592 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00484.tsv'), (268961, 132), (203683, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:18,280 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 34, 'TRBVA*01': 2}


2022-12-28 19:24:18,370 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 22, 'TRBV7-5*02': 35, 'TRBVA*01': 10}


2022-12-28 19:24:19,800 - assign_clone_ids.ipynb - INFO - Participant P00502 (GeneLocus.TCR) has 153105 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00502.tsv'), (196057, 132), (161126, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:24:21,348 - assign_clone_ids.ipynb - INFO - Participant P00490 (GeneLocus.TCR) has 178191 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00490.tsv'), (232364, 132), (185068, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:21,926 - assign_clone_ids.ipynb - INFO - Participant P00500 (GeneLocus.TCR) has 182634 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00500.tsv'), (233494, 132), (194015, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:22,960 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 37, 'TRBVA*01': 4}


2022-12-28 19:24:24,142 - assign_clone_ids.ipynb - INFO - Participant P00521 (GeneLocus.TCR) has 116813 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00521.tsv'), (159262, 132), (120946, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:24,348 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15}








2022-12-28 19:24:25,288 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 18, 'TRBVA*01': 6}




2022-12-28 19:24:25,822 - assign_clone_ids.ipynb - INFO - Participant P00514 (GeneLocus.TCR) has 125146 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00514.parquet.


2022-12-28 19:24:25,924 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 2, 'TRBV7-5*02': 60, 'TRBVA*01': 3}


2022-12-28 19:24:26,683 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 4}


  df = pd.read_csv(





2022-12-28 19:24:27,495 - assign_clone_ids.ipynb - INFO - Participant P00491 (GeneLocus.TCR) has 150358 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00491.parquet.




2022-12-28 19:24:28,798 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 14, 'TRBV7-5*02': 23, 'TRBVA*01': 3}


2022-12-28 19:24:30,344 - assign_clone_ids.ipynb - INFO - Participant P00509 (GeneLocus.TCR) has 131098 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00509.parquet.










2022-12-28 19:24:31,575 - assign_clone_ids.ipynb - INFO - Participant P00518 (GeneLocus.TCR) has 172774 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00518.tsv'), (220854, 132), (178389, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 19:24:33,007 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 17, 'TRBVA*01': 1}




2022-12-28 19:24:38,124 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 7}




  df = pd.read_csv(









2022-12-28 19:24:39,988 - assign_clone_ids.ipynb - INFO - Participant P00512 (GeneLocus.TCR) has 186619 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00512.tsv'), (236404, 132), (193446, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:40,024 - assign_clone_ids.ipynb - INFO - Participant P00485 (GeneLocus.TCR) has 188484 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00485.parquet.


2022-12-28 19:24:40,142 - assign_clone_ids.ipynb - INFO - Participant P00494 (GeneLocus.TCR) has 131775 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00494.parquet.


2022-12-28 19:24:43,091 - assign_clone_ids.ipynb - INFO - Participant P00499 (GeneLocus.TCR) has 217514 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00499.tsv'), (285212, 132), (226991, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:43,339 - assign_clone_ids.ipynb - INFO - Participant P00504 (GeneLocus.TCR) has 199863 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00504.tsv'), (267197, 132), (207657, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:24:43,845 - assign_clone_ids.ipynb - INFO - Participant P00508 (GeneLocus.TCR) has 151500 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00508.parquet.










2022-12-28 19:24:46,534 - assign_clone_ids.ipynb - INFO - Participant P00521 (GeneLocus.TCR) has 116647 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00521.parquet.




2022-12-28 19:24:47,155 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 5, 'TRBV7-5*02': 5, 'TRBVA*01': 2}


2022-12-28 19:24:47,237 - assign_clone_ids.ipynb - INFO - Participant P00486 (GeneLocus.TCR) has 200586 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00486.parquet.


2022-12-28 19:24:47,304 - assign_clone_ids.ipynb - INFO - Participant P00484 (GeneLocus.TCR) has 196273 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00484.parquet.




2022-12-28 19:24:47,549 - assign_clone_ids.ipynb - INFO - Participant P00502 (GeneLocus.TCR) has 152890 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00502.parquet.


  df = pd.read_csv(



2022-12-28 19:24:49,028 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 35, 'TRBVA*01': 4}




2022-12-28 19:24:52,218 - assign_clone_ids.ipynb - INFO - Participant P00490 (GeneLocus.TCR) has 177957 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00490.parquet.




2022-12-28 19:24:54,230 - assign_clone_ids.ipynb - INFO - Participant P00500 (GeneLocus.TCR) has 182381 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00500.parquet.


2022-12-28 19:24:54,235 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 41, 'TRBVA*01': 3}


2022-12-28 19:24:55,288 - assign_clone_ids.ipynb - INFO - Participant P00498 (GeneLocus.TCR) has 224686 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00498.tsv'), (305214, 132), (235613, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:24:56,381 - assign_clone_ids.ipynb - INFO - Participant P00495 (GeneLocus.TCR) has 184022 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00495.tsv'), (254102, 132), (193302, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:57,802 - assign_clone_ids.ipynb - INFO - Participant P00523 (GeneLocus.TCR) has 160125 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00523.tsv'), (213131, 132), (166117, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:24:59,468 - assign_clone_ids.ipynb - INFO - Participant P00534 (GeneLocus.TCR) has 53016 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00534.tsv'), (73979, 132), (55300, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:00,824 - assign_clone_ids.ipynb - INFO - Participant P00517 (GeneLocus.TCR) has 206999 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00517.tsv'), (259131, 132), (215549, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:25:01,187 - assign_clone_ids.ipynb - INFO - Participant P00527 (GeneLocus.TCR) has 107413 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00527.tsv'), (138825, 132), (111441, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:25:03,221 - assign_clone_ids.ipynb - INFO - Participant P00528 (GeneLocus.TCR) has 112668 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00528.tsv'), (150384, 132), (118415, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:03,718 - assign_clone_ids.ipynb - INFO - Participant P00529 (GeneLocus.TCR) has 77890 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00529.tsv'), (105879, 132), (80662, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:25:05,289 - assign_clone_ids.ipynb - INFO - Participant P00518 (GeneLocus.TCR) has 172511 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00518.parquet.


2022-12-28 19:25:05,373 - assign_clone_ids.ipynb - INFO - Participant P00534 (GeneLocus.TCR) has 52929 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00534.parquet.


2022-12-28 19:25:06,668 - assign_clone_ids.ipynb - INFO - Participant P00516 (GeneLocus.TCR) has 202998 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00516.tsv'), (274834, 132), (212728, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:25:08,838 - assign_clone_ids.ipynb - INFO - Participant P00505 (GeneLocus.TCR) has 228760 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00505.tsv'), (302162, 132), (240327, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:10,107 - assign_clone_ids.ipynb - INFO - Participant P00476 (GeneLocus.TCR) has 260192 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00476.tsv'), (358794, 132), (273235, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:12,158 - assign_clone_ids.ipynb - INFO - Participant P00504 (GeneLocus.TCR) has 199526 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00504.parquet.


2022-12-28 19:25:12,863 - assign_clone_ids.ipynb - INFO - Participant P00529 (GeneLocus.TCR) has 77775 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00529.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:25:13,705 - assign_clone_ids.ipynb - INFO - Participant P00525 (GeneLocus.TCR) has 153902 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00525.tsv'), (213383, 132), (160614, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:15,084 - assign_clone_ids.ipynb - INFO - Participant P00522 (GeneLocus.TCR) has 204056 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00522.tsv'), (274160, 132), (213464, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:15,723 - assign_clone_ids.ipynb - INFO - Participant P00513 (GeneLocus.TCR) has 235827 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00513.tsv'), (306677, 132), (246471, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:17,280 - assign_clone_ids.ipynb - INFO - Participant P00487 (GeneLocus.TCR) has 289897 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00487.tsv'), (385985, 132), (305290, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:21,193 - assign_clone_ids.ipynb - INFO - Participant P00511 (GeneLocus.TCR) has 256010 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00511.tsv'), (341863, 132), (271292, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:25:39,273 - assign_clone_ids.ipynb - INFO - Participant P00520 (GeneLocus.TCR) has 272150 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00520.tsv'), (363693, 132), (284839, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:40,400 - assign_clone_ids.ipynb - INFO - Participant P00524 (GeneLocus.TCR) has 170016 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00524.tsv'), (227152, 132), (178374, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:25:44,890 - assign_clone_ids.ipynb - INFO - Participant P00492 (GeneLocus.TCR) has 259088 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00492.tsv'), (332914, 132), (269979, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:45,952 - assign_clone_ids.ipynb - INFO - Participant P00512 (GeneLocus.TCR) has 186430 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00512.parquet.


  df = pd.read_csv(



2022-12-28 19:25:47,976 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 4}


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:25:50,460 - assign_clone_ids.ipynb - INFO - Participant P00474 (GeneLocus.TCR) has 332657 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00474.tsv'), (463330, 132), (357178, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:25:51,086 - assign_clone_ids.ipynb - INFO - Participant P00499 (GeneLocus.TCR) has 217249 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00499.parquet.




  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:26:02,064 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 1}


  df = pd.read_csv(



2022-12-28 19:26:06,046 - assign_clone_ids.ipynb - INFO - Participant P00542 (GeneLocus.TCR) has 49231 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00542.tsv'), (60834, 132), (49941, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:26:10,991 - assign_clone_ids.ipynb - INFO - Participant P00528 (GeneLocus.TCR) has 112563 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00528.parquet.


2022-12-28 19:26:16,995 - assign_clone_ids.ipynb - INFO - Participant P00527 (GeneLocus.TCR) has 107308 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00527.parquet.


  df = pd.read_csv(



2022-12-28 19:26:25,222 - assign_clone_ids.ipynb - INFO - Participant P00542 (GeneLocus.TCR) has 49113 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00542.parquet.


  df = pd.read_csv(



2022-12-28 19:26:31,616 - assign_clone_ids.ipynb - INFO - Participant P00543 (GeneLocus.TCR) has 112535 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00543.tsv'), (139359, 132), (115491, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:26:52,151 - assign_clone_ids.ipynb - INFO - Participant P00543 (GeneLocus.TCR) has 112360 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00543.parquet.


2022-12-28 19:26:56,163 - assign_clone_ids.ipynb - INFO - Participant P00526 (GeneLocus.TCR) has 191692 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00526.tsv'), (267763, 132), (202684, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:01,652 - assign_clone_ids.ipynb - INFO - Participant P00523 (GeneLocus.TCR) has 159921 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00523.parquet.


2022-12-28 19:27:02,526 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24, 'TRBVA*01': 2}


2022-12-28 19:27:02,525 - assign_clone_ids.ipynb - INFO - Participant P00498 (GeneLocus.TCR) has 224422 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00498.parquet.


2022-12-28 19:27:02,993 - assign_clone_ids.ipynb - INFO - Participant P00495 (GeneLocus.TCR) has 183797 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00495.parquet.


2022-12-28 19:27:03,535 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 13, 'TRBVA*01': 2}


2022-12-28 19:27:04,177 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBVA*01': 3}


2022-12-28 19:27:04,217 - assign_clone_ids.ipynb - INFO - Participant P00525 (GeneLocus.TCR) has 153709 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00525.parquet.


2022-12-28 19:27:04,498 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 56, 'TRBVA*01': 2}


2022-12-28 19:27:04,917 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 2}


2022-12-28 19:27:05,014 - assign_clone_ids.ipynb - INFO - Participant P00517 (GeneLocus.TCR) has 206743 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00517.parquet.


2022-12-28 19:27:05,836 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 19, 'TRBVA*01': 6}


2022-12-28 19:27:06,089 - assign_clone_ids.ipynb - INFO - Participant P00505 (GeneLocus.TCR) has 228453 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00505.parquet.


2022-12-28 19:27:06,073 - assign_clone_ids.ipynb - INFO - Participant P00516 (GeneLocus.TCR) has 202747 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00516.parquet.


2022-12-28 19:27:06,385 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 27, 'TRBVA*01': 2}


2022-12-28 19:27:06,452 - assign_clone_ids.ipynb - INFO - Participant P00522 (GeneLocus.TCR) has 203856 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00522.parquet.


2022-12-28 19:27:07,264 - assign_clone_ids.ipynb - INFO - Participant P00524 (GeneLocus.TCR) has 169778 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00524.parquet.


2022-12-28 19:27:08,479 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 4, 'TRBV7-5*02': 13, 'TRBVA*01': 1}


2022-12-28 19:27:08,669 - assign_clone_ids.ipynb - INFO - Participant P00511 (GeneLocus.TCR) has 255670 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00511.parquet.


2022-12-28 19:27:09,027 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 7}


  df = pd.read_csv(



2022-12-28 19:27:09,429 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 20}


2022-12-28 19:27:09,507 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 32, 'TRBVA*01': 5}


2022-12-28 19:27:10,382 - assign_clone_ids.ipynb - INFO - Participant P00513 (GeneLocus.TCR) has 235537 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00513.parquet.


2022-12-28 19:27:10,383 - assign_clone_ids.ipynb - INFO - Participant P00487 (GeneLocus.TCR) has 289548 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00487.parquet.


2022-12-28 19:27:10,421 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 27, 'TRBVA*01': 3}


2022-12-28 19:27:10,613 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 28, 'TRBVA*01': 3}


2022-12-28 19:27:11,732 - assign_clone_ids.ipynb - INFO - Participant P00530 (GeneLocus.TCR) has 149442 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00530.tsv'), (197965, 132), (155321, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:27:11,889 - assign_clone_ids.ipynb - INFO - Participant P00476 (GeneLocus.TCR) has 259738 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00476.parquet.






















2022-12-28 19:27:18,987 - assign_clone_ids.ipynb - INFO - Participant P00492 (GeneLocus.TCR) has 258650 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00492.parquet.






2022-12-28 19:27:20,316 - assign_clone_ids.ipynb - INFO - Participant P00520 (GeneLocus.TCR) has 271741 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00520.parquet.


2022-12-28 19:27:21,083 - assign_clone_ids.ipynb - INFO - Participant P00526 (GeneLocus.TCR) has 191348 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00526.parquet.










2022-12-28 19:27:23,204 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 16, 'TRBVA*01': 1}




2022-12-28 19:27:25,068 - assign_clone_ids.ipynb - INFO - Participant P00510 (GeneLocus.TCR) has 304801 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00510.tsv'), (409310, 132), (319589, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:28,837 - assign_clone_ids.ipynb - INFO - Participant P00497 (GeneLocus.TCR) has 352806 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00497.tsv'), (470349, 132), (375545, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:29,244 - assign_clone_ids.ipynb - INFO - Participant P00530 (GeneLocus.TCR) has 149230 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00530.parquet.


2022-12-28 19:27:30,262 - assign_clone_ids.ipynb - INFO - Participant P00537 (GeneLocus.TCR) has 122796 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00537.tsv'), (172727, 132), (129004, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:32,946 - assign_clone_ids.ipynb - INFO - Participant P00535 (GeneLocus.TCR) has 145306 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00535.tsv'), (197822, 132), (151189, 144), <GeneLocus.TCR: 2>)]




  df = pd.read_csv(



2022-12-28 19:27:35,220 - assign_clone_ids.ipynb - INFO - Participant P00474 (GeneLocus.TCR) has 332008 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00474.parquet.


2022-12-28 19:27:35,539 - assign_clone_ids.ipynb - INFO - Participant P00547 (GeneLocus.TCR) has 164453 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00547.tsv'), (221378, 132), (171029, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:27:39,526 - assign_clone_ids.ipynb - INFO - Participant P00548 (GeneLocus.TCR) has 142212 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00548.tsv'), (194975, 132), (148817, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:39,889 - assign_clone_ids.ipynb - INFO - Participant P00544 (GeneLocus.TCR) has 180572 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00544.tsv'), (220377, 132), (186571, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:27:42,537 - assign_clone_ids.ipynb - INFO - Participant P00540 (GeneLocus.TCR) has 170047 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00540.tsv'), (217285, 132), (176630, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:43,095 - assign_clone_ids.ipynb - INFO - Participant P00531 (GeneLocus.TCR) has 202164 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00531.tsv'), (266582, 132), (216037, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:43,978 - assign_clone_ids.ipynb - INFO - Participant P00536 (GeneLocus.TCR) has 216085 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00536.tsv'), (283101, 132), (223188, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:44,546 - assign_clone_ids.ipynb - INFO - Participant P00538 (GeneLocus.TCR) has 186036 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00538.tsv'), (253841, 132), (194837, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:45,432 - assign_clone_ids.ipynb - INFO - Participant P00537 (GeneLocus.TCR) has 122562 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00537.parquet.


2022-12-28 19:27:47,697 - assign_clone_ids.ipynb - INFO - Participant P00546 (GeneLocus.TCR) has 194865 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00546.tsv'), (252624, 132), (204297, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:47,347 - assign_clone_ids.ipynb - INFO - Participant P00539 (GeneLocus.TCR) has 226808 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00539.tsv'), (289530, 132), (236369, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:27:51,262 - assign_clone_ids.ipynb - INFO - Participant P00533 (GeneLocus.TCR) has 200519 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00533.tsv'), (258612, 132), (207356, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:52,173 - assign_clone_ids.ipynb - INFO - Participant P00535 (GeneLocus.TCR) has 145140 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00535.parquet.


2022-12-28 19:27:54,247 - assign_clone_ids.ipynb - INFO - Participant P00547 (GeneLocus.TCR) has 164249 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00547.parquet.


2022-12-28 19:27:55,986 - assign_clone_ids.ipynb - INFO - Participant P00551 (GeneLocus.TCR) has 174895 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00551.tsv'), (225874, 132), (181773, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:27:57,811 - assign_clone_ids.ipynb - INFO - Participant P00541 (GeneLocus.TCR) has 277120 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00541.tsv'), (383389, 132), (294166, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:27:58,802 - assign_clone_ids.ipynb - INFO - Participant P00548 (GeneLocus.TCR) has 141990 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00548.parquet.


2022-12-28 19:28:02,613 - assign_clone_ids.ipynb - INFO - Participant P00544 (GeneLocus.TCR) has 180245 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00544.parquet.


2022-12-28 19:28:02,614 - assign_clone_ids.ipynb - INFO - Participant P00510 (GeneLocus.TCR) has 304207 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00510.parquet.


2022-12-28 19:28:06,602 - assign_clone_ids.ipynb - INFO - Participant P00540 (GeneLocus.TCR) has 169783 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00540.parquet.


  df = pd.read_csv(



2022-12-28 19:28:09,881 - assign_clone_ids.ipynb - INFO - Participant P00538 (GeneLocus.TCR) has 185734 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00538.parquet.


  df = pd.read_csv(



2022-12-28 19:28:13,131 - assign_clone_ids.ipynb - INFO - Participant P00531 (GeneLocus.TCR) has 201937 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00531.parquet.


2022-12-28 19:28:13,134 - assign_clone_ids.ipynb - INFO - Participant P00536 (GeneLocus.TCR) has 215759 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00536.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:28:16,427 - assign_clone_ids.ipynb - INFO - Participant P00546 (GeneLocus.TCR) has 194635 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00546.parquet.


2022-12-28 19:28:16,427 - assign_clone_ids.ipynb - INFO - Participant P00497 (GeneLocus.TCR) has 352324 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00497.parquet.


2022-12-28 19:28:17,864 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 43, 'TRBVA*01': 2}


2022-12-28 19:28:20,718 - assign_clone_ids.ipynb - INFO - Participant P00539 (GeneLocus.TCR) has 226560 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00539.parquet.


2022-12-28 19:28:25,460 - assign_clone_ids.ipynb - INFO - Participant P00533 (GeneLocus.TCR) has 200232 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00533.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:28:30,955 - assign_clone_ids.ipynb - INFO - Participant P00551 (GeneLocus.TCR) has 174645 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00551.parquet.




  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:28:41,691 - assign_clone_ids.ipynb - INFO - Participant P00541 (GeneLocus.TCR) has 276663 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00541.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(





2022-12-28 19:28:59,311 - assign_clone_ids.ipynb - INFO - Participant P00572 (GeneLocus.TCR) has 10446 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00572.tsv'), (12937, 132), (10657, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:29:00,551 - assign_clone_ids.ipynb - INFO - Participant P00572 (GeneLocus.TCR) has 10427 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00572.parquet.


2022-12-28 19:29:04,280 - assign_clone_ids.ipynb - INFO - Participant P00549 (GeneLocus.TCR) has 304420 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00549.tsv'), (399632, 132), (323024, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:29:36,119 - assign_clone_ids.ipynb - INFO - Participant P00549 (GeneLocus.TCR) has 303928 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00549.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:30:05,975 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 4}




2022-12-28 19:30:08,799 - assign_clone_ids.ipynb - INFO - Participant P00575 (GeneLocus.TCR) has 14769 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00575.tsv'), (17672, 132), (14869, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:30:10,760 - assign_clone_ids.ipynb - INFO - Participant P00575 (GeneLocus.TCR) has 14732 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00575.parquet.


2022-12-28 19:30:12,929 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 24}


2022-12-28 19:30:13,886 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 2, 'TRBVA*01': 1}


2022-12-28 19:30:15,022 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 38, 'TRBVA*01': 3}




2022-12-28 19:30:16,749 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 2, 'TRBVA*01': 3}


2022-12-28 19:30:17,807 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 10, 'TRBVA*01': 6}




  df = pd.read_csv(



2022-12-28 19:30:19,046 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV8-2*01': 1}


2022-12-28 19:30:19,087 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 15, 'TRBVA*01': 4}


2022-12-28 19:30:21,141 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 10, 'TRBV8-2*01': 1, 'TRBVA*01': 2}








2022-12-28 19:30:23,140 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 35, 'TRBVA*01': 5}










2022-12-28 19:30:25,749 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 13}


2022-12-28 19:30:25,817 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 12, 'TRBVA*01': 3}


2022-12-28 19:30:26,426 - assign_clone_ids.ipynb - INFO - Participant P00560 (GeneLocus.TCR) has 81095 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00560.tsv'), (107538, 132), (83526, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:30:27,811 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 31, 'TRBVA*01': 4}


2022-12-28 19:30:28,127 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 31, 'TRBVA*01': 2}


2022-12-28 19:30:28,719 - assign_clone_ids.ipynb - INFO - Participant P00565 (GeneLocus.TCR) has 83937 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00565.tsv'), (98903, 132), (85264, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:30:30,887 - assign_clone_ids.ipynb - INFO - Participant P00588 (GeneLocus.TCR) has 67529 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00588.tsv'), (83030, 132), (68844, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:30:32,403 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 11}






2022-12-28 19:30:34,186 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 3}




2022-12-28 19:30:34,979 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5}


2022-12-28 19:30:34,983 - assign_clone_ids.ipynb - INFO - Participant P00573 (GeneLocus.TCR) has 69674 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00573.tsv'), (90689, 132), (72636, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:30:35,010 - assign_clone_ids.ipynb - INFO - Participant P00560 (GeneLocus.TCR) has 80966 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00560.parquet.


2022-12-28 19:30:35,896 - assign_clone_ids.ipynb - INFO - Participant P00567 (GeneLocus.TCR) has 86215 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00567.tsv'), (113013, 132), (88559, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:30:36,649 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 2}


2022-12-28 19:30:37,916 - assign_clone_ids.ipynb - INFO - Participant P00561 (GeneLocus.TCR) has 104929 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00561.tsv'), (140800, 132), (108702, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:30:38,252 - assign_clone_ids.ipynb - INFO - Participant P00588 (GeneLocus.TCR) has 67448 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00588.parquet.






2022-12-28 19:30:39,732 - assign_clone_ids.ipynb - INFO - Participant P00565 (GeneLocus.TCR) has 83801 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00565.parquet.




2022-12-28 19:30:41,036 - assign_clone_ids.ipynb - INFO - Participant P00557 (GeneLocus.TCR) has 123470 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00557.tsv'), (161433, 132), (128311, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:30:41,855 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 19:30:42,033 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 14, 'TRBVA*01': 4}


2022-12-28 19:30:43,382 - assign_clone_ids.ipynb - INFO - Participant P00576 (GeneLocus.TCR) has 83090 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00576.tsv'), (110786, 132), (86135, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:30:45,072 - assign_clone_ids.ipynb - INFO - Participant P00573 (GeneLocus.TCR) has 69531 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00573.parquet.


2022-12-28 19:30:45,156 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 22, 'TRBVA*01': 6}


2022-12-28 19:30:47,115 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 21, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 19:30:47,977 - assign_clone_ids.ipynb - INFO - Participant P00567 (GeneLocus.TCR) has 86098 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00567.parquet.


  df = pd.read_csv(



2022-12-28 19:30:52,511 - assign_clone_ids.ipynb - INFO - Participant P00590 (GeneLocus.TCR) has 59467 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00590.tsv'), (82016, 132), (61313, 144), <GeneLocus.TCR: 2>)]






  df = pd.read_csv(



2022-12-28 19:30:56,897 - assign_clone_ids.ipynb - INFO - Participant P00576 (GeneLocus.TCR) has 82900 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00576.parquet.


2022-12-28 19:30:57,567 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 7, 'TRBV7-5*02': 15}


  df = pd.read_csv(





  df = pd.read_csv(





  df = pd.read_csv(



2022-12-28 19:31:07,080 - assign_clone_ids.ipynb - INFO - Participant P00590 (GeneLocus.TCR) has 59361 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00590.parquet.


2022-12-28 19:31:08,424 - assign_clone_ids.ipynb - INFO - Participant P00580 (GeneLocus.TCR) has 138962 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00580.tsv'), (187198, 132), (143913, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:31:10,158 - assign_clone_ids.ipynb - INFO - Participant P00561 (GeneLocus.TCR) has 104797 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00561.parquet.


  df = pd.read_csv(



2022-12-28 19:31:12,032 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBV8-1*01': 1, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 19:31:12,082 - assign_clone_ids.ipynb - INFO - Participant P00557 (GeneLocus.TCR) has 123269 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00557.parquet.


2022-12-28 19:31:14,088 - assign_clone_ids.ipynb - INFO - Participant P00559 (GeneLocus.TCR) has 121129 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00559.tsv'), (175390, 132), (134451, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:31:16,077 - assign_clone_ids.ipynb - INFO - Participant P00553 (GeneLocus.TCR) has 182357 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00553.tsv'), (239344, 132), (189511, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:16,128 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 51, 'TRBVA*01': 1}


2022-12-28 19:31:16,668 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBV7-5*02': 1, 'TRBVA*01': 3}


2022-12-28 19:31:16,780 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 27, 'TRBVA*01': 2}


2022-12-28 19:31:16,744 - assign_clone_ids.ipynb - INFO - Participant P00555 (GeneLocus.TCR) has 162668 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00555.tsv'), (212798, 132), (168224, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:16,920 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 42, 'TRBVA*01': 1}


2022-12-28 19:31:17,147 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 24, 'TRBVA*01': 2}


2022-12-28 19:31:17,706 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 33, 'TRBVA*01': 2}


2022-12-28 19:31:18,154 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 43, 'TRBVA*01': 8}


2022-12-28 19:31:18,429 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 25, 'TRBVA*01': 3}


2022-12-28 19:31:18,520 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 2}


2022-12-28 19:31:19,396 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 25, 'TRBVA*01': 7}


2022-12-28 19:31:19,939 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBVA*01': 4}


2022-12-28 19:31:20,570 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*02': 41, 'TRBVA*01': 5}


2022-12-28 19:31:21,904 - assign_clone_ids.ipynb - INFO - Participant P00570 (GeneLocus.TCR) has 121667 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00570.tsv'), (154611, 132), (126409, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:22,486 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 34, 'TRBVA*01': 3}




2022-12-28 19:31:24,134 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 44, 'TRBVA*01': 1}


2022-12-28 19:31:25,503 - assign_clone_ids.ipynb - INFO - Participant P00587 (GeneLocus.TCR) has 158975 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00587.tsv'), (212963, 132), (164671, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:31:25,777 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 42, 'TRBV7-5*02': 1, 'TRBVA*01': 5}


  df = pd.read_csv(







2022-12-28 19:31:27,365 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 69, 'TRBVA*01': 6}


2022-12-28 19:31:27,570 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 17, 'TRBV7-5*02': 14, 'TRBVA*01': 2}










2022-12-28 19:31:31,808 - assign_clone_ids.ipynb - INFO - Participant P00545 (GeneLocus.TCR) has 245497 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00545.tsv'), (322738, 132), (254996, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:31,949 - assign_clone_ids.ipynb - INFO - Participant P00550 (GeneLocus.TCR) has 200584 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00550.tsv'), (266983, 132), (209056, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:31,967 - assign_clone_ids.ipynb - INFO - Participant P00584 (GeneLocus.TCR) has 191921 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00584.tsv'), (248175, 132), (201178, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:31,997 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 42, 'TRBVA*01': 4}






2022-12-28 19:31:32,276 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBV7-5*02': 18, 'TRBVA*01': 1}




2022-12-28 19:31:33,153 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 85, 'TRBV8-2*01': 1, 'TRBVA*01': 10}


2022-12-28 19:31:33,748 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 6}




2022-12-28 19:31:34,681 - assign_clone_ids.ipynb - INFO - Participant P00559 (GeneLocus.TCR) has 120928 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00559.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:31:36,440 - assign_clone_ids.ipynb - INFO - Participant P00580 (GeneLocus.TCR) has 138796 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00580.parquet.


2022-12-28 19:31:36,451 - assign_clone_ids.ipynb - INFO - Participant P00552 (GeneLocus.TCR) has 209552 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00552.tsv'), (281988, 132), (222353, 144), <GeneLocus.TCR: 2>)]










2022-12-28 19:31:40,466 - assign_clone_ids.ipynb - INFO - Participant P00592 (GeneLocus.TCR) has 103538 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00592.tsv'), (139142, 132), (109047, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:40,534 - assign_clone_ids.ipynb - INFO - Participant P00586 (GeneLocus.TCR) has 158043 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00586.tsv'), (202583, 132), (162203, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:41,065 - assign_clone_ids.ipynb - INFO - Participant P00583 (GeneLocus.TCR) has 154845 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00583.tsv'), (203197, 132), (162751, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:31:41,875 - assign_clone_ids.ipynb - INFO - Participant P00596 (GeneLocus.TCR) has 79959 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00596.tsv'), (104545, 132), (83508, 144), <GeneLocus.TCR: 2>)]








  df = pd.read_csv(





  df = pd.read_csv(



2022-12-28 19:31:52,621 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 3, 'TRBV7-5*01': 11, 'TRBV7-5*02': 19, 'TRBVA*01': 2}










2022-12-28 19:32:06,239 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 13, 'TRBVA*01': 6}






2022-12-28 19:32:15,029 - assign_clone_ids.ipynb - INFO - Participant P00570 (GeneLocus.TCR) has 121515 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00570.parquet.


2022-12-28 19:32:15,561 - assign_clone_ids.ipynb - INFO - Participant P00568 (GeneLocus.TCR) has 191665 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00568.tsv'), (248289, 132), (201235, 144), <GeneLocus.TCR: 2>)]














2022-12-28 19:32:19,018 - assign_clone_ids.ipynb - INFO - Participant P00596 (GeneLocus.TCR) has 79877 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00596.parquet.






2022-12-28 19:32:21,569 - assign_clone_ids.ipynb - INFO - Participant P00555 (GeneLocus.TCR) has 162435 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00555.parquet.


2022-12-28 19:32:21,740 - assign_clone_ids.ipynb - INFO - Participant P00592 (GeneLocus.TCR) has 103407 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00592.parquet.


2022-12-28 19:32:23,060 - assign_clone_ids.ipynb - INFO - Participant P00553 (GeneLocus.TCR) has 182146 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00553.parquet.


2022-12-28 19:32:24,165 - assign_clone_ids.ipynb - INFO - Participant P00587 (GeneLocus.TCR) has 158666 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00587.parquet.


  df = pd.read_csv(



2022-12-28 19:32:27,706 - assign_clone_ids.ipynb - INFO - Participant P00584 (GeneLocus.TCR) has 191686 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00584.parquet.


2022-12-28 19:32:28,063 - assign_clone_ids.ipynb - INFO - Participant P00583 (GeneLocus.TCR) has 154665 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00583.parquet.


2022-12-28 19:32:30,097 - assign_clone_ids.ipynb - INFO - Participant P00545 (GeneLocus.TCR) has 245155 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00545.parquet.


2022-12-28 19:32:30,463 - assign_clone_ids.ipynb - INFO - Participant P00550 (GeneLocus.TCR) has 200364 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00550.parquet.


2022-12-28 19:32:31,162 - assign_clone_ids.ipynb - INFO - Participant P00589 (GeneLocus.TCR) has 179770 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00589.tsv'), (239700, 132), (187468, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:31,793 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15, 'TRBV7-5*02': 27, 'TRBVA*01': 5}


2022-12-28 19:32:33,295 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBV7-5*02': 27, 'TRBVA*01': 5}


2022-12-28 19:32:34,519 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 15, 'TRBV7-5*02': 35, 'TRBVA*01': 5}


2022-12-28 19:32:35,549 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 17, 'TRBV7-5*02': 28, 'TRBV8-2*01': 1, 'TRBVA*01': 2}


2022-12-28 19:32:36,939 - assign_clone_ids.ipynb - INFO - Participant P00594 (GeneLocus.TCR) has 151342 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00594.tsv'), (196056, 132), (157779, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:32:38,393 - assign_clone_ids.ipynb - INFO - Participant P00552 (GeneLocus.TCR) has 209274 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00552.parquet.


2022-12-28 19:32:40,246 - assign_clone_ids.ipynb - INFO - Participant P00601 (GeneLocus.TCR) has 143799 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00601.tsv'), (189089, 132), (149919, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:40,256 - assign_clone_ids.ipynb - INFO - Participant P00578 (GeneLocus.TCR) has 191278 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00578.tsv'), (258239, 132), (198275, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:41,194 - assign_clone_ids.ipynb - INFO - Participant P00563 (GeneLocus.TCR) has 212905 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00563.tsv'), (276034, 132), (225073, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:41,647 - assign_clone_ids.ipynb - INFO - Participant P00586 (GeneLocus.TCR) has 157781 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00586.parquet.


2022-12-28 19:32:41,661 - assign_clone_ids.ipynb - INFO - Participant P00566 (GeneLocus.TCR) has 176334 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00566.tsv'), (232838, 132), (185894, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:41,700 - assign_clone_ids.ipynb - INFO - Participant P00571 (GeneLocus.TCR) has 210866 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00571.tsv'), (269852, 132), (218329, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:32:42,526 - assign_clone_ids.ipynb - INFO - Participant P00591 (GeneLocus.TCR) has 201091 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00591.tsv'), (265988, 132), (209534, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:42,694 - assign_clone_ids.ipynb - INFO - Participant P00585 (GeneLocus.TCR) has 192931 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00585.tsv'), (257163, 132), (200623, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:44,337 - assign_clone_ids.ipynb - INFO - Participant P00582 (GeneLocus.TCR) has 205709 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00582.tsv'), (271885, 132), (213282, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:32:46,765 - assign_clone_ids.ipynb - INFO - Participant P00569 (GeneLocus.TCR) has 267997 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00569.tsv'), (353571, 132), (284216, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:46,898 - assign_clone_ids.ipynb - INFO - Participant P00558 (GeneLocus.TCR) has 245661 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00558.tsv'), (310163, 132), (256965, 144), <GeneLocus.TCR: 2>)]












2022-12-28 19:32:51,321 - assign_clone_ids.ipynb - INFO - Participant P00568 (GeneLocus.TCR) has 191196 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00568.parquet.


  df = pd.read_csv(





2022-12-28 19:32:53,140 - assign_clone_ids.ipynb - INFO - Participant P00589 (GeneLocus.TCR) has 179445 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00589.parquet.


  df = pd.read_csv(



2022-12-28 19:32:54,201 - assign_clone_ids.ipynb - INFO - Participant P00581 (GeneLocus.TCR) has 271882 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00581.tsv'), (360010, 132), (283179, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:56,159 - assign_clone_ids.ipynb - INFO - Participant P00593 (GeneLocus.TCR) has 231581 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00593.tsv'), (312600, 132), (246311, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:32:56,358 - assign_clone_ids.ipynb - INFO - Participant P00594 (GeneLocus.TCR) has 151178 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00594.parquet.


2022-12-28 19:32:57,197 - assign_clone_ids.ipynb - INFO - Participant P00579 (GeneLocus.TCR) has 267883 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00579.tsv'), (365834, 132), (282361, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:32:58,217 - assign_clone_ids.ipynb - INFO - Participant P00601 (GeneLocus.TCR) has 143626 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00601.parquet.


2022-12-28 19:32:59,845 - assign_clone_ids.ipynb - INFO - Participant P00554 (GeneLocus.TCR) has 303131 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00554.tsv'), (401390, 132), (317574, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:33:02,679 - assign_clone_ids.ipynb - INFO - Participant P00556 (GeneLocus.TCR) has 259650 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00556.tsv'), (337241, 132), (270271, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:33:03,520 - assign_clone_ids.ipynb - INFO - Participant P00574 (GeneLocus.TCR) has 300177 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00574.tsv'), (390669, 132), (316131, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:33:05,035 - assign_clone_ids.ipynb - INFO - Participant P00577 (GeneLocus.TCR) has 281308 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00577.tsv'), (401933, 132), (297626, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:33:15,249 - assign_clone_ids.ipynb - INFO - Participant P00562 (GeneLocus.TCR) has 285563 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00562.tsv'), (393035, 132), (300762, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:33:25,562 - assign_clone_ids.ipynb - INFO - Participant P00593 (GeneLocus.TCR) has 231160 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00593.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:34:01,976 - assign_clone_ids.ipynb - INFO - Participant P00566 (GeneLocus.TCR) has 176132 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00566.parquet.


2022-12-28 19:34:03,078 - assign_clone_ids.ipynb - INFO - Participant P00571 (GeneLocus.TCR) has 210626 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00571.parquet.


2022-12-28 19:34:06,238 - assign_clone_ids.ipynb - INFO - Participant P00591 (GeneLocus.TCR) has 200818 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00591.parquet.


2022-12-28 19:34:06,827 - assign_clone_ids.ipynb - INFO - Participant P00578 (GeneLocus.TCR) has 191042 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00578.parquet.


2022-12-28 19:34:06,957 - assign_clone_ids.ipynb - INFO - Participant P00563 (GeneLocus.TCR) has 212578 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00563.parquet.


2022-12-28 19:34:07,396 - assign_clone_ids.ipynb - INFO - Participant P00597 (GeneLocus.TCR) has 240705 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00597.tsv'), (312009, 132), (252127, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:34:07,734 - assign_clone_ids.ipynb - INFO - Participant P00582 (GeneLocus.TCR) has 205376 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00582.parquet.


2022-12-28 19:34:09,153 - assign_clone_ids.ipynb - INFO - Participant P00585 (GeneLocus.TCR) has 192514 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00585.parquet.


2022-12-28 19:34:12,763 - assign_clone_ids.ipynb - INFO - Participant P00558 (GeneLocus.TCR) has 245337 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00558.parquet.


2022-12-28 19:34:13,565 - assign_clone_ids.ipynb - INFO - Participant P00569 (GeneLocus.TCR) has 267686 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00569.parquet.


2022-12-28 19:34:14,302 - assign_clone_ids.ipynb - INFO - Participant P00556 (GeneLocus.TCR) has 259337 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00556.parquet.


2022-12-28 19:34:14,468 - assign_clone_ids.ipynb - INFO - Participant P00579 (GeneLocus.TCR) has 267468 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00579.parquet.


2022-12-28 19:34:14,849 - assign_clone_ids.ipynb - INFO - Participant P00581 (GeneLocus.TCR) has 271483 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00581.parquet.


2022-12-28 19:34:15,284 - assign_clone_ids.ipynb - INFO - Participant P00554 (GeneLocus.TCR) has 302710 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00554.parquet.


2022-12-28 19:34:15,944 - assign_clone_ids.ipynb - INFO - Participant P00574 (GeneLocus.TCR) has 299809 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00574.parquet.


  df = pd.read_csv(



2022-12-28 19:34:17,647 - assign_clone_ids.ipynb - INFO - Participant P00577 (GeneLocus.TCR) has 280940 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00577.parquet.


2022-12-28 19:34:20,331 - assign_clone_ids.ipynb - INFO - Participant P00595 (GeneLocus.TCR) has 228809 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00595.tsv'), (300013, 132), (237714, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:34:21,928 - assign_clone_ids.ipynb - INFO - Participant P00564 (GeneLocus.TCR) has 304798 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00564.tsv'), (397950, 132), (318177, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:34:22,094 - assign_clone_ids.ipynb - INFO - Participant P00599 (GeneLocus.TCR) has 243596 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00599.tsv'), (336428, 132), (257463, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:34:26,183 - assign_clone_ids.ipynb - INFO - Participant P00598 (GeneLocus.TCR) has 261792 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00598.tsv'), (358736, 132), (282341, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:34:33,662 - assign_clone_ids.ipynb - INFO - Participant P00597 (GeneLocus.TCR) has 240290 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00597.parquet.


2022-12-28 19:34:35,485 - assign_clone_ids.ipynb - INFO - Participant P00562 (GeneLocus.TCR) has 285172 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00562.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:34:40,089 - assign_clone_ids.ipynb - INFO - Participant P00532 (GeneLocus.TCR) has 419186 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00532.tsv'), (582831, 132), (450952, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:34:44,168 - assign_clone_ids.ipynb - INFO - Participant P00595 (GeneLocus.TCR) has 228358 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00595.parquet.


2022-12-28 19:34:49,441 - assign_clone_ids.ipynb - INFO - Participant P00599 (GeneLocus.TCR) has 243257 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00599.parquet.


2022-12-28 19:34:55,424 - assign_clone_ids.ipynb - INFO - Participant P00598 (GeneLocus.TCR) has 261468 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00598.parquet.


2022-12-28 19:34:55,141 - assign_clone_ids.ipynb - INFO - Participant P00564 (GeneLocus.TCR) has 304285 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00564.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:35:23,978 - assign_clone_ids.ipynb - INFO - Participant P00532 (GeneLocus.TCR) has 418397 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00532.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:36:23,405 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 4, 'TRBVA*01': 4}


2022-12-28 19:36:25,874 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 15}




2022-12-28 19:36:29,117 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 21, 'TRBV7-5*02': 24, 'TRBVA*01': 6}


2022-12-28 19:36:30,927 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 7, 'TRBVA*01': 1}






2022-12-28 19:36:31,905 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBVA*01': 5}






2022-12-28 19:36:36,916 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 15}


2022-12-28 19:36:36,930 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBV7-5*02': 18, 'TRBVA*01': 1}


2022-12-28 19:36:38,248 - assign_clone_ids.ipynb - INFO - Participant P00635 (GeneLocus.TCR) has 87676 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00635.tsv'), (112512, 132), (91359, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:36:38,593 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 13}






2022-12-28 19:36:40,571 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 13, 'TRBVA*01': 3}


2022-12-28 19:36:40,999 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 45, 'TRBVA*01': 5}


2022-12-28 19:36:41,755 - assign_clone_ids.ipynb - INFO - Participant P00603 (GeneLocus.TCR) has 61544 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00603.tsv'), (77951, 132), (62940, 144), <GeneLocus.TCR: 2>)]








2022-12-28 19:36:44,406 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 26, 'TRBVA*01': 3}


2022-12-28 19:36:44,988 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 9, 'TRBV7-5*02': 16, 'TRBVA*01': 2}




2022-12-28 19:36:45,401 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 4}






2022-12-28 19:36:47,209 - assign_clone_ids.ipynb - INFO - Participant P00632 (GeneLocus.TCR) has 110745 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00632.tsv'), (156691, 132), (120810, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:36:49,025 - assign_clone_ids.ipynb - INFO - Participant P00603 (GeneLocus.TCR) has 61477 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00603.parquet.


2022-12-28 19:36:49,722 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 11, 'TRBVA*01': 1}


2022-12-28 19:36:49,777 - assign_clone_ids.ipynb - INFO - Participant P00635 (GeneLocus.TCR) has 87546 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00635.parquet.


2022-12-28 19:36:51,489 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 35, 'TRBVA*01': 4}


2022-12-28 19:36:51,665 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14, 'TRBV7-5*02': 14, 'TRBVA*01': 4}


2022-12-28 19:36:52,093 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBV7-5*02': 24}


2022-12-28 19:36:52,567 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 18, 'TRBVA*01': 1}






2022-12-28 19:36:53,290 - assign_clone_ids.ipynb - INFO - Participant P00637 (GeneLocus.TCR) has 127081 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00637.tsv'), (176036, 132), (135337, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:36:54,980 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 14}


2022-12-28 19:36:56,178 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBVA*01': 1}


2022-12-28 19:36:56,660 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 41, 'TRBVA*01': 8}








2022-12-28 19:36:59,184 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 18, 'TRBVA*01': 2}


2022-12-28 19:36:59,869 - assign_clone_ids.ipynb - INFO - Participant P00612 (GeneLocus.TCR) has 124224 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00612.tsv'), (174378, 132), (128498, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:37:00,158 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBV8-1*01': 1, 'TRBVA*01': 3}


2022-12-28 19:37:00,971 - assign_clone_ids.ipynb - INFO - Participant P00625 (GeneLocus.TCR) has 97848 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00625.tsv'), (129501, 132), (101482, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:01,239 - assign_clone_ids.ipynb - INFO - Participant P00632 (GeneLocus.TCR) has 110543 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00632.parquet.


  df = pd.read_csv(



2022-12-28 19:37:02,301 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 49, 'TRBVA*01': 5}






2022-12-28 19:37:02,772 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 35, 'TRBV8-1*01': 1, 'TRBVA*01': 1}


2022-12-28 19:37:02,922 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV10-1*03': 1, 'TRBV7-5*01': 25, 'TRBV7-5*02': 44, 'TRBVA*01': 5}


2022-12-28 19:37:03,287 - assign_clone_ids.ipynb - INFO - Participant P00626 (GeneLocus.TCR) has 86840 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00626.tsv'), (113372, 132), (89401, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:03,593 - assign_clone_ids.ipynb - INFO - Participant P00629 (GeneLocus.TCR) has 118231 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00629.tsv'), (163054, 132), (132084, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:03,920 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 31, 'TRBVA*01': 2}


2022-12-28 19:37:04,153 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8, 'TRBV7-5*02': 9, 'TRBVA*01': 1}












2022-12-28 19:37:06,347 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 49, 'TRBVA*01': 7}








2022-12-28 19:37:08,794 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 13, 'TRBV7-5*02': 21, 'TRBVA*01': 9}


  df = pd.read_csv(



2022-12-28 19:37:09,546 - assign_clone_ids.ipynb - INFO - Participant P00624 (GeneLocus.TCR) has 146474 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00624.tsv'), (193393, 132), (151296, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:09,552 - assign_clone_ids.ipynb - INFO - Participant P00637 (GeneLocus.TCR) has 126928 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00637.parquet.








2022-12-28 19:37:14,586 - assign_clone_ids.ipynb - INFO - Participant P00600 (GeneLocus.TCR) has 146224 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00600.tsv'), (187423, 132), (151450, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:37:15,318 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 26, 'TRBVA*01': 3}


2022-12-28 19:37:15,743 - assign_clone_ids.ipynb - INFO - Participant P00626 (GeneLocus.TCR) has 86745 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00626.parquet.


2022-12-28 19:37:16,797 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV25/OR9-2*01': 1, 'TRBV7-5*01': 13, 'TRBVA*01': 1}






  df = pd.read_csv(



2022-12-28 19:37:19,029 - assign_clone_ids.ipynb - INFO - Participant P00640 (GeneLocus.TCR) has 66941 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00640.tsv'), (89701, 132), (69236, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:19,844 - assign_clone_ids.ipynb - INFO - Participant P00625 (GeneLocus.TCR) has 97706 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00625.parquet.


2022-12-28 19:37:20,649 - assign_clone_ids.ipynb - INFO - Participant P00612 (GeneLocus.TCR) has 124065 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00612.parquet.


2022-12-28 19:37:20,933 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 28, 'TRBVA*01': 3}


2022-12-28 19:37:21,196 - assign_clone_ids.ipynb - INFO - Participant P00631 (GeneLocus.TCR) has 145663 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00631.tsv'), (182476, 132), (149400, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:37:21,503 - assign_clone_ids.ipynb - INFO - Participant P00629 (GeneLocus.TCR) has 118068 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00629.parquet.






  df = pd.read_csv(













2022-12-28 19:37:26,024 - assign_clone_ids.ipynb - INFO - Participant P00608 (GeneLocus.TCR) has 137871 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00608.tsv'), (179971, 132), (141503, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:26,181 - assign_clone_ids.ipynb - INFO - Participant P00609 (GeneLocus.TCR) has 133576 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00609.tsv'), (179332, 132), (139662, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:37:27,161 - assign_clone_ids.ipynb - INFO - Participant P00633 (GeneLocus.TCR) has 211813 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00633.tsv'), (284749, 132), (221597, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:37:28,602 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 10, 'TRBV7-5*02': 10}


2022-12-28 19:37:29,374 - assign_clone_ids.ipynb - INFO - Participant P00605 (GeneLocus.TCR) has 185190 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00605.tsv'), (236768, 132), (191962, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:29,702 - assign_clone_ids.ipynb - INFO - Participant P00640 (GeneLocus.TCR) has 66864 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00640.parquet.


2022-12-28 19:37:30,113 - assign_clone_ids.ipynb - INFO - Participant P00614 (GeneLocus.TCR) has 196442 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00614.tsv'), (260622, 132), (204073, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:30,573 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 94, 'TRBVA*01': 4}


2022-12-28 19:37:30,609 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 3}


2022-12-28 19:37:31,247 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 12, 'TRBV7-5*02': 21, 'TRBVA*01': 1}


2022-12-28 19:37:31,917 - assign_clone_ids.ipynb - INFO - Participant P00623 (GeneLocus.TCR) has 193304 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00623.tsv'), (261628, 132), (201265, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:37:33,949 - assign_clone_ids.ipynb - INFO - Participant P00624 (GeneLocus.TCR) has 146235 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00624.parquet.


2022-12-28 19:37:36,152 - assign_clone_ids.ipynb - INFO - Participant P00618 (GeneLocus.TCR) has 200540 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00618.tsv'), (256226, 132), (210460, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:36,377 - assign_clone_ids.ipynb - INFO - Participant P00610 (GeneLocus.TCR) has 168984 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00610.tsv'), (221948, 132), (175311, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:36,448 - assign_clone_ids.ipynb - INFO - Participant P00622 (GeneLocus.TCR) has 162254 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00622.tsv'), (205358, 132), (167174, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:37,314 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6}


2022-12-28 19:37:39,490 - assign_clone_ids.ipynb - INFO - Participant P00600 (GeneLocus.TCR) has 145988 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00600.parquet.










2022-12-28 19:37:41,569 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 19, 'TRBV7-5*02': 29, 'TRBVA*01': 4}




2022-12-28 19:37:41,804 - assign_clone_ids.ipynb - INFO - Participant P00604 (GeneLocus.TCR) has 119919 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00604.tsv'), (155805, 132), (123732, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:42,774 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 40, 'TRBVA*01': 4}


2022-12-28 19:37:42,944 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 21, 'TRBV7-5*02': 32, 'TRBVA*01': 3}


  df = pd.read_csv(



2022-12-28 19:37:43,860 - assign_clone_ids.ipynb - INFO - Participant P00628 (GeneLocus.TCR) has 230620 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00628.tsv'), (310194, 132), (242486, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:43,966 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3, 'TRBV7-5*02': 36, 'TRBVA*01': 5}


2022-12-28 19:37:44,594 - assign_clone_ids.ipynb - INFO - Participant P00608 (GeneLocus.TCR) has 137678 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00608.parquet.


2022-12-28 19:37:45,522 - assign_clone_ids.ipynb - INFO - Participant P00619 (GeneLocus.TCR) has 110809 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00619.tsv'), (147887, 132), (115150, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:37:46,456 - assign_clone_ids.ipynb - INFO - Participant P00631 (GeneLocus.TCR) has 145300 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00631.parquet.




2022-12-28 19:37:49,013 - assign_clone_ids.ipynb - INFO - Participant P00609 (GeneLocus.TCR) has 133386 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00609.parquet.


2022-12-28 19:37:49,750 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 65, 'TRBVA*01': 7}


2022-12-28 19:37:50,265 - assign_clone_ids.ipynb - INFO - Participant P00636 (GeneLocus.TCR) has 183268 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00636.tsv'), (244484, 132), (194144, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:51,460 - assign_clone_ids.ipynb - INFO - Participant P00645 (GeneLocus.TCR) has 48134 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00645.tsv'), (70541, 132), (54077, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:37:54,921 - assign_clone_ids.ipynb - INFO - Participant P00605 (GeneLocus.TCR) has 184906 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00605.parquet.


  df = pd.read_csv(



2022-12-28 19:37:56,790 - assign_clone_ids.ipynb - INFO - Participant P00606 (GeneLocus.TCR) has 254060 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00606.tsv'), (333369, 132), (264416, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:56,924 - assign_clone_ids.ipynb - INFO - Participant P00604 (GeneLocus.TCR) has 119771 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00604.parquet.


2022-12-28 19:37:57,090 - assign_clone_ids.ipynb - INFO - Participant P00623 (GeneLocus.TCR) has 192987 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00623.parquet.


2022-12-28 19:37:57,791 - assign_clone_ids.ipynb - INFO - Participant P00645 (GeneLocus.TCR) has 48037 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00645.parquet.








2022-12-28 19:37:59,232 - assign_clone_ids.ipynb - INFO - Participant P00620 (GeneLocus.TCR) has 142873 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00620.tsv'), (191585, 132), (150666, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:37:59,429 - assign_clone_ids.ipynb - INFO - Participant P00633 (GeneLocus.TCR) has 211523 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00633.parquet.


2022-12-28 19:37:59,536 - assign_clone_ids.ipynb - INFO - Participant P00611 (GeneLocus.TCR) has 239636 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00611.tsv'), (308680, 132), (248946, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:37:59,900 - assign_clone_ids.ipynb - INFO - Participant P00610 (GeneLocus.TCR) has 168706 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00610.parquet.




2022-12-28 19:38:01,393 - assign_clone_ids.ipynb - INFO - Participant P00602 (GeneLocus.TCR) has 247045 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00602.tsv'), (325476, 132), (261216, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:02,908 - assign_clone_ids.ipynb - INFO - Participant P00622 (GeneLocus.TCR) has 161972 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00622.parquet.


2022-12-28 19:38:02,908 - assign_clone_ids.ipynb - INFO - Participant P00614 (GeneLocus.TCR) has 196181 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00614.parquet.


2022-12-28 19:38:02,962 - assign_clone_ids.ipynb - INFO - Participant P00621 (GeneLocus.TCR) has 251335 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00621.tsv'), (341019, 132), (264984, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:38:03,177 - assign_clone_ids.ipynb - INFO - Participant P00641 (GeneLocus.TCR) has 121006 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00641.tsv'), (167243, 132), (129217, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:38:03,485 - assign_clone_ids.ipynb - INFO - Participant P00619 (GeneLocus.TCR) has 110654 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00619.parquet.




2022-12-28 19:38:07,640 - assign_clone_ids.ipynb - INFO - Participant P00613 (GeneLocus.TCR) has 181752 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00613.tsv'), (234917, 132), (188974, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:38:10,097 - assign_clone_ids.ipynb - INFO - Participant P00638 (GeneLocus.TCR) has 252803 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00638.tsv'), (342160, 132), (265104, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:10,924 - assign_clone_ids.ipynb - INFO - Participant P00618 (GeneLocus.TCR) has 200256 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00618.parquet.


2022-12-28 19:38:11,959 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 23, 'TRBV7-5*02': 32, 'TRBV8-2*01': 1, 'TRBVA*01': 7}


2022-12-28 19:38:12,816 - assign_clone_ids.ipynb - INFO - Participant P00628 (GeneLocus.TCR) has 230259 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00628.parquet.


2022-12-28 19:38:15,159 - assign_clone_ids.ipynb - INFO - Participant P00615 (GeneLocus.TCR) has 220992 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00615.tsv'), (278523, 132), (229353, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:17,517 - assign_clone_ids.ipynb - INFO - Participant P00607 (GeneLocus.TCR) has 366121 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00607.tsv'), (499501, 132), (390801, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:17,558 - assign_clone_ids.ipynb - INFO - Participant P00620 (GeneLocus.TCR) has 142676 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00620.parquet.


2022-12-28 19:38:17,943 - assign_clone_ids.ipynb - INFO - Participant P00616 (GeneLocus.TCR) has 206083 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00616.tsv'), (299343, 132), (217950, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:18,164 - assign_clone_ids.ipynb - INFO - Participant P00636 (GeneLocus.TCR) has 183028 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00636.parquet.


2022-12-28 19:38:18,803 - assign_clone_ids.ipynb - INFO - Participant P00641 (GeneLocus.TCR) has 120849 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00641.parquet.


2022-12-28 19:38:19,765 - assign_clone_ids.ipynb - INFO - Participant P00627 (GeneLocus.TCR) has 260452 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00627.tsv'), (351135, 132), (277639, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:24,309 - assign_clone_ids.ipynb - INFO - Participant P00642 (GeneLocus.TCR) has 172365 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00642.tsv'), (231925, 132), (179819, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:38:27,441 - assign_clone_ids.ipynb - INFO - Participant P00606 (GeneLocus.TCR) has 253716 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00606.parquet.


  df = pd.read_csv(



2022-12-28 19:38:28,891 - assign_clone_ids.ipynb - INFO - Participant P00613 (GeneLocus.TCR) has 181408 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00613.parquet.


2022-12-28 19:38:28,891 - assign_clone_ids.ipynb - INFO - Participant P00611 (GeneLocus.TCR) has 239206 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00611.parquet.


2022-12-28 19:38:29,730 - assign_clone_ids.ipynb - INFO - Participant P00617 (GeneLocus.TCR) has 250044 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00617.tsv'), (341819, 132), (261541, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:38:31,855 - assign_clone_ids.ipynb - INFO - Participant P00639 (GeneLocus.TCR) has 241779 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00639.tsv'), (340553, 132), (255335, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



2022-12-28 19:38:36,906 - assign_clone_ids.ipynb - INFO - Participant P00630 (GeneLocus.TCR) has 256799 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00630.tsv'), (349746, 132), (272053, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:38:40,595 - assign_clone_ids.ipynb - INFO - Participant P00638 (GeneLocus.TCR) has 252452 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00638.parquet.


2022-12-28 19:38:40,595 - assign_clone_ids.ipynb - INFO - Participant P00602 (GeneLocus.TCR) has 246700 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00602.parquet.


2022-12-28 19:38:40,595 - assign_clone_ids.ipynb - INFO - Participant P00615 (GeneLocus.TCR) has 220660 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00615.parquet.


2022-12-28 19:38:40,602 - assign_clone_ids.ipynb - INFO - Participant P00621 (GeneLocus.TCR) has 251004 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00621.parquet.


  df = pd.read_csv(



2022-12-28 19:38:41,455 - assign_clone_ids.ipynb - INFO - Participant P00616 (GeneLocus.TCR) has 205801 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00616.parquet.


  df = pd.read_csv(



2022-12-28 19:38:49,800 - assign_clone_ids.ipynb - INFO - Participant P00643 (GeneLocus.TCR) has 198873 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00643.tsv'), (279361, 132), (209449, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:38:53,883 - assign_clone_ids.ipynb - INFO - Participant P00634 (GeneLocus.TCR) has 326448 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00634.tsv'), (460884, 132), (345436, 144), <GeneLocus.TCR: 2>)]


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:39:08,945 - assign_clone_ids.ipynb - INFO - Participant P00630 (GeneLocus.TCR) has 256403 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00630.parquet.


2022-12-28 19:39:08,944 - assign_clone_ids.ipynb - INFO - Participant P00617 (GeneLocus.TCR) has 249729 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00617.parquet.


2022-12-28 19:39:08,945 - assign_clone_ids.ipynb - INFO - Participant P00607 (GeneLocus.TCR) has 365656 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00607.parquet.


2022-12-28 19:39:08,946 - assign_clone_ids.ipynb - INFO - Participant P00627 (GeneLocus.TCR) has 260179 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00627.parquet.


2022-12-28 19:39:08,951 - assign_clone_ids.ipynb - INFO - Participant P00642 (GeneLocus.TCR) has 172135 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00642.parquet.


2022-12-28 19:39:08,951 - assign_clone_ids.ipynb - INFO - Participant P00639 (GeneLocus.TCR) has 241490 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00639.parquet.


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:39:12,840 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 21811}


  df = pd.read_csv(



  df = pd.read_csv(



  df = pd.read_csv(



2022-12-28 19:39:24,034 - assign_clone_ids.ipynb - INFO - Participant P00643 (GeneLocus.TCR) has 198536 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00643.parquet.


2022-12-28 19:39:28,799 - assign_clone_ids.ipynb - INFO - Participant P00634 (GeneLocus.TCR) has 325794 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00634.parquet.


  df = pd.read_csv(



2022-12-28 19:39:42,745 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 4883}






2022-12-28 19:39:54,731 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 16, 'TRBVA*01': 1}




2022-12-28 19:39:59,290 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 10, 'TRBVA*01': 1}


2022-12-28 19:40:01,225 - assign_clone_ids.ipynb - INFO - Participant p15 (GeneLocus.TCR) has 182741 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A2-i138.txt.gz'), (205999, 16), (192666, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:40:03,123 - assign_clone_ids.ipynb - INFO - Participant P00665 (GeneLocus.TCR) has 49070 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00665.tsv'), (67181, 132), (50405, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:40:07,905 - assign_clone_ids.ipynb - INFO - Participant P00665 (GeneLocus.TCR) has 49007 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00665.parquet.


2022-12-28 19:40:08,987 - assign_clone_ids.ipynb - INFO - Participant P00666 (GeneLocus.TCR) has 53658 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00666.tsv'), (76835, 132), (62239, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:40:09,292 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 8}


2022-12-28 19:40:12,980 - assign_clone_ids.ipynb - INFO - Participant p15 (GeneLocus.TCR) has 182524 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p15.parquet.




2022-12-28 19:40:14,471 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 20, 'TRBVA*01': 2}




2022-12-28 19:40:14,952 - assign_clone_ids.ipynb - INFO - Participant P00666 (GeneLocus.TCR) has 53608 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00666.parquet.




2022-12-28 19:40:23,248 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 4}


2022-12-28 19:40:23,884 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 3, 'TRBV7-5*02': 2}


2022-12-28 19:40:26,431 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 12787}


2022-12-28 19:40:26,466 - assign_clone_ids.ipynb - INFO - Participant P00658 (GeneLocus.TCR) has 108333 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00658.tsv'), (134873, 132), (112478, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:40:29,614 - assign_clone_ids.ipynb - INFO - Participant P00667 (GeneLocus.TCR) has 92645 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00667.tsv'), (127217, 132), (96615, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:40:34,065 - assign_clone_ids.ipynb - INFO - Participant p1531 (GeneLocus.TCR) has 57818 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f70c136a6d924982947d.tsv'), (103409, 169), (58378, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:40:37,828 - assign_clone_ids.ipynb - INFO - Participant p1437 (GeneLocus.TCR) has 72519 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.5f07aa8839579433171763b4.tsv'), (127993, 169), (73304, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:40:38,302 - assign_clone_ids.ipynb - INFO - Participant P00658 (GeneLocus.TCR) has 108206 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00658.parquet.


2022-12-28 19:40:40,445 - assign_clone_ids.ipynb - INFO - Participant P00667 (GeneLocus.TCR) has 92550 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00667.parquet.


2022-12-28 19:40:40,993 - assign_clone_ids.ipynb - INFO - Participant p1531 (GeneLocus.TCR) has 57741 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1531.parquet.


2022-12-28 19:40:42,988 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 19, 'TRBV7-5*02': 12, 'TRBVA*01': 3, 'TRBVA/OR9-2*01': 1}




2022-12-28 19:40:46,545 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 19, 'TRBVA*01': 4}


2022-12-28 19:40:46,774 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 4}


2022-12-28 19:40:48,235 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1}


2022-12-28 19:40:50,045 - assign_clone_ids.ipynb - INFO - Participant p1437 (GeneLocus.TCR) has 72426 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1437.parquet.


2022-12-28 19:40:50,680 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 5, 'TRBVA*01': 1, 'TRBVC*01': 1}


2022-12-28 19:40:50,857 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 33, 'TRBV8-2*01': 1, 'TRBVA*01': 1}


2022-12-28 19:40:51,300 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 8, 'TRBV7-5*02': 16, 'TRBVA*01': 2}


2022-12-28 19:40:51,672 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*02': 41, 'TRBVA*01': 4}


2022-12-28 19:40:52,397 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 13981}


2022-12-28 19:40:53,553 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 10141}






2022-12-28 19:40:54,223 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 30, 'TRBVA*01': 4}


2022-12-28 19:40:54,749 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 9339}


2022-12-28 19:40:56,870 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 6, 'TRBVA*01': 1}


2022-12-28 19:40:57,299 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 26, 'TRBV8-1*01': 3, 'TRBVA*01': 3}


2022-12-28 19:40:58,990 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 30, 'TRBVA*01': 5}


2022-12-28 19:40:59,164 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 33, 'TRBV8-2*01': 1}


2022-12-28 19:40:59,411 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 3, 'TRBV7-5*02': 3}




2022-12-28 19:40:59,639 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2, 'TRBV7-5*02': 2}




2022-12-28 19:40:59,921 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 36, 'TRBVA*01': 3}




2022-12-28 19:41:02,488 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 22, 'TRBV7-5*02': 1, 'TRBVA*01': 5}












2022-12-28 19:41:05,026 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 29, 'TRBVA*01': 3}


2022-12-28 19:41:06,034 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1, 'TRBV7-5*02': 57, 'TRBV8-1*01': 1, 'TRBVA*01': 6}


2022-12-28 19:41:06,376 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 37, 'TRBVA*01': 5}






2022-12-28 19:41:07,282 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 10819}




2022-12-28 19:41:08,607 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 31, 'TRBVA*01': 8}


2022-12-28 19:41:09,087 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 23, 'TRBV7-5*02': 40, 'TRBVA*01': 1}








2022-12-28 19:41:10,929 - assign_clone_ids.ipynb - INFO - Participant p1448 (GeneLocus.TCR) has 59890 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f703136a6d924982945f.tsv'), (202149, 169), (63910, 180), <GeneLocus.TCR: 2>)]








2022-12-28 19:41:12,597 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 2, 'TRBV7-5*02': 45, 'TRBVA*01': 3}


2022-12-28 19:41:12,856 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 22}










2022-12-28 19:41:16,074 - assign_clone_ids.ipynb - INFO - Participant p1482 (GeneLocus.TCR) has 78861 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f708136a6d9249829471.tsv'), (189699, 169), (80736, 180), <GeneLocus.TCR: 2>)]










2022-12-28 19:41:18,834 - assign_clone_ids.ipynb - INFO - Participant p1434 (GeneLocus.TCR) has 65638 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f702136a6d924982945c.tsv'), (218241, 169), (70788, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:19,068 - assign_clone_ids.ipynb - INFO - Participant p1448 (GeneLocus.TCR) has 59813 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1448.parquet.




2022-12-28 19:41:21,091 - assign_clone_ids.ipynb - INFO - Participant P00655 (GeneLocus.TCR) has 176950 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00655.tsv'), (239156, 132), (185289, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:22,065 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 2414}


2022-12-28 19:41:22,682 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 3}


2022-12-28 19:41:22,784 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV24/OR9-2*01': 2, 'TRBV7-5*01': 4}


2022-12-28 19:41:23,719 - assign_clone_ids.ipynb - INFO - Participant p16 (GeneLocus.TCR) has 419513 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A2-i139.txt.gz'), (492793, 16), (457936, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:41:24,240 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV22-1*01': 1, 'TRBV7-5*01': 15}










2022-12-28 19:41:27,251 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5, 'TRBVC*01': 1}


2022-12-28 19:41:27,254 - assign_clone_ids.ipynb - INFO - Participant p1434 (GeneLocus.TCR) has 65527 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1434.parquet.




2022-12-28 19:41:27,529 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 2}






2022-12-28 19:41:28,252 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 1, 'TRBV7-5*01': 3, 'TRBVA*01': 1, 'TRBVC*01': 1}




2022-12-28 19:41:29,200 - assign_clone_ids.ipynb - INFO - Participant P00659 (GeneLocus.TCR) has 171894 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00659.tsv'), (232226, 132), (179488, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:29,393 - assign_clone_ids.ipynb - INFO - Participant p1449 (GeneLocus.TCR) has 75056 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f704136a6d9249829462.tsv'), (213983, 169), (81826, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:29,604 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV24/OR9-2*01': 2, 'TRBV25/OR9-2*01': 1, 'TRBV7-5*02': 13, 'TRBVA*01': 2, 'TRBVC*01': 1}


2022-12-28 19:41:31,086 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 2763}


2022-12-28 19:41:31,413 - assign_clone_ids.ipynb - INFO - Participant p1481 (GeneLocus.TCR) has 92528 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f707136a6d924982946e.tsv'), (209069, 169), (94415, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:31,923 - assign_clone_ids.ipynb - INFO - Participant p1482 (GeneLocus.TCR) has 78720 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1482.parquet.


2022-12-28 19:41:32,036 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 5}


2022-12-28 19:41:32,043 - assign_clone_ids.ipynb - INFO - Participant P00657 (GeneLocus.TCR) has 187450 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00657.tsv'), (251406, 132), (195225, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:34,153 - assign_clone_ids.ipynb - INFO - Participant p20 (GeneLocus.TCR) has 66512 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i102.txt.gz'), (72079, 16), (67695, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:41:36,580 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 2663}














2022-12-28 19:41:39,462 - assign_clone_ids.ipynb - INFO - Participant P00663 (GeneLocus.TCR) has 185612 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00663.tsv'), (233506, 132), (191372, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:39,964 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV24/OR9-2*01': 1, 'TRBV5-2*01': 1, 'TRBV7-5*01': 4, 'TRBV7-5*02': 4}








2022-12-28 19:41:42,745 - assign_clone_ids.ipynb - INFO - Participant p20 (GeneLocus.TCR) has 66425 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p20.parquet.


2022-12-28 19:41:43,017 - assign_clone_ids.ipynb - INFO - Participant p1449 (GeneLocus.TCR) has 74862 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1449.parquet.




2022-12-28 19:41:44,510 - assign_clone_ids.ipynb - INFO - Participant p21 (GeneLocus.TCR) has 93364 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i106.txt.gz'), (101460, 16), (95105, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:45,036 - assign_clone_ids.ipynb - INFO - Participant p17 (GeneLocus.TCR) has 346536 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A2-i140.txt.gz'), (401679, 16), (372029, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:45,915 - assign_clone_ids.ipynb - INFO - Participant P00656 (GeneLocus.TCR) has 184271 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00656.tsv'), (255756, 132), (197630, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:46,865 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 1850}


2022-12-28 19:41:47,186 - assign_clone_ids.ipynb - INFO - Participant P00651 (GeneLocus.TCR) has 207794 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00651.tsv'), (266072, 132), (218304, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:41:48,423 - assign_clone_ids.ipynb - INFO - Participant p1481 (GeneLocus.TCR) has 92404 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1481.parquet.




2022-12-28 19:41:49,623 - assign_clone_ids.ipynb - INFO - Participant P00655 (GeneLocus.TCR) has 176732 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00655.parquet.


2022-12-28 19:41:50,262 - assign_clone_ids.ipynb - INFO - Participant P00648 (GeneLocus.TCR) has 217800 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00648.tsv'), (297047, 132), (227616, 144), <GeneLocus.TCR: 2>)]






2022-12-28 19:41:51,259 - assign_clone_ids.ipynb - INFO - Participant p22 (GeneLocus.TCR) has 82580 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i107.txt.gz'), (89308, 16), (84016, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:41:51,458 - assign_clone_ids.ipynb - INFO - Participant p19 (GeneLocus.TCR) has 281940 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i101.txt.gz'), (318566, 16), (297447, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:51,639 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 1995}




2022-12-28 19:41:52,335 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 14129}


2022-12-28 19:41:53,526 - assign_clone_ids.ipynb - INFO - Participant p1 (GeneLocus.TCR) has 985025 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A3-i101.txt.gz'), (1150027, 16), (1095387, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:53,758 - assign_clone_ids.ipynb - INFO - Participant P00653 (GeneLocus.TCR) has 212081 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00653.tsv'), (280756, 132), (220751, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:54,935 - assign_clone_ids.ipynb - INFO - Participant P00660 (GeneLocus.TCR) has 206387 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00660.tsv'), (278255, 132), (218180, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:41:56,697 - assign_clone_ids.ipynb - INFO - Participant p21 (GeneLocus.TCR) has 93232 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p21.parquet.


2022-12-28 19:41:56,739 - assign_clone_ids.ipynb - INFO - Participant P00649 (GeneLocus.TCR) has 217647 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00649.tsv'), (290498, 132), (227060, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:57,077 - assign_clone_ids.ipynb - INFO - Participant P00659 (GeneLocus.TCR) has 171645 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00659.parquet.


2022-12-28 19:41:57,429 - assign_clone_ids.ipynb - INFO - Participant P00661 (GeneLocus.TCR) has 212874 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00661.tsv'), (273485, 132), (222496, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:57,549 - assign_clone_ids.ipynb - INFO - Participant p1545 (GeneLocus.TCR) has 85815 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f70c136a6d924982947f.tsv'), (231012, 169), (95150, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:41:57,681 - assign_clone_ids.ipynb - INFO - Participant P00662 (GeneLocus.TCR) has 197452 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00662.tsv'), (259947, 132), (208308, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:00,077 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 11963}


2022-12-28 19:42:00,155 - assign_clone_ids.ipynb - INFO - Participant p22 (GeneLocus.TCR) has 82424 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p22.parquet.




2022-12-28 19:42:01,255 - assign_clone_ids.ipynb - INFO - Participant p1445 (GeneLocus.TCR) has 99746 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.5f07aa8939579433171763b7.tsv'), (281922, 169), (107469, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:03,299 - assign_clone_ids.ipynb - INFO - Participant p1494 (GeneLocus.TCR) has 112943 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f70a136a6d9249829477.tsv'), (273674, 169), (124165, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:04,385 - assign_clone_ids.ipynb - INFO - Participant P00652 (GeneLocus.TCR) has 253859 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00652.tsv'), (338029, 132), (269226, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:05,067 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 10047}


2022-12-28 19:42:05,640 - assign_clone_ids.ipynb - INFO - Participant P00657 (GeneLocus.TCR) has 187173 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00657.parquet.


2022-12-28 19:42:05,930 - assign_clone_ids.ipynb - INFO - Participant p26 (GeneLocus.TCR) has 88575 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i124.txt.gz'), (96990, 16), (90983, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:06,445 - assign_clone_ids.ipynb - INFO - Participant p16 (GeneLocus.TCR) has 419090 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p16.parquet.


2022-12-28 19:42:07,259 - assign_clone_ids.ipynb - INFO - Participant p1551 (GeneLocus.TCR) has 101681 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f70d136a6d9249829482.tsv'), (340302, 169), (112725, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:07,435 - assign_clone_ids.ipynb - INFO - Participant p27 (GeneLocus.TCR) has 56262 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i125.txt.gz'), (62599, 16), (57817, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:07,475 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 4921}


2022-12-28 19:42:08,260 - assign_clone_ids.ipynb - INFO - Participant p1545 (GeneLocus.TCR) has 85739 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1545.parquet.


2022-12-28 19:42:08,609 - assign_clone_ids.ipynb - INFO - Participant p18 (GeneLocus.TCR) has 464360 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A2-i141.txt.gz'), (525355, 16), (499398, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:08,976 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV5-2*01': 5, 'TRBV7-5*02': 12, 'TRBVC*01': 2}


2022-12-28 19:42:11,080 - assign_clone_ids.ipynb - INFO - Participant P00647 (GeneLocus.TCR) has 249172 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00647.tsv'), (336609, 132), (261980, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:11,746 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 4357}


2022-12-28 19:42:12,148 - assign_clone_ids.ipynb - INFO - Participant P00650 (GeneLocus.TCR) has 250319 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00650.tsv'), (337727, 132), (262680, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:12,258 - assign_clone_ids.ipynb - INFO - Participant P00663 (GeneLocus.TCR) has 185374 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00663.parquet.


2022-12-28 19:42:12,267 - assign_clone_ids.ipynb - INFO - Participant p1473 (GeneLocus.TCR) has 133743 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.5f07aa8a39579433171763ba.tsv'), (330825, 169), (145577, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:12,323 - assign_clone_ids.ipynb - INFO - Participant P00656 (GeneLocus.TCR) has 184006 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00656.parquet.


2022-12-28 19:42:13,355 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 4076}


2022-12-28 19:42:14,059 - assign_clone_ids.ipynb - INFO - Participant p27 (GeneLocus.TCR) has 56183 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p27.parquet.




2022-12-28 19:42:15,255 - assign_clone_ids.ipynb - INFO - Participant p1486 (GeneLocus.TCR) has 129165 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f709136a6d9249829474.tsv'), (296252, 169), (137213, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:15,651 - assign_clone_ids.ipynb - INFO - Participant p26 (GeneLocus.TCR) has 88482 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p26.parquet.


2022-12-28 19:42:16,111 - assign_clone_ids.ipynb - INFO - Participant P00646 (GeneLocus.TCR) has 266870 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00646.tsv'), (358624, 132), (281059, 144), <GeneLocus.TCR: 2>)]




2022-12-28 19:42:18,138 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 31673}




2022-12-28 19:42:18,782 - assign_clone_ids.ipynb - INFO - Participant p1465 (GeneLocus.TCR) has 152694 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f704136a6d9249829465.tsv'), (306456, 169), (166648, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:19,753 - assign_clone_ids.ipynb - INFO - Participant p1494 (GeneLocus.TCR) has 112790 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1494.parquet.




2022-12-28 19:42:20,111 - assign_clone_ids.ipynb - INFO - Participant p1495 (GeneLocus.TCR) has 117326 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f70b136a6d924982947a.tsv'), (395001, 169), (132964, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:20,441 - assign_clone_ids.ipynb - INFO - Participant p1445 (GeneLocus.TCR) has 99575 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1445.parquet.


2022-12-28 19:42:21,002 - assign_clone_ids.ipynb - INFO - Participant p1489 (GeneLocus.TCR) has 155580 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.5f07aa8c39579433171763c0.tsv'), (307534, 169), (163376, 180), <GeneLocus.TCR: 2>)]




2022-12-28 19:42:21,582 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 19357}


2022-12-28 19:42:22,156 - assign_clone_ids.ipynb - INFO - Participant P00651 (GeneLocus.TCR) has 207511 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00651.parquet.


2022-12-28 19:42:23,379 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 881}


2022-12-28 19:42:23,797 - assign_clone_ids.ipynb - INFO - Participant p1551 (GeneLocus.TCR) has 101497 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1551.parquet.




2022-12-28 19:42:24,702 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 3363}


2022-12-28 19:42:25,910 - assign_clone_ids.ipynb - INFO - Participant p17 (GeneLocus.TCR) has 346177 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p17.parquet.




2022-12-28 19:42:26,454 - assign_clone_ids.ipynb - INFO - Participant P00648 (GeneLocus.TCR) has 217340 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00648.parquet.


2022-12-28 19:42:27,132 - assign_clone_ids.ipynb - INFO - Participant P00644 (GeneLocus.TCR) has 292816 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00644.tsv'), (398837, 132), (307662, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:27,474 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 1100}


2022-12-28 19:42:27,530 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 6258}


2022-12-28 19:42:28,711 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 25549}


2022-12-28 19:42:29,140 - assign_clone_ids.ipynb - INFO - Participant P00668 (GeneLocus.TCR) has 252306 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00668.tsv'), (340987, 132), (269879, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:29,534 - assign_clone_ids.ipynb - INFO - Participant P00649 (GeneLocus.TCR) has 217345 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00649.parquet.


2022-12-28 19:42:29,571 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 11793}


2022-12-28 19:42:29,699 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 25955}


2022-12-28 19:42:29,819 - assign_clone_ids.ipynb - INFO - Participant p19 (GeneLocus.TCR) has 281626 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p19.parquet.


2022-12-28 19:42:30,052 - assign_clone_ids.ipynb - INFO - Participant P00660 (GeneLocus.TCR) has 206020 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00660.parquet.


2022-12-28 19:42:30,693 - assign_clone_ids.ipynb - INFO - Participant P00654 (GeneLocus.TCR) has 287584 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/emerson/P00654.tsv'), (381078, 132), (304347, 144), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:30,915 - assign_clone_ids.ipynb - INFO - Participant p1473 (GeneLocus.TCR) has 133487 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1473.parquet.




2022-12-28 19:42:32,590 - assign_clone_ids.ipynb - INFO - Participant p1486 (GeneLocus.TCR) has 128971 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1486.parquet.


2022-12-28 19:42:32,765 - assign_clone_ids.ipynb - INFO - Participant P00653 (GeneLocus.TCR) has 211767 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00653.parquet.


2022-12-28 19:42:33,440 - assign_clone_ids.ipynb - INFO - Participant p41 (GeneLocus.TCR) has 40996 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S22.txt.gz'), (43350, 16), (41401, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:33,586 - assign_clone_ids.ipynb - INFO - Participant p32 (GeneLocus.TCR) has 209946 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i184.txt.gz'), (236816, 16), (218286, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:34,779 - assign_clone_ids.ipynb - INFO - Participant P00662 (GeneLocus.TCR) has 197028 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00662.parquet.


2022-12-28 19:42:35,402 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 4941}




2022-12-28 19:42:36,929 - assign_clone_ids.ipynb - INFO - Participant P00661 (GeneLocus.TCR) has 212553 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00661.parquet.


2022-12-28 19:42:37,191 - assign_clone_ids.ipynb - INFO - Participant p34 (GeneLocus.TCR) has 150023 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i186.txt.gz'), (164394, 16), (154266, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:37,449 - assign_clone_ids.ipynb - INFO - Participant p35 (GeneLocus.TCR) has 132475 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i188.txt.gz'), (145705, 16), (136979, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:38,151 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 10201}


2022-12-28 19:42:38,775 - assign_clone_ids.ipynb - INFO - Participant p1495 (GeneLocus.TCR) has 117154 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1495.parquet.


2022-12-28 19:42:38,923 - assign_clone_ids.ipynb - INFO - Participant p41 (GeneLocus.TCR) has 40942 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p41.parquet.


2022-12-28 19:42:39,561 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 13543}


2022-12-28 19:42:41,059 - assign_clone_ids.ipynb - INFO - Participant p1465 (GeneLocus.TCR) has 152490 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1465.parquet.




2022-12-28 19:42:42,199 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 11184}


2022-12-28 19:42:42,303 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 8108}


2022-12-28 19:42:42,501 - assign_clone_ids.ipynb - INFO - Participant P00652 (GeneLocus.TCR) has 253546 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00652.parquet.


2022-12-28 19:42:42,514 - assign_clone_ids.ipynb - INFO - Participant p33 (GeneLocus.TCR) has 225289 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i185.txt.gz'), (248897, 16), (234176, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:43,301 - assign_clone_ids.ipynb - INFO - Participant p1489 (GeneLocus.TCR) has 155271 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1489.parquet.


2022-12-28 19:42:43,354 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 6899}


2022-12-28 19:42:43,707 - assign_clone_ids.ipynb - INFO - Participant p2 (GeneLocus.TCR) has 531216 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A3-i102.txt.gz'), (603590, 16), (576637, 26), <GeneLocus.TCR: 2>)]






2022-12-28 19:42:47,483 - assign_clone_ids.ipynb - INFO - Participant p23 (GeneLocus.TCR) has 315560 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i110.txt.gz'), (356952, 16), (332711, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:47,831 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 16402}


2022-12-28 19:42:47,913 - assign_clone_ids.ipynb - INFO - Participant p29 (GeneLocus.TCR) has 284759 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i128.txt.gz'), (325503, 16), (298900, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:48,102 - assign_clone_ids.ipynb - INFO - Participant p43 (GeneLocus.TCR) has 108534 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S20.txt.gz'), (115538, 16), (111156, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:48,321 - assign_clone_ids.ipynb - INFO - Participant p39 (GeneLocus.TCR) has 115593 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i194.txt.gz'), (125307, 16), (117974, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:42:49,319 - assign_clone_ids.ipynb - INFO - Participant P00647 (GeneLocus.TCR) has 248718 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00647.parquet.


2022-12-28 19:42:49,923 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 32537}


2022-12-28 19:42:50,152 - assign_clone_ids.ipynb - INFO - Participant p35 (GeneLocus.TCR) has 132344 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p35.parquet.


2022-12-28 19:42:50,503 - assign_clone_ids.ipynb - INFO - Participant p34 (GeneLocus.TCR) has 149812 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p34.parquet.


2022-12-28 19:42:51,870 - assign_clone_ids.ipynb - INFO - Participant P00650 (GeneLocus.TCR) has 249983 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00650.parquet.








2022-12-28 19:42:54,697 - assign_clone_ids.ipynb - INFO - Participant p32 (GeneLocus.TCR) has 209701 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p32.parquet.


2022-12-28 19:42:54,981 - assign_clone_ids.ipynb - INFO - Participant p18 (GeneLocus.TCR) has 463819 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p18.parquet.


2022-12-28 19:42:56,673 - assign_clone_ids.ipynb - INFO - Participant p43 (GeneLocus.TCR) has 108419 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p43.parquet.


2022-12-28 19:42:57,706 - assign_clone_ids.ipynb - INFO - Participant p39 (GeneLocus.TCR) has 115381 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p39.parquet.




2022-12-28 19:43:00,485 - assign_clone_ids.ipynb - INFO - Participant p33 (GeneLocus.TCR) has 225010 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p33.parquet.








2022-12-28 19:43:01,706 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 25727}


2022-12-28 19:43:02,763 - assign_clone_ids.ipynb - INFO - Participant P00646 (GeneLocus.TCR) has 266317 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00646.parquet.




2022-12-28 19:43:04,020 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 29828}


2022-12-28 19:43:06,964 - assign_clone_ids.ipynb - INFO - Participant P00668 (GeneLocus.TCR) has 252027 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00668.parquet.




2022-12-28 19:43:07,176 - assign_clone_ids.ipynb - INFO - Participant P00654 (GeneLocus.TCR) has 287242 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00654.parquet.


2022-12-28 19:43:08,715 - assign_clone_ids.ipynb - INFO - Participant p29 (GeneLocus.TCR) has 284463 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p29.parquet.


2022-12-28 19:43:09,246 - assign_clone_ids.ipynb - INFO - Participant p40 (GeneLocus.TCR) has 298448 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i195.txt.gz'), (337012, 16), (313103, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:43:10,366 - assign_clone_ids.ipynb - INFO - Participant P00644 (GeneLocus.TCR) has 292460 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/P00644.parquet.


2022-12-28 19:43:10,559 - assign_clone_ids.ipynb - INFO - Participant p1480 (GeneLocus.TCR) has 219041 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/shomuradova/split.6047f706136a6d924982946b.tsv'), (548521, 169), (249540, 180), <GeneLocus.TCR: 2>)]


2022-12-28 19:43:10,843 - assign_clone_ids.ipynb - INFO - Participant p23 (GeneLocus.TCR) has 315124 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p23.parquet.




2022-12-28 19:43:14,356 - assign_clone_ids.ipynb - INFO - Participant p54 (GeneLocus.TCR) has 292979 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S21.txt.gz'), (332210, 16), (308352, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:43:17,255 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 13279}


2022-12-28 19:43:18,017 - assign_clone_ids.ipynb - INFO - Participant p37 (GeneLocus.TCR) has 345081 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i191.txt.gz'), (384882, 16), (362082, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:43:25,145 - assign_clone_ids.ipynb - INFO - Participant p2 (GeneLocus.TCR) has 530740 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p2.parquet.


2022-12-28 19:43:25,519 - assign_clone_ids.ipynb - INFO - Participant p56 (GeneLocus.TCR) has 354320 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S19.txt.gz'), (403004, 16), (374373, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:43:27,996 - assign_clone_ids.ipynb - INFO - Participant p1 (GeneLocus.TCR) has 983868 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1.parquet.


2022-12-28 19:43:30,458 - assign_clone_ids.ipynb - INFO - Participant p40 (GeneLocus.TCR) has 297970 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p40.parquet.


2022-12-28 19:43:31,493 - assign_clone_ids.ipynb - INFO - Participant p1480 (GeneLocus.TCR) has 218641 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p1480.parquet.


2022-12-28 19:43:33,362 - assign_clone_ids.ipynb - INFO - Participant p70 (GeneLocus.TCR) has 420893 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I160.txt.gz'), (467334, 16), (443374, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:43:34,422 - assign_clone_ids.ipynb - INFO - Participant p54 (GeneLocus.TCR) has 292664 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p54.parquet.




2022-12-28 19:43:40,314 - assign_clone_ids.ipynb - INFO - Participant p72 (GeneLocus.TCR) has 428473 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I150.txt.gz'), (490122, 16), (455986, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:43:42,360 - assign_clone_ids.ipynb - INFO - Participant p37 (GeneLocus.TCR) has 344540 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p37.parquet.


2022-12-28 19:43:49,691 - assign_clone_ids.ipynb - INFO - Participant p56 (GeneLocus.TCR) has 353872 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p56.parquet.


2022-12-28 19:43:55,014 - assign_clone_ids.ipynb - INFO - Participant p4 (GeneLocus.TCR) has 622388 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A3-i107.txt.gz'), (734720, 16), (680909, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:43:58,312 - assign_clone_ids.ipynb - INFO - Participant p38 (GeneLocus.TCR) has 618159 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i192.txt.gz'), (721262, 16), (670571, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:44:01,279 - assign_clone_ids.ipynb - INFO - Participant p70 (GeneLocus.TCR) has 420194 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p70.parquet.


2022-12-28 19:44:03,138 - assign_clone_ids.ipynb - INFO - Participant p71 (GeneLocus.TCR) has 549425 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I215ob.txt.gz'), (656085, 16), (596776, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:44:03,931 - assign_clone_ids.ipynb - INFO - Participant p52 (GeneLocus.TCR) has 637782 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S24.txt.gz'), (721414, 16), (687646, 26), <GeneLocus.TCR: 2>)]




2022-12-28 19:44:09,274 - assign_clone_ids.ipynb - INFO - Participant p72 (GeneLocus.TCR) has 428026 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p72.parquet.


2022-12-28 19:44:12,755 - assign_clone_ids.ipynb - INFO - Participant p5 (GeneLocus.TCR) has 675897 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A3-i110.txt.gz'), (786904, 16), (741186, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:44:19,020 - assign_clone_ids.ipynb - INFO - Participant p3 (GeneLocus.TCR) has 866615 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A3-i106.txt.gz'), (1003831, 16), (948357, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:44:37,558 - assign_clone_ids.ipynb - INFO - Participant p4 (GeneLocus.TCR) has 621889 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p4.parquet.


2022-12-28 19:44:37,562 - assign_clone_ids.ipynb - INFO - Participant p36 (GeneLocus.TCR) has 887439 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i189.txt.gz'), (570480, 16), (531898, 26), <GeneLocus.TCR: 2>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A4-i190.txt.gz'), (467269, 16), (438379, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:44:40,138 - assign_clone_ids.ipynb - INFO - Participant p71 (GeneLocus.TCR) has 548898 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p71.parquet.


2022-12-28 19:44:40,566 - assign_clone_ids.ipynb - INFO - Participant p38 (GeneLocus.TCR) has 617471 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p38.parquet.


2022-12-28 19:44:46,201 - assign_clone_ids.ipynb - INFO - Participant p52 (GeneLocus.TCR) has 637237 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p52.parquet.


2022-12-28 19:44:57,518 - assign_clone_ids.ipynb - INFO - Participant p5 (GeneLocus.TCR) has 675008 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p5.parquet.


2022-12-28 19:45:01,103 - assign_clone_ids.ipynb - INFO - Participant p53 (GeneLocus.TCR) has 925164 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A5-S23.txt.gz'), (137624, 16), (126289, 26), <GeneLocus.TCR: 2>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I202ob.txt.gz'), (954170, 16), (875957, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:45:09,066 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV6-3*01': 15839}


2022-12-28 19:45:18,563 - assign_clone_ids.ipynb - INFO - Participant p3 (GeneLocus.TCR) has 865685 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p3.parquet.




2022-12-28 19:45:34,546 - assign_clone_ids.ipynb - INFO - Participant p36 (GeneLocus.TCR) has 886393 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p36.parquet.


2022-12-28 19:46:03,164 - assign_clone_ids.ipynb - INFO - Participant p53 (GeneLocus.TCR) has 924067 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p53.parquet.


2022-12-28 19:49:58,155 - assign_clone_ids.ipynb - INFO - Participant p76 (GeneLocus.TCR) has 2092047 unique clones from specimens: [(PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I200ob.txt.gz'), (1812062, 16), (1723172, 26), <GeneLocus.TCR: 2>), (PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/external_cohorts/raw_data/chudakov_aging/A6-I201ob.txt.gz'), (758985, 16), (721976, 26), <GeneLocus.TCR: 2>)]


2022-12-28 19:52:07,129 - assign_clone_ids.ipynb - INFO - Participant p76 (GeneLocus.TCR) has 2090120 sampled sequences -> /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/external_cohorts_part_tables/p76.parquet.


In [43]:
len(fnames_output)

965

In [44]:
# drop null returns
fnames_output = [f for f in fnames_output if f is not None]
len(fnames_output)

945

In [45]:
# Many, but not all, output fnames will exist now
len(fnames_output), all_specimens.shape[0]

(945, 1038)

In [46]:
pd.read_parquet(
    fnames_output[0],
).head()

Unnamed: 0,sequence_id,participant_label,specimen_label,extracted_isotype,v_segment,j_segment,cdr1_seq_aa_q,cdr2_seq_aa_q,cdr3_seq_aa_q,pre_seq_nt_q,...,cdr3_nt_sequence_trim_len,num_reads,timepoint,is_peak,disease,disease_subtype,cluster_id_within_clustering_group,igh_or_tcrb_clone_id,total_clone_num_reads,num_clone_members
0,6028499,326650,326650_1,IGHM,IGHV4-31*03,IGHJ6*02,G G S I S S G G Y Y,I Y Y S G S T,A R D A W V R G N Y G M D V,,...,42,1,0.0,True,Healthy/Background,Healthy/Background - Briney,62.0,326650_1|IGHV4-31|IGHJ6|42|62.0,1,1
1,6028543,326650,326650_1,IGHG3,IGHV1-8*01,IGHJ4*02,G Y T F T S Y D,M N P N S G N T,A R G T T A E N,,...,24,1,0.0,True,Healthy/Background,Healthy/Background - Briney,41.0,326650_1|IGHV1-8|IGHJ4|24|41.0,1,1
448206,7528081,326650,326650_1,IGHG1,IGHV3-7*01,IGHJ4*02,G F T F S N Y W,I R Q D G S E K,A R S T A G L D Y,,...,27,2,0.0,True,Healthy/Background,Healthy/Background - Briney,621.0,326650_1|IGHV3-7|IGHJ4|27|621.0,9,7
3,6028638,326650,326650_1,IGHG4,IGHV4-59*01,IGHJ4*02,G G S I S S Y Y,I Y Y S G S T,A R G Y A E E Q H F D Y,,...,36,1,0.0,True,Healthy/Background,Healthy/Background - Briney,3922.0,326650_1|IGHV4-59|IGHJ4|36|3922.0,1,1
4,6028661,326650,326650_1,IGHG1,IGHV1-46*01,IGHJ3*02,G Y T F T S Y Y,I N P S G G S T,A S Q S P D A F D I,,...,30,1,0.0,True,Healthy/Background,Healthy/Background - Briney,20.0,326650_1|IGHV1-46|IGHJ3|30|20.0,1,1


In [47]:
pd.read_parquet(
    fnames_output[0],
).columns

Index(['sequence_id', 'participant_label', 'specimen_label',
       'extracted_isotype', 'v_segment', 'j_segment', 'cdr1_seq_aa_q',
       'cdr2_seq_aa_q', 'cdr3_seq_aa_q', 'pre_seq_nt_q', 'fr1_seq_nt_q',
       'cdr1_seq_nt_q', 'fr2_seq_nt_q', 'cdr2_seq_nt_q', 'fr3_seq_nt_q',
       'cdr3_seq_nt_q', 'post_seq_nt_q', 'pre_seq_nt_v', 'fr1_seq_nt_v',
       'cdr1_seq_nt_v', 'fr2_seq_nt_v', 'cdr2_seq_nt_v', 'fr3_seq_nt_v',
       'cdr3_seq_nt_v', 'post_seq_nt_v', 'pre_seq_nt_d', 'fr1_seq_nt_d',
       'cdr1_seq_nt_d', 'fr2_seq_nt_d', 'cdr2_seq_nt_d', 'fr3_seq_nt_d',
       'cdr3_seq_nt_d', 'post_seq_nt_d', 'pre_seq_nt_j', 'fr1_seq_nt_j',
       'cdr1_seq_nt_j', 'fr2_seq_nt_j', 'cdr2_seq_nt_j', 'fr3_seq_nt_j',
       'cdr3_seq_nt_j', 'post_seq_nt_j', 'productive', 'v_sequence',
       'd_sequence', 'j_sequence', 'isotype_supergroup', 'v_gene', 'j_gene',
       'cdr1_seq_aa_q_trim', 'cdr2_seq_aa_q_trim', 'cdr3_seq_aa_q_trim',
       'cdr3_aa_sequence_trim_len', 'v_mut', 'cdr3_seq_nt_q_trim'