In [1]:
import glob
import os
import shlex

import pandas as pd

In [2]:
def read_overview(path):
    """Read one tab-separated overview table and tag each row with its origin.

    Parameters
    ----------
    path : str
        Location of the overview ``.tsv`` file. The literal string ``na`` is
        parsed as a missing value.

    Returns
    -------
    pandas.DataFrame
        The parsed table with two extra columns: ``fname`` (the file's base
        name) and ``dir`` (the base name without its extension, which is the
        directory name that contains the samples).
    """
    overview = pd.read_csv(path, sep='\t', na_values='na')
    file_name = os.path.basename(path)
    stem, _extension = os.path.splitext(file_name)
    return overview.assign(fname=file_name, dir=stem)

# Load each study's overview table from the naive overview directory,
# one DataFrame per file, in the order listed.
dfs = list(map(
    read_overview,
    (os.path.join('adaptive_overview/naive', overview_name)
     for overview_name in ('adaptive-peakman-t1d-naive.tsv',
                           'adaptive-lindau-cmv.tsv',
                           'adaptive-robins-ratio.tsv')),
))

In [3]:
# Stack the per-file tables into one frame with a fresh 0..n-1 index;
# sort=True orders the combined columns alphabetically.
df = pd.concat(objs=dfs, sort=True, ignore_index=True)
# Show which columns are available after combining.
df.columns

Index(['dir', 'fname', 'fraction_productive', 'locus',
       'max_productive_frequency', 'productive_clonality',
       'productive_rearrangements', 'productive_templates', 'sample_name',
       'sample_tags', 'sku', 'test_name', 'total_productive_reads',
       'total_reads', 'total_rearrangements', 'total_templates'],
      dtype='object')

In [4]:
# We only want Naive samples.
# na=False counts a missing sample_tags value (read_overview parses 'na' as
# missing) as "not Naive"; without it the mask would contain NaN and boolean
# indexing would raise. .copy() hands later cells an independent frame, so adding
# columns to it does not trigger SettingWithCopyWarning.
df = df.loc[df['sample_tags'].str.contains('Naive', na=False), :].copy()

In [5]:
# Show the five samples with the fewest productive rearrangements.
(
    df.loc[:, ['productive_rearrangements', 'fname', 'sample_name']]
    .sort_values(by='productive_rearrangements')
    .head(5)
)

Unnamed: 0,productive_rearrangements,fname,sample_name
157,971,adaptive-robins-ratio.tsv,Healthy_Subject_16_CD8_Naive
117,30201,adaptive-robins-ratio.tsv,Healthy_Subject_3_CD8_Naive
185,34293,adaptive-robins-ratio.tsv,Healthy_Subject_9_CD8_Naive
132,35997,adaptive-robins-ratio.tsv,Healthy_Subject_8_CD8_Naive
140,38981,adaptive-robins-ratio.tsv,Healthy_Subject_6_CD8_Naive


In [6]:
# Subject 16 gets thrown out: it has far fewer productive rearrangements
# than any other sample in the sorted listing above.
# The (?!\d) lookahead keeps the pattern from also matching e.g. Subject_160,
# and na=False keeps rows with a missing sample_name instead of breaking `~`.
df = df.loc[~df['sample_name'].str.contains(r'Subject_16(?!\d)', na=False), :]

In [7]:
# Peakman says the diabetic samples are strange, so remove them:
# a row is dropped only when it comes from the Peakman file AND is tagged Diabetes.
is_peakman = df['fname'].eq('adaptive-peakman-t1d-naive.tsv')
is_diabetic = df['sample_tags'].str.contains('Diabetes')
peakman_t1d = is_peakman & is_diabetic
df = df[~peakman_t1d]

In [8]:
# Pull the T-cell marker ('CD4+' or 'CD8+') out of the free-text sample tags;
# rows whose tags contain neither get NaN.
# The raw string avoids the invalid '\+' escape warning, expand=False returns a
# Series rather than a one-column DataFrame, and assign() builds a new frame
# instead of writing into a filtered slice.
df = df.assign(marker=df['sample_tags'].str.extract(r'(CD[48]\+)', expand=False))

In [9]:
# How many samples carry each marker (rows without a marker are not counted).
df.loc[:, 'marker'].value_counts()

CD4+    30
CD8+    24
Name: marker, dtype: int64

In [10]:
# Next we're going to even out this imbalance by picking CD4 vs CD8 from the Robins dataset:
# every Robins subject contributes exactly one sample, CD4 for five randomly
# chosen subjects and CD8 for all the others.
# NOTE(review): the count of 5 is hard-coded for the current inputs; re-check the
# marker counts after filtering if the input files change.

# Subject numbers (as strings) parsed from the Robins sample names; names that do
# not match the pattern are skipped rather than turning into a 'nan' subject.
sample_numbers = set(
    df.loc[df['fname'] == 'adaptive-robins-ratio.tsv', 'sample_name']
    .str.extract(r'Subject_(\d*)_', expand=False)
    .dropna())
# A set has no stable order, so sort before shuffling and seed the shuffle;
# together these make the selection identical on every Restart & Run All.
# Seeding changes which subjects are drawn relative to any earlier unseeded run.
marker_df = pd.DataFrame({'marker': 8}, index=sorted(sample_numbers)).sample(frac=1, random_state=0)
# After the shuffle, the first five subjects are switched from CD8 to CD4.
marker_df.loc[marker_df.index[:5], 'marker'] = 4
robins_chosen = {f'Healthy_Subject_{idx}_CD{marker}_Naive' for idx, marker in marker_df.itertuples()}
robins_chosen

{'Healthy_Subject_10_CD8_Naive',
 'Healthy_Subject_11_CD8_Naive',
 'Healthy_Subject_12_CD4_Naive',
 'Healthy_Subject_13_CD4_Naive',
 'Healthy_Subject_14_CD8_Naive',
 'Healthy_Subject_15_CD8_Naive',
 'Healthy_Subject_17_CD8_Naive',
 'Healthy_Subject_1_CD8_Naive',
 'Healthy_Subject_2_CD8_Naive',
 'Healthy_Subject_3_CD8_Naive',
 'Healthy_Subject_4_CD4_Naive',
 'Healthy_Subject_5_CD8_Naive',
 'Healthy_Subject_6_CD8_Naive',
 'Healthy_Subject_7_CD8_Naive',
 'Healthy_Subject_8_CD4_Naive',
 'Healthy_Subject_9_CD4_Naive'}

In [11]:
# Keep every non-Robins sample, plus only the Robins samples named in robins_chosen.
# isin() replaces the per-row Python membership loop, and the 'keep' column
# (True for every surviving row, as before) is added with assign() so nothing is
# written into a filtered slice.
is_robins = df['fname'] == 'adaptive-robins-ratio.tsv'
df = df.loc[~is_robins | df['sample_name'].isin(robins_chosen), :].assign(keep=True)

# Marker balance after the Robins subsampling.
df['marker'].value_counts()

CD8+    19
CD4+    19
Name: marker, dtype: int64

In [12]:
# Build the shell command that hands every selected sample file to
# `util.py split-repertoires`. Each sample lives at <data root>/<dir>/<sample_name>.tsv.
# shlex.quote leaves ordinary paths untouched but protects any sample name that
# contains spaces or shell metacharacters when the command is pasted into a shell.
sample_paths = [f"/fh/fast/matsen_e/data/{row.dir}/{row.sample_name}.tsv" for row in df.itertuples()]
call = 'python util.py split-repertoires --out-prefix /home/matsen/re/vampire/vampire/_ignore/naive-2019-02-12 --limit-each 10000 '
call += ' '.join(shlex.quote(sample_path) for sample_path in sample_paths)
call

'python util.py split-repertoires --out-prefix /home/matsen/re/vampire/vampire/_ignore/naive-2019-02-12 --limit-each 10000 /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD11_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD9_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD13_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD14_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD12_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD10_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD4_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD8_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD7_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD6_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD5_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD3_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD2_TN.tsv /fh/fast/matsen_e/data/adaptive-peakman-t1d-naive/HD1_TN.tsv /fh/fast/matsen_e/