In [87]:
import pandas as pd
import numpy as np
import re
import math
from collections import Counter

The first exercise here is to extract the data from [GenTB](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-021-00953-4) that was **not** used in the WHO catalogue generation.

In [2]:
run_re = re.compile(r"(E|D|S)RR[0-9]{6,}")
exp_re = re.compile(r"(E|D|S)RX[0-9]{6,}")
sample_re = re.compile(r"(E|D|S)RS[0-9]{6,}")
biosample_re = re.compile(r"SAM(E|D|N)[A-Z]?[0-9]+")

In [3]:
gentb_sheet = "../../docs/gentb-samplesheet.csv"
who_sheet = "../../docs/who-samplesheet.csv"

In [4]:
gentbdf = pd.read_csv(gentb_sheet)
gentbdf.set_index("run_accession", inplace=True)
gentbdf = gentbdf[~gentbdf.index.duplicated(keep='first')]
whodf = pd.read_csv(who_sheet)

In [5]:
len(gentbdf)

18425

In [6]:
len(whodf)

38223

In [7]:
gentbdf.columns

Index(['bioproject', 'biosample', 'amikacin', 'capreomycin', 'ciprofloxacin',
       'ethambutol', 'ethionamide', 'isoniazid', 'kanamycin', 'levofloxacin',
       'moxifloxacin', 'ofloxacin', 'para-aminosalicylic_acid', 'pyrazinamide',
       'rifampicin', 'streptomycin'],
      dtype='object')

In [8]:
whodf.columns

Index(['ena_project', 'ena_sample', 'ena_experiment', 'ena_run'], dtype='object')

In [9]:
gentb_accs = []
with open(gentb_sheet) as fp:
    _ = next(fp)
    for line in fp:
        if m := run_re.search(line):
            run = m.group()
        else:
            run = None
        if m:= exp_re.search(line):
            exp = m.group()
        else:
            exp = None
        if m := sample_re.search(line):
            sample = m.group()
        else:
            sample = None
        if m := biosample_re.search(line):
            biosample = m.group()
        else:
            biosample = None
        gentb_accs.append((run, exp, sample, biosample))

In [10]:
gentb_accs

[('SRR7439415', None, None, 'SAMN09492287'),
 ('SRR7131298', None, None, 'SAMN09090624'),
 ('SRR7131297', None, None, 'SAMN09090623'),
 ('SRR7131296', None, None, 'SAMN09090622'),
 ('SRR7131295', None, None, 'SAMN09090621'),
 ('SRR7131294', None, None, 'SAMN09090620'),
 ('SRR7131293', None, None, 'SAMN09090619'),
 ('SRR7131292', None, None, 'SAMN09090618'),
 ('SRR7131291', None, None, 'SAMN09090515'),
 ('SRR7131290', None, None, 'SAMN09090514'),
 ('SRR7131289', None, None, 'SAMN09090513'),
 ('SRR7131288', None, None, 'SAMN09090512'),
 ('SRR7131287', None, None, 'SAMN09090511'),
 ('SRR7131286', None, None, 'SAMN09090510'),
 ('SRR7131285', None, None, 'SAMN09090509'),
 ('SRR7131284', None, None, 'SAMN09090508'),
 ('SRR7131283', None, None, 'SAMN09090579'),
 ('SRR7131282', None, None, 'SAMN09090429'),
 ('SRR7131281', None, None, 'SAMN09090413'),
 ('SRR7131280', None, None, 'SAMN09090517'),
 ('SRR7131279', None, None, 'SAMN09090516'),
 ('SRR7131277', None, None, 'SAMN09090409'),
 ('SRR7131

In [11]:
who_accs = []
with open(who_sheet) as fp:
    _ = next(fp)
    for line in fp:
        if m := run_re.search(line):
            run = m.group()
        else:
            run = None
        if m:= exp_re.search(line):
            exp = m.group()
        else:
            exp = None
        if m := sample_re.search(line):
            sample = m.group()
        else:
            sample = None
        if m := biosample_re.search(line):
            biosample = m.group()
        else:
            biosample = None
        accs = (run, exp, sample, biosample)
        if all(x is None for x in accs):
            continue
        else:
            who_accs.append(accs)

In [12]:
len(gentb_accs)

18612

In [13]:
len(who_accs)

35811

In [14]:
who_accs

[('ERR2516949', 'ERX2536164', 'ERS2401626', None),
 ('ERR2516943', 'ERX2536158', 'ERS2401620', None),
 ('ERR2517010', 'ERX2536225', 'ERS2401687', None),
 ('ERR2516824', 'ERX2536039', 'ERS2401501', None),
 ('ERR2516841', 'ERX2536056', 'ERS2401518', None),
 ('ERR2517030', 'ERX2536245', 'ERS2401707', None),
 ('ERR2517047', 'ERX2536262', 'ERS2401724', None),
 ('ERR2517067', 'ERX2536282', 'ERS2401744', None),
 ('ERR2516781', 'ERX2535996', 'ERS2401458', None),
 ('ERR2517086', 'ERX2536301', 'ERS2401763', None),
 ('ERR2516802', 'ERX2536017', 'ERS2401479', None),
 ('ERR2516847', 'ERX2536062', 'ERS2401524', None),
 ('ERR2516913', 'ERX2536128', 'ERS2401590', None),
 ('ERR2516850', 'ERX2536065', 'ERS2401527', None),
 ('ERR2517057', 'ERX2536272', 'ERS2401734', None),
 ('ERR2516942', 'ERX2536157', 'ERS2401619', None),
 ('ERR2516823', 'ERX2536038', 'ERS2401500', None),
 ('ERR2516796', 'ERX2536011', 'ERS2401473', None),
 ('ERR2516814', 'ERX2536029', 'ERS2401491', None),
 ('ERR2516782', 'ERX2535997', '

In [15]:
who_runs = {t[0] for t in who_accs if t[0] is not None}
who_exps = {t[1] for t in who_accs if t[1] is not None}
who_samps = {t[2] for t in who_accs if t[2] is not None}
who_bios = {t[3] for t in who_accs if t[3] is not None}

In [16]:
non_who_accs = []
for r,e,s,b in gentb_accs:
    if r is not None and r in who_runs:
        continue
    if e is not None and e in who_exps:
        continue
    if s is not None and s in who_samps:
        continue
    if b is not None and b in who_bios:
        continue
    non_who_accs.append((r, e, s, b))

In [17]:
len(non_who_accs)

4860

In [28]:
non_who_runs = list({t[0] for t in non_who_accs})
non_who_df = gentbdf.loc[non_who_runs]
non_who_df.index.rename('run', inplace=True)
non_who_df

Unnamed: 0_level_0,bioproject,biosample,amikacin,capreomycin,ciprofloxacin,ethambutol,ethionamide,isoniazid,kanamycin,levofloxacin,moxifloxacin,ofloxacin,para-aminosalicylic_acid,pyrazinamide,rifampicin,streptomycin
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
SRR1723820,PRJNA268900,SAMN03246542,,,,S,,S,S,,,S,,,S,S
SRR6982511,PRJNA428596,SAMN08912796,S,S,,,,S,S,,S,S,,S,S,
ERR400543,PRJEB5162,SAMEA2297114,,,,S,,R,,,,,,S,S,S
ERR182009,PRJEB2794,SAMEA1556682,,,,,,S,,,,,,,S,
ERR040112,PRJEB2424,SAMEA787711,,S,,S,S,S,S,,,,,S,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6982546,PRJNA428596,SAMN08913114,S,S,,,,S,S,,S,S,,S,S,
ERR245666,PRJEB2358,SAMEA1708475,,,,,,S,,,,,,,S,
SRR6982260,PRJNA428596,SAMN08912848,S,S,,,,S,S,,S,S,,S,S,
SRR6896416,PRJNA428596,SAMN08794955,S,S,,,,S,S,,S,S,,S,S,


Gathering Delamanid (DLM) phenotypes from https://journals.asm.org/doi/full/10.1128/JCM.01304-20

They used a critical concentraction of >0.06 μg/ml - based on the [WHO technical guidelines](https://apps.who.int/iris/bitstream/handle/10665/260470/WHO-CDS-TB-2018.5-eng.pdf).

MICs and accessions where taken from the supplementary Excel spreadsheet https://journals.asm.org/doi/suppl/10.1128/JCM.01304-20/suppl_file/jcm.01304-20-sd004.xlsx

In [65]:
dlm_mics = pd.read_csv("../../docs/dlm-mics.csv")
DLM_CC = 0.06  # delamanid critical concentration

In [31]:
dlm_mics

Unnamed: 0,ISOLATE 1 DLM MIC (ug/ml),ISOLATE 2 DLM MIC (ug/ml),DRS Sample selected (Isolate 1),DRS Sample selected (Isolate 2)
0,,,TRL0018311,TRL0020517
1,≤0.004,,TRL0084025,
2,0.03,,132321,
3,0.008,,TRL0029765,TRL0050938
4,0.015,,3617-13,
...,...,...,...,...
158,0.03,,140129,131110
159,0.5,,BF01311702,BF01330971
160,0.25,,BF01457875,
161,0.015,,3635-13,


In [42]:
isolate1 = list(zip(dlm_mics.iloc[:,0], dlm_mics.iloc[:,2]))
isolate2 = list(zip(dlm_mics.iloc[:,1], dlm_mics.iloc[:,3]))

In [62]:
dlm_mic_df = pd.DataFrame(isolate1+isolate2, columns=["MIC", "isolate"]).dropna()
dlm_mic_df.set_index("isolate", verify_integrity=True, inplace=True)

In [63]:
dlm_mic_df

Unnamed: 0_level_0,MIC
isolate,Unnamed: 1_level_1
TRL0084025,≤0.004
132321,0.03
TRL0029765,0.008
3617-13,0.015
132348,2
...,...
131140,0.015
140162,1
132756,0.015
132639,0.008


Convert all columns with a greater/less symbol into a single value

In [93]:
def isfloat(s: str) -> bool:
    try:
        f = float(s)
        return True
    except ValueError:
        return False

LEQ = "≤"
GT = ">"

def convert_op(s: str) -> float:
    """Convert a string with a > or < into a value just below or above it"""
    if s.startswith(LEQ):
        return float(s[1:].strip())
    elif s.startswith(GT):
        f = float(s[1:].strip()) + 0.00001
        return math.ceil(f)
    else:
        raise ValueError(f"Don't know how to handle {s}")

In [96]:
dlm_pheno = []
for f in dlm_mic_df["MIC"]:
    if not isfloat(f):
        mic = convert_op(f)
    else:
        mic = float(f)
    p = "R" if mic > DLM_CC else "S"
    dlm_pheno.append(p)

In [100]:
sum(1 for p in dlm_pheno if p == "R")

39

In [98]:
dlm_mic_df["delamanid"] = dlm_pheno

In [99]:
dlm_mic_df

Unnamed: 0_level_0,MIC,delamanid
isolate,Unnamed: 1_level_1,Unnamed: 2_level_1
TRL0084025,≤0.004,S
132321,0.03,S
TRL0029765,0.008,S
3617-13,0.015,S
132348,2,R
...,...,...
131140,0.015,S
140162,1,R
132756,0.015,S
132639,0.008,S


In [174]:
dlm_accs_df = pd.read_csv("../../docs/dlm-samplesheet.csv")
dlm_accs_df.set_index("SampleID", verify_integrity=True, inplace=True)

In [175]:
# this adds the delamanid phenotypes to the appropriate samples
dlm_df = dlm_mic_df.join(dlm_accs_df, how='outer')

In [176]:
dlm_df.index.rename("isolate", inplace=True)

In [177]:
dlm_df = dlm_df.reset_index().set_index("RUN", verify_integrity=True)
dlm_df.index.rename("run", inplace=True)

In [178]:
dlm_df

Unnamed: 0_level_0,isolate,MIC,delamanid,Country,RIF,INH,OFX (2),MFX (0.5),MFX (2),LFX (15),GFX (2),KAN,AMK,CAP,PZA
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
SRR6831721,1-1307,,,Ukraine,S,S,S,S,S,,,S,S,S,S
SRR6831720,1-1521,,,Ukraine,S,S,S,S,S,,,S,S,S,S
SRR6831719,1-2543,,,Ukraine,R,R,R,R,S,,,R,R,R,R
SRR6832237,1-3925,,,Ukraine,S,S,S,,,,,S,S,S,
SRR6832228,1-4043,,,Ukraine,R,R,S,S,S,,,S,S,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6982117,dg00676352,,,SouthAfrica,R,S,S,S,S,,,S,S,S,S
SRR6982312,dh00727632,,,SouthAfrica,S,S,,,,,,,,,
SRR6982311,dh00742746,,,SouthAfrica,S,S,S,S,S,,,S,S,S,S
SRR6982432,ef00266177,,,SouthAfrica,S,S,S,S,S,,,S,S,S,S


In [179]:
dlm_df.index.intersection(who_runs)

Index([], dtype='object', name='run')

Therefore, it looks like all of the samples with DLM phenotypes are not in the WHO set.

There are two MFX critical concentrations in this dataset. My understanding from the reference they sited for CCs (https://doi.org/10.1016/S1473-3099(18)30073-2) is that the 0.5 one if the appropriate one to use. This is also backed up the [WHO guidelines](https://apps.who.int/iris/rest/bitstreams/1098393/retrieve) (page 2 Table 1).

In [185]:
# delete rows based on inverse of column values
drop_cols = ["MIC", "isolate", "Country", "MFX (2)"]
dlm_df.drop(columns=drop_cols, inplace=True)

In [186]:
dlm_df.columns

Index(['delamanid', 'RIF', 'INH', 'OFX (2)', 'MFX (0.5)', 'LFX (15)',
       'GFX (2)', 'KAN', 'AMK', 'CAP', 'PZA'],
      dtype='object')

In [187]:
non_who_df.columns

Index(['bioproject', 'biosample', 'amikacin', 'capreomycin', 'ciprofloxacin',
       'ethambutol', 'ethionamide', 'isoniazid', 'kanamycin', 'levofloxacin',
       'moxifloxacin', 'ofloxacin', 'para-aminosalicylic_acid', 'pyrazinamide',
       'rifampicin', 'streptomycin'],
      dtype='object')

In [239]:
#rename the drug abbreviations
abbrev_to_name = {
    "RIF": "rifampicin",
    "INH": "isoniazid",
    "OFX (2)": "ofloxacin",
    "MFX (0.5)": "moxifloxacin",
    "LFX (15)": "levofloxacin",
    "GFX (2)": "gatifloxacin",
    "KAN": "kanamycin",
    "AMK": "amikacin",
    "CAP": "capreomycin",
    "PZA": "pyrazinamide"
}
dlm_df.rename(columns=abbrev_to_name, inplace=True)

Solution here is taken from https://stackoverflow.com/q/71745028/5299417

What we do here is merge the dlm dataset with the non-WHO dataset and change the phenotype to NaN (i.e. None) if the two phenotypes listed in the two datasets disagree. In addition, if one dataset has an empty value we use the other if it is non-empty.

In [240]:
def comp(c1, c2):
    if pd.isna(c1):
        return c2
    elif pd.isna(c2):
        return c1
    elif c1 != c2:
        return np.nan
    else:
        return c1
    
combine_func = lambda s1, s2: np.array([comp(c1, c2) for c1, c2 in zip(s1,s2)])

In [261]:
non_who_df = non_who_df.combine(dlm_df, func=combine_func, overwrite=False)

Add the data from the head to head paper https://doi.org/10.1101/2022.03.04.22271870

The samplesheet is downloaded from figshare https://doi.org/10.6084/m9.figshare.19304648

In [337]:
csv_url = "https://figshare.com/ndownloader/files/34302611"
h2h_df = pd.read_csv(csv_url)

In [338]:
# use the first accession where there are multiple
h2h_df["illumina_run_accession"] = [r.split(";")[0] for r in h2h_df["illumina_run_accession"]]

In [339]:
h2h_df.set_index("illumina_run_accession", verify_integrity=True, inplace=True)

In [340]:
h2h_df.index.rename("run", inplace=True)

In [341]:
h2h_df.index.intersection(who_runs)

Index(['ERR4817065', 'ERR4816407', 'ERR4821577'], dtype='object', name='run')

So we need to remove these 3 isolates from the dataset

In [342]:
h2h_df = h2h_df.loc[h2h_df.index.difference(who_runs)]

In [343]:
h2h_df

Unnamed: 0_level_0,sample,site,matched,pacbio,excluded,failed_qc,run,barcode,barcode_kit,collection_date,lineage,nanopore_run_accession,pacbio_run_accession,sample_accession,biosample_accession,assembly_accession
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ERR4814410,18_0622465,birmingham,1,0,0,0,birmingham-mgit-run4,10,EXP-NBD103,2018,2.2,ERR9030543,,ERS10792344,SAMEA13188807,
ERR4814811,18_0622363,birmingham,1,0,0,0,birmingham-mgit-run3,3,EXP-NBD103,2018,3.1.2.2,ERR9030493,,ERS10792337,SAMEA13188800,
ERR4814915,18_0622267,birmingham,1,0,0,0,birmingham-mgit-run6,2,EXP-NBD103,2018,4,ERR9030322,,ERS10792333,SAMEA13188796,
ERR4815143,18_0622446,birmingham,1,0,0,0,birmingham-mgit-run5,2,EXP-NBD103,2018,4.2.2,ERR9030423,,ERS10792341,SAMEA13188804,
ERR4815412,18_0622300,birmingham,1,0,0,0,birmingham-mgit-run6,4,EXP-NBD103,2018,1.2.2,ERR9030370,,ERS10792335,SAMEA13188798,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR9030534,mada_1-54,madagascar,1,0,0,0,madagascar_tb_mdr_4,4,EXP-NBD103,22/4/2015,1.1.2,ERR9030367,,ERS10792281,SAMEA13188744,
ERR9030535,mada_123,madagascar,1,0,0,0,madagscar_tb_aug_3,6,EXP-NBD103,2/5/2016,4.1,ERR9030249,,ERS10792239,SAMEA13188702,
ERR9030539,mada_132,madagascar,1,1,0,0,madagscar_tb_aug_4,5,EXP-NBD103,2/2/2017,4.1,ERR9030258,ERR9030503,ERS10792226,SAMEA13188689,ERZ5737209
SRR12882607,R27252,south_africa,1,0,0,0,FAK48818,22,EXP-NBD114,24/2/2015,1.2.2,ERR9030426,,ERS10792290,SAMEA13188753,


Load the phenotype information

In [344]:
url = "https://raw.githubusercontent.com/mbhall88/head_to_head_pipeline/3ed818c9f3754612a33060428ee651cd0f91234b/docs/phenotypes.csv"
h2h_pheno = pd.read_csv(url, index_col="sample")

In [345]:
h2h_df.index.rename("illumina_run_accession", inplace=True)
h2h_df = h2h_df.reset_index().set_index("sample")

In [346]:
h2h_df

Unnamed: 0_level_0,illumina_run_accession,site,matched,pacbio,excluded,failed_qc,run,barcode,barcode_kit,collection_date,lineage,nanopore_run_accession,pacbio_run_accession,sample_accession,biosample_accession,assembly_accession
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
18_0622465,ERR4814410,birmingham,1,0,0,0,birmingham-mgit-run4,10,EXP-NBD103,2018,2.2,ERR9030543,,ERS10792344,SAMEA13188807,
18_0622363,ERR4814811,birmingham,1,0,0,0,birmingham-mgit-run3,3,EXP-NBD103,2018,3.1.2.2,ERR9030493,,ERS10792337,SAMEA13188800,
18_0622267,ERR4814915,birmingham,1,0,0,0,birmingham-mgit-run6,2,EXP-NBD103,2018,4,ERR9030322,,ERS10792333,SAMEA13188796,
18_0622446,ERR4815143,birmingham,1,0,0,0,birmingham-mgit-run5,2,EXP-NBD103,2018,4.2.2,ERR9030423,,ERS10792341,SAMEA13188804,
18_0622300,ERR4815412,birmingham,1,0,0,0,birmingham-mgit-run6,4,EXP-NBD103,2018,1.2.2,ERR9030370,,ERS10792335,SAMEA13188798,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mada_1-54,ERR9030534,madagascar,1,0,0,0,madagascar_tb_mdr_4,4,EXP-NBD103,22/4/2015,1.1.2,ERR9030367,,ERS10792281,SAMEA13188744,
mada_123,ERR9030535,madagascar,1,0,0,0,madagscar_tb_aug_3,6,EXP-NBD103,2/5/2016,4.1,ERR9030249,,ERS10792239,SAMEA13188702,
mada_132,ERR9030539,madagascar,1,1,0,0,madagscar_tb_aug_4,5,EXP-NBD103,2/2/2017,4.1,ERR9030258,ERR9030503,ERS10792226,SAMEA13188689,ERZ5737209
R27252,SRR12882607,south_africa,1,0,0,0,FAK48818,22,EXP-NBD114,24/2/2015,1.2.2,ERR9030426,,ERS10792290,SAMEA13188753,


In [347]:
# drop the LPA phenotypes
drop_cols = [c for c in h2h_pheno.columns if c.endswith("-lpa")]
h2h_pheno.drop(columns=drop_cols, inplace=True)

In [348]:
# some phenos have ND
h2h_pheno.replace("ND", np.nan, inplace=True)

In [349]:
h2h_pheno

Unnamed: 0_level_0,streptomycin,isoniazid,rifampicin,ethambutol,pas,thioacetazone,moxifloxacin,ofloxacin,amikacin,kanamycin,capreomycin,pyrazinamide,ethionamide
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
mada_101,S,S,S,S,S,R,,,,,,,
mada_102,S,S,S,S,S,R,,S,S,S,S,,
mada_103,S,S,S,S,S,R,,,,,,,
mada_104,S,R,S,S,S,R,,S,S,S,S,,
mada_105,S,R,S,S,S,R,,S,S,S,S,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
R36431,,S,S,S,S,,S,,S,S,S,S,R
R36440,,,,,,,,,,,,,
R37259,,,,,,,,S,R,,,,
R37765,,,,,,,,,,,,,


In [350]:
# drop all but the illumina run accessions
h2h_df = h2h_df.loc[:,["illumina_run_accession", "biosample_accession"]]

In [351]:
h2h_df = h2h_pheno.join(h2h_df, how="inner")

In [352]:
h2h_df.rename(columns={"pas": "para-aminosalicylic_acid", "illumina_run_accession": "run", "biosample_accession": "biosample"}, inplace=True)

In [353]:
h2h_df.set_index("run", inplace=True)

In [354]:
h2h_df

Unnamed: 0_level_0,streptomycin,isoniazid,rifampicin,ethambutol,para-aminosalicylic_acid,thioacetazone,moxifloxacin,ofloxacin,amikacin,kanamycin,capreomycin,pyrazinamide,ethionamide,biosample
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ERR9030447,,S,,,,,,S,S,,,,S,SAMEA13188786
ERR9030388,,,,,,,,S,R,,,,R,SAMEA13188759
ERR9030270,,,,,,,,R,S,,,,S,SAMEA13188777
ERR5987402,,,,,,,,S,S,,,,S,SAMEA13188782
ERR6362259,,,,,,,,R,S,,,,S,SAMEA13188767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR9030436,S,R,R,R,S,R,,,,,,,,SAMEA13188675
ERR9030298,S,R,R,S,S,R,,S,S,S,S,,,SAMEA13188703
ERR9030308,S,R,R,S,,,,S,S,S,S,,,SAMEA13188691
ERR9030273,S,R,R,S,S,R,,S,S,S,S,,,SAMEA13188664


In [356]:
non_who_df = non_who_df.combine(h2h_df, func=combine_func, overwrite=False)

In [357]:
# non_who_df.to_csv("../../docs/samplesheet.csv")