In [1]:
import pandas as pd
from glob import glob
import numpy as np

In [2]:
d = pd.read_csv('all_read_files.csv')
r1_files = [i for i in d['Filename'] if 'R1.' in i or 'R1_001.' in i]
d.head()

Unnamed: 0,Filename,size
0,dBFA2-02M_NaCl_2N-R1-Subpool_S33_R1_001.fastq.gz,37M
1,dBFA2-21C_2N-R1-Subpool_S32_R1_001.fastq.gz,28M
2,dBFA2-FLC4_2N-R1-Subpool_S34_R1_001.fastq.gz,12M
3,dBFA2_h1_Ancestor_YPD_2N_R1_1_R1.fastq.gz,23M
4,dBFA2_h1_Ancestor_YPD_2N_R1_1_R2.fastq.gz,23M


In [25]:
flist = glob('demult_maps/*/*.csv')
ds = {f.split('/')[-1]: pd.read_csv(f) for f in flist}
dnames = sorted(list(ds.keys()))
df_rec = []
for dn in ds:
    ds[dn]['File'] = dn
    ds[dn] = ds[dn].rename(columns={'Inline_index': 'R1_index'})
    df_rec.append(ds[dn])
    
alldf = pd.concat(df_rec)
file_to_run = {
    'Harvard_dip_all.csv': 'Harvard_dip_LT_',
    'Stanford_hap_all.csv': 'Stanford_hap_LT_',
    'Stanford_high_all.csv': 'Stanford_high_LT_',
    'Stanford_mid_all.csv': 'Stanford_mid_LT_',
    'Stanford_redos_all.csv': 'Stanford_redos_LT_',
    'Stanford_restart_all.csv': 'Stanford_restart_LT_',
    'dBFA2_Harvard_1_indices.csv': 'dBFA2_h1_',
    'dBFA2_Stanford_1_indices.csv': 'dBFA2_s1_',
    'dBFA2_Stanford_2_indices.csv': 'dBFA2_s2_',
    'hBFA1_part1_indices.csv': 'hBFA1_h1_',
    'hBFA1_part2_indices.csv': 'hBFA1_h2_',
    'hBFA2_Harvard_1_indices.csv': 'hBFA2_h1_',
    'hBFA2_Stanford_indices.csv': 'hBFA2_s_',
    'hBFA2_dBFA2_Harvard_2_combined_indices.csv': 'hBFA2_dBFA2_h2_',
    'hBFA1_subpool_demult.csv': '',
    'hBFA2_subpool_demult.csv': '',
    'dBFA2_subpool_demult.csv': ''
}

def get_file(row):
    normal_file = row['Run']+row['Library']+'_R1.fastq.gz'
    if normal_file in r1_files:
        return normal_file
    else:
        found_file = [i for i in r1_files if row['Run']+row['Library'] in i]
        #print(row['Run']+row['Library'], found_file)
        assert len(found_file) == 1
        return found_file[0]

alldf['Run'] = alldf['File'].map(file_to_run)
alldf['R1_files'] = alldf.apply(get_file, axis=1)
lib_to_file = alldf[['Library', 'R1_files']].groupby('Library').agg(lambda x: ';'.join(x)).reset_index()
lib_to_file.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Library,R1_files
0,02M_NaCl_2N-R1-Time0,Stanford_high_LT_02M_NaCl_2N-R1-Time0_R1.fastq.gz
1,02M_NaCl_2N-R1-Time104,Stanford_high_LT_02M_NaCl_2N-R1-Time104_R1.fas...
2,02M_NaCl_2N-R1-Time112,Stanford_high_LT_02M_NaCl_2N-R1-Time112_R1.fas...
3,02M_NaCl_2N-R1-Time128,Stanford_high_LT_02M_NaCl_2N-R1-Time128_R1.fas...
4,02M_NaCl_2N-R1-Time144,Stanford_high_LT_02M_NaCl_2N-R1-Time144_R1.fas...


In [26]:
td = d.merge(alldf, left_on='Filename', right_on='R1_files', how='inner')
td[['Filename', 'Library', 'R1_index', 'R1_bp_to_BC', 'R2_index', 'R2_bp_to_BC']].to_csv('All_file_primer_info.csv', index=False)

In [27]:
def fix_lib(lib):
    if ('Subpool' in lib or 'Ancestor' in lib) and 'BFA' not in lib:
        if 'Ancestor' in lib:
            lib = lib.replace('YPD_2N', 'YPDAnc_2N').replace('Ancestor', 'Subpool')
        lib = lib.replace('_R', '-R')
        return 'dBFA2-'+lib[lib.index('_')+1:]+'-'+lib.split('_')[0]
    else:
        return lib

lib_to_file['Library'] = lib_to_file['Library'].apply(fix_lib)
lib_to_file[lib_to_file['Library'].apply(lambda l: 'Subpool' in l)]

Unnamed: 0,Library,R1_files
127,dBFA2-YPDAnc_2N-R1_1-Subpool,dBFA2_h1_Ancestor_YPD_2N_R1_1_R1.fastq.gz
128,dBFA2-YPDAnc_2N-R1_2-Subpool,dBFA2_h1_Ancestor_YPD_2N_R1_2_R1.fastq.gz
270,dBFA2-02M_NaCl_2N-R2-Subpool,dBFA2_h1_Subpool_02M_NaCl_2N_R2_R1.fastq.gz
271,dBFA2-21C_2N-R2-Subpool,dBFA2_h1_Subpool_21C_2N_R2_R1.fastq.gz
272,dBFA2-37C_2N-R1-Subpool,dBFA2_h1_Subpool_37C_2N_R1_R1.fastq.gz
273,dBFA2-37C_2N-R2-Subpool,dBFA2_h1_Subpool_37C_2N_R2_R1.fastq.gz
274,dBFA2-48Hr_2N-R1-Subpool,dBFA2_h1_Subpool_48Hr_2N_R1_R1.fastq.gz
275,dBFA2-48Hr_2N-R2-Subpool,dBFA2_h1_Subpool_48Hr_2N_R2_R1.fastq.gz
276,dBFA2-CLM_2N-R1-Subpool,dBFA2_h1_Subpool_CLM_2N_R1_R1.fastq.gz
277,dBFA2-CLM_2N-R2-Subpool,dBFA2_h1_Subpool_CLM_2N_R2_R1.fastq.gz


In [28]:
for bfa in ['dBFA2', 'hBFA2', 'hBFA1']:
    td = lib_to_file[lib_to_file['Library'].apply(lambda l: bfa in l)]
    print(bfa, len(td))
    td.to_csv('assay_files/'+bfa+'_assay.csv', index=False)

dBFA2 194
hBFA2 186
hBFA1 89


In [29]:
lt = lib_to_file[lib_to_file['Library'].apply(lambda l: 'BFA' not in l and 'Ancestor' not in l and 'Subpool' not in l)]
env_reps = sorted(set([i.split('-Time')[0] for i in lt['Library']]))
for er in env_reps:
    td = lt[lt['Library'].apply(lambda l: l.split('-Time')[0]==er)]
    #print(er, len(td))
    td.to_csv('assay_files/LT_'+er+'_assay.csv', index=False)

In [30]:
file_dict = dict()
all_assay_files = glob('assay_files/*.csv')
for f in all_assay_files:
    for j, row in pd.read_csv(f).iterrows():
        for fname in row['R1_files'].split(';'):
            assert fname not in file_dict
            file_dict[fname] = [row['Library'], f.split('/')[-1]]

In [31]:
len(r1_files), len(file_dict), len(set(file_dict.keys()).intersection(set(r1_files)))

(793, 793, 793)

In [32]:
mat = []
for f in all_assay_files:
    td = pd.read_csv(f)
    td['Assay'] = f.split('/')[-1].split('_assay')[0]
    mat.append(td)
    
def get_r2(r1):
    r2s = []
    for f in r1.split(';'):
        r2 = ''
        if 'R1.fastq.gz' in f:
            r2_tmp = f.replace('R1.fastq.gz', 'R2.fastq.gz')
            if r2_tmp in list(d['Filename']):
                r2 = r2_tmp
        r2s.append(r2)
    return ';'.join(r2s)
    
assay_info = pd.concat(mat)
assay_info['R2_files'] = assay_info['R1_files'].apply(get_r2)
assay_info[['Assay', 'Library', 'R1_files', 'R2_files']].to_csv('Files_by_assay.csv', index=False)

In [33]:
assay_info[['Assay']].drop_duplicates().to_csv('All_assays.txt', index=False, header=None)