In [25]:
import pandas as pd
from snakemake.io import expand
import yaml
import numpy as np
from utils import *
from sm_utils import *
from bc_utils import *

In [26]:
config_file = '../configs/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [75]:
# variables to change (again these could go in
# a future analysis spec)
# config_tsv = 'configs/test_2.tsv'
# config_tsv = 'configs/test_3.tsv'
# config_tsv = '../configs/test_4.tsv'
config_tsv = '../configs/test_fastqs_for_exp_spec.tsv'
subpool_tsv = '../configs/subpool_metadata.tsv'
sample_csv = '../configs/sample_metadata.csv'
kit = 'WT_mega'
chemistry = 'v2'
first_min_counts = 500

# read in config / analysis spec
df = parse_config(config_tsv)
bc_df = get_bc1_matches(kit, chemistry)
sample_df = parse_sample_df(sample_csv)
subpool_df = pd.read_csv(subpool_tsv, sep='\t')
subpool_df.columns = [c.lower() for c in subpool_df.columns] 


In [76]:
subpool_df.head()

Unnamed: 0,plate,subpool,count,selection,sublibrary index,i7_subpool_barcode,i5_subpool_barcode
0,igvf_010,Sublibrary_2,67000,EX,2,ACTTGA,
1,igvf_010,Sublibrary_3,67000,NO,3,GATCAG,
2,igvf_010,Sublibrary_4,67000,NO,4,TAGCTT,
3,igvf_010,Sublibrary_5,67000,NO,5,ATGTCA,
4,igvf_010,Sublibrary_6,67000,NO,6,CTTGTA,


In [77]:
def make_exp_spec_from_sample_df(sample_df, experiment, ofile):
    """
    Parameters:
        sample_df (pandas DataFrame): Output from parse_sample_df; derived from
            the Google Sheets thingie
        experiment (str): Name of experiment. Will be used to subset the sample_df
        ofile (str): Path to output .yml file
    """
    exp_spec = dict()
    exp_spec['samples'] = dict()
    
    # subset on correct experiment
    sample_df = sample_df.loc[sample_df.plate==experiment].copy(deep=True)
    
    # make sure that we only have one kit / chemistry for this experiment
    assert len(sample_df['Protocol'].unique().tolist()) == 1
    assert len(sample_df['Chemistry'].unique().tolist()) == 1
    
    # kit + chemistry
    exp_spec['protocol'] = sample_df['Protocol'].tolist()[0]
    exp_spec['chemistry'] = sample_df['Chemistry'].tolist()[0]
    
    # list of samples for each row / col well
    def make_alias_list(x, alias_cols):
        """
        Turn the alias sample columns into a list that's one column
        """
        aliases = [a for a in x[alias_cols] if not str(a)=='nan']
        return aliases    

    alias_cols = ['alias_tissue1', 'alias_tissue2', 'alias_tissue3', 'alias_tissue4']
    sample_df['well_row'] = sample_df.bc1_well.str.slice(0,1)
    sample_df['well_col'] = sample_df.bc1_well.str.slice(1).astype(int)
    sample_df = sample_df[alias_cols+['well_row', 'well_col']]
    sample_df['samples'] = sample_df.apply(lambda x: make_alias_list(x, alias_cols), axis=1)
    sample_df.drop(alias_cols, axis=1, inplace=True)

    for row in sample_df.well_row.unique().tolist():
        temp = sample_df.loc[sample_df.well_row==row].copy(deep=True)
        for ind, entry in temp.iterrows():
            if row not in exp_spec['samples'].keys():
                exp_spec['samples'][row] = dict()
            exp_spec['samples'][row][entry.well_col] = entry['samples']

    yml_str = yaml.dump(exp_spec)
    with open(ofile, 'w') as o:
        o.write(yml_str)

In [78]:
# this works for the samples and exp overview
sample_df = parse_sample_df(sample_csv)
# make_exp_spec_from_sample_df(sample_df, 'igvf_010', 'igvf_010_exp_spec.yml')

In [79]:
df = df.merge(subpool_df, how='left', on=['subpool', 'plate'])

In [80]:
df.head()

Unnamed: 0,fastq,fastq_r2,subpool,plate,lane,run,platform,notes,seqspec,path,path2,r2_fastq,count,selection,sublibrary index,i7_subpool_barcode,i5_subpool_barcode
0,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,igvf_010,L001,1,nova,,igvf_010/nova/seqspec.yml,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,
1,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,igvf_010,L002,1,nova,,igvf_010/nova/seqspec.yml,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,
2,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,igvf_010,L003,1,nova,,igvf_010/nova/seqspec.yml,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,
3,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,igvf_010,L004,1,nova,,igvf_010/nova/seqspec.yml,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,
4,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Subli...,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Subli...,Sublibrary_2,igvf_010,L001,2,nova,,igvf_010/nova/seqspec.yml,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Subli...,67000,EX,2,ACTTGA,


In [81]:
# add some extra columns
plat_dict = {'prom': 'ONT PromethION',
             'grid': 'ONT GridION',
             'nova': 'Illumina NovaSeq',
             'next': 'Illumina NextSeq'}
plat_to_seq_type_dict = {'prom': 'long_read',
                         'grid': 'long_read',
                         'nova': 'short_read',
                         'next': 'short_read'}
df['long_or_short'] = df.platform.map(plat_to_seq_type_dict)
df['platform_hr'] = df.platform.map(plat_dict)

In [117]:
# subpool dict init
sp_dict = {}


# add an empty dict for each sublib
gb_cols = ['subpool', 'count', 'i7_subpool_barcode', 'i5_subpool_barcode', 'selection']
temp = df[gb_cols].drop_duplicates()
temp.set_index('subpool', inplace=True)
sp_dict['subpool'] = temp.to_dict('index')
sp_dict

{'subpool': {'Sublibrary_2': {'count': 67000,
   'i7_subpool_barcode': 'ACTTGA',
   'i5_subpool_barcode': nan,
   'selection': 'EX'},
  'Sublibrary_3': {'count': 67000,
   'i7_subpool_barcode': 'GATCAG',
   'i5_subpool_barcode': nan,
   'selection': 'NO'}}}

In [83]:
def check_and_add_entry(d, key, item=None):
    if key not in d.keys():
        d[key] = {}
    if item:
        d[key] = item
    return d
    

dict_keys(['Sublibrary_2', 'Sublibrary_3'])

In [123]:
# fastq dict
gb_cols = ['subpool', 'long_or_short', 'platform_hr', 'run', 'seqspec']
gb_fastq_cols = gb_cols+['fastq', 'fastq_r2']
print(len(gb_cols))
print(len(gb_fastq_cols))

temp = df[gb_fastq_cols].groupby(gb_cols).agg({'fastq':list,
                                               'fastq_r2':list}).reset_index()
print(temp.head())
# temp = temp.to_dict()


# subpool
for sp in sp_dict['subpool'].keys():
    temp0 = temp.loc[temp.subpool==sp].copy(deep=True)

    # short or long
    for sl in temp0.long_or_short.unique().tolist():
        sp_dict['subpool'][sp] = check_and_add_entry(sp_dict['subpool'][sp], sl)
        temp2 = temp0.loc[temp0.long_or_short==sl].copy(deep=True)

        # platforms
        for p in temp2.platform_hr.unique().tolist():
            sp_dict['subpool'][sp][sl] = check_and_add_entry(sp_dict['subpool'][sp][sl], p)
            temp3 = temp2.loc[temp2.platform_hr==p].copy(deep=True)
            seqspec = temp3.seqspec.unique().tolist()
            assert len(seqspec)==1
            seqspec = seqspec[0]
            sp_dict['subpool'][sp][sl][p]['seqspec'] = seqspec
            sp_dict['subpool'][sp][sl][p] = check_and_add_entry(sp_dict['subpool'][sp][sl][p], 'run')

            # runs
            for r in temp3.run.unique().tolist():
                sp_dict['subpool'][sp][sl][p]['run'] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'], r)
                temp4 = temp3.loc[temp3.run==r].copy(deep=True)
                assert len(temp4.index) == 1
                fastqs = temp4.fastq.tolist()[0]
                fastq_r2s = temp4.fastq_r2.tolist()[0]
                sp_dict['subpool'][sp][sl][p]['run'][r] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'][r], 'fastqs')
                sp_dict['subpool'][sp][sl][p]['run'][r] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'][r], 'fastq_r2s')
                sp_dict['subpool'][sp][sl][p]['run'][r]['fastqs'] = fastqs
                sp_dict['subpool'][sp][sl][p]['run'][r]['fastq_r2s'] = fastq_r2s

yml_str = yaml.dump(sp_dict)
with open('subpool_spec.yml', 'w') as o:
    o.write(yml_str)

5
7
        subpool long_or_short       platform_hr  run  \
0  Sublibrary_2     long_read       ONT GridION    1   
1  Sublibrary_2     long_read    ONT PromethION    1   
2  Sublibrary_2    short_read  Illumina NextSeq    1   
3  Sublibrary_2    short_read  Illumina NextSeq    2   
4  Sublibrary_2    short_read  Illumina NovaSeq    1   

                     seqspec  \
0  igvf_010/grid/seqspec.yml   
1  igvf_010/prom/seqspec.yml   
2  igvf_010/next/seqspec.yml   
3  igvf_010/next/seqspec.yml   
4  igvf_010/nova/seqspec.yml   

                                               fastq  \
0  [/dfs7/samlab/seyedam/IGVF/igvf_010/grid/igvf_...   
1  [/dfs7/samlab/seyedam/IGVF/igvf_010/prom/igvf_...   
2  [/dfs7/samlab/seyedam/IGVF/igvf_010/next1/Subl...   
3  [/dfs7/samlab/seyedam/IGVF/igvf_010/next2/Subl...   
4  [/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subl...   

                                            fastq_r2  
0                                              [nan]  
1                  