In [23]:
import pandas as pd
from snakemake.io import expand
import yaml
import numpy as np
from utils import *
from sm_utils import *
from bc_utils import *

In [24]:
# Read the shared YAML config file into a plain Python object.
config_file = '../configs/config.yml'
with open(config_file, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

In [25]:
# Inputs for this run. They are hard-coded here for now and could move
# into a future analysis spec.
# Earlier test inputs, kept for reference:
#   configs/test_2.tsv, configs/test_3.tsv, ../configs/test_4.tsv
config_tsv = '../configs/test_fastqs_for_exp_spec.tsv'
subpool_tsv = '../configs/subpool_metadata.tsv'
sample_csv = '../configs/sample_metadata.csv'
kit, chemistry = 'WT_mega', 'v2'
first_min_counts = 500

# Read in config / analysis spec using the helpers star-imported above.
df = parse_config(config_tsv)
bc_df = get_bc1_matches(kit, chemistry)
sample_df = parse_sample_df(sample_csv)

# Subpool metadata is a plain TSV; its column names are lower-cased so
# they match the lower-case merge keys ('subpool', 'plate') used later.
subpool_df = pd.read_csv(subpool_tsv, sep='\t').rename(columns=str.lower)


In [30]:
def make_exp_spec_from_sample_df(sample_df, experiment, ofile):
    """
    Build the spec for one experiment (protocol, chemistry and the sample
    aliases loaded in each barcode-1 well) and write it to a YAML file.

    Parameters:
        sample_df (pandas DataFrame): Output from parse_sample_df; derived from
            the Google Sheets sample metadata. The columns read here are
            `plate`, `Protocol`, `Chemistry`, `bc1_well` and
            `alias_tissue1` .. `alias_tissue4`. The frame is not modified.
        experiment (str): Name of experiment. Will be used to subset the
            sample_df on its `plate` column
        ofile (str): Path to output .yml file

    Returns:
        exp_spec (dict): Has the keys `protocol`, `chemistry` and `sample`,
            where `sample` maps well row letter -> well column (int) ->
            list of the non-missing sample aliases in that well.

    Raises:
        AssertionError: If the rows for `experiment` do not share exactly one
            Protocol and exactly one Chemistry (this includes the case where
            no row matches `experiment`).
    """
    alias_cols = ['alias_tissue1', 'alias_tissue2', 'alias_tissue3', 'alias_tissue4']

    # subset on correct experiment; nothing below writes to this frame, so
    # neither a copy nor in-place column edits are needed
    wells = sample_df.loc[sample_df['plate'] == experiment]

    # make sure that we only have one kit / chemistry for this experiment
    protocols = wells['Protocol'].unique().tolist()
    chemistries = wells['Chemistry'].unique().tolist()
    assert len(protocols) == 1, (
        f'Expected exactly one Protocol for experiment {experiment!r}, '
        f'found {protocols}')
    assert len(chemistries) == 1, (
        f'Expected exactly one Chemistry for experiment {experiment!r}, '
        f'found {chemistries}')

    exp_spec = dict()
    exp_spec['sample'] = dict()

    # kit + chemistry
    exp_spec['protocol'] = protocols[0]
    exp_spec['chemistry'] = chemistries[0]

    # split the well ID into its leading row character and integer column
    well_rows = wells['bc1_well'].str.slice(0, 1)
    well_cols = wells['bc1_well'].str.slice(1).astype(int)

    # list of samples for each row / col well, walking the wells in
    # DataFrame order; a later duplicate well overwrites an earlier one
    alias_rows = wells[alias_cols].itertuples(index=False, name=None)
    for well_row, well_col, aliases in zip(well_rows, well_cols, alias_rows):
        # pd.notna drops NaN as well as None / pd.NA; comparing str(a) to
        # 'nan' would let the latter two through
        samples = [alias for alias in aliases if pd.notna(alias)]
        # int() keeps the dict keys plain Python ints
        exp_spec['sample'].setdefault(well_row, dict())[int(well_col)] = samples

    yml_str = yaml.dump(exp_spec)
    with open(ofile, 'w') as o:
        o.write(yml_str)

    return exp_spec

In [31]:
# this works for the samples and exp overview
sample_df = parse_sample_df(sample_csv)
exp_spec =  make_exp_spec_from_sample_df(sample_df, 'igvf_010', 'igvf_010_exp_spec.yml')

In [32]:
# Attach the subpool metadata to df, keeping every row of df (left join).
# NOTE: df is overwritten, so re-running this cell merges subpool_df in again.
df = pd.merge(df, subpool_df, on=['subpool', 'plate'], how='left')

In [33]:
# Display the experiment spec returned by make_exp_spec_from_sample_df.
exp_spec

{'sample': {'A': {1: ['ali-mortazavi:016_B6J_10F_09_UBERON_0002113'],
   2: ['ali-mortazavi:017_B6J_10M_09_UBERON_0002113'],
   3: ['ali-mortazavi:018_B6J_10F_09_UBERON_0002113'],
   4: ['ali-mortazavi:019_B6J_10M_09_UBERON_0002113'],
   5: ['ali-mortazavi:020_B6J_10F_09_UBERON_0002113'],
   6: ['ali-mortazavi:021_B6J_10M_09_UBERON_0002113'],
   7: ['ali-mortazavi:024_B6J_10F_09_UBERON_0002113'],
   8: ['ali-mortazavi:025_B6J_10M_09_UBERON_0002113'],
   10: ['ali-mortazavi:017_B6J_10M_16_UBERON_0001388',
    'ali-mortazavi:067_NODJ_10M_16_UBERON_0001388'],
   11: ['ali-mortazavi:018_B6J_10F_16_UBERON_0001388',
    'ali-mortazavi:074_NODJ_10F_16_UBERON_0001388'],
   12: ['ali-mortazavi:019_B6J_10M_16_UBERON_0001388',
    'ali-mortazavi:069_NODJ_10M_16_UBERON_0001388'],
   9: ['ali-mortazavi:016_B6J_10F_16_UBERON_0001388',
    'ali-mortazavi:066_NODJ_10F_16_UBERON_0001388']},
  'B': {1: ['ali-mortazavi:066_NODJ_10F_09_UBERON_0002113'],
   2: ['ali-mortazavi:067_NODJ_10M_09_UBERON_0002113

In [34]:
# add some extra columns
plat_dict = {'prom': 'promethion',
             'grid': 'gridion',
             'nova': 'novaseq',
             'next': 'nextseq'}
plat_to_seq_type_dict = {'prom': 'long_read',
                         'grid': 'long_read',
                         'nova': 'short_read',
                         'next': 'short_read'}
df['long_or_short'] = df.platform.map(plat_to_seq_type_dict)
df['platform_hr'] = df.platform.map(plat_dict)

selection_dict = {'EX': 'exome'}
df['selection'] = df.selection.map(selection_dict)

In [35]:
# subpool dict init
sp_dict = {}

# add an empty dict for each sublib
gb_cols = ['subpool', 'count', 'i7_subpool_barcode', 'i5_subpool_barcode', 'selection']
temp = df[gb_cols].drop_duplicates()
temp.set_index('subpool', inplace=True)
sp_dict['subpool'] = temp.to_dict('index')

# remove nans
for key1, item1 in sp_dict.items():
    for key2, item2 in item1.items():
        del_keys = []
        for key3, item3 in item2.items():
            if pd.isna(item3):
                del_keys.append(key3)
        for k in del_keys:
            del item2[k]

In [36]:
def check_and_add_entry(d, key, item=None):
    """
    Make sure `key` is present in `d`, optionally setting its value.

    Parameters:
        d (dict): Dictionary to update in place
        key: Key to check for / add
        item: Value to store under `key`. When None (the default) an existing
            value is left alone and a missing key is initialised to an empty
            dict. Any other value, including falsy ones such as 0, '' or [],
            is stored under `key`.

    Returns:
        d (dict): The same dictionary object that was passed in
    """
    # Compare against None explicitly: a truthiness test would silently drop
    # falsy items and raises for array-like items.
    if item is not None:
        d[key] = item
    else:
        d.setdefault(key, {})
    return d

In [37]:
# fastq dict
gb_cols = ['subpool', 'long_or_short', 'platform_hr', 'run', 'seqspec']
gb_fastq_cols = gb_cols+['fastqs', 'fastq_pairs']
# print(len(gb_cols))
# print(len(gb_fastq_cols))

temp = df[gb_fastq_cols].groupby(gb_cols).agg({'fastqs':sum,
                                               'fastq_pairs':list}).reset_index()

In [38]:
# subpool
for sp in sp_dict['subpool'].keys():
    temp0 = temp.loc[temp.subpool==sp].copy(deep=True)

    # short or long
    for sl in temp0.long_or_short.unique().tolist():
        sp_dict['subpool'][sp] = check_and_add_entry(sp_dict['subpool'][sp], sl)
        temp2 = temp0.loc[temp0.long_or_short==sl].copy(deep=True)

        # platforms
        for p in temp2.platform_hr.unique().tolist():
            sp_dict['subpool'][sp][sl] = check_and_add_entry(sp_dict['subpool'][sp][sl], p)
            temp3 = temp2.loc[temp2.platform_hr==p].copy(deep=True)
            seqspec = temp3.seqspec.unique().tolist()
            assert len(seqspec)==1
            seqspec = seqspec[0]
            sp_dict['subpool'][sp][sl][p]['seqspec'] = seqspec
            sp_dict['subpool'][sp][sl][p] = check_and_add_entry(sp_dict['subpool'][sp][sl][p], 'run')

            # runs
            for r in temp3.run.unique().tolist():
                sp_dict['subpool'][sp][sl][p]['run'] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'], r)
                temp4 = temp3.loc[temp3.run==r].copy(deep=True)
                assert len(temp4.index) == 1
                fastqs = temp4.fastqs.tolist()[0]
                fastq_pairs = temp4.fastq_pairs.tolist()[0]
                # if sl == 'long_read':
                #     import pdb; pdb.set_trace()
                sp_dict['subpool'][sp][sl][p]['run'][r] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'][r], 'fastqs')
                sp_dict['subpool'][sp][sl][p]['run'][r] = check_and_add_entry(sp_dict['subpool'][sp][sl][p]['run'][r], 'fastq_pairs')
                sp_dict['subpool'][sp][sl][p]['run'][r]['fastqs'] = fastqs
                sp_dict['subpool'][sp][sl][p]['run'][r]['fastq_pairs'] = fastq_pairs

yml_str = yaml.dump(sp_dict)
with open('subpool_spec.yml', 'w') as o:
    o.write(yml_str)

In [41]:
# Add the subpool section to the experiment spec and write the combined
# spec to disk as YAML.
exp_spec['subpool'] = sp_dict['subpool']
exp_spec_yaml = yaml.dump(exp_spec)
with open('exp_spec.yml', 'w') as exp_spec_handle:
    exp_spec_handle.write(exp_spec_yaml)