In [5]:
import pandas as pd
from snakemake.io import expand
import yaml
from utils import *
from sm_utils import *
from bc_utils import *

In [6]:
config_file = '../configs/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

## 231006 tissue level agg

In [7]:
config_tsv = '../configs/test_2.tsv'
sample_csv = '../configs/sample_metadata.csv'

In [17]:
df = parse_config(config_tsv)
sample_df = pd.read_csv(sample_csv)
wc = {'tissue': 'PBMC'}

In [30]:
def get_tissue_adatas(df, sample_df, wc, cfg_entry):
    
    # limit to input tissue
    temp_sample = sample_df.copy(deep=True)
    print(len(temp_sample.index))
    temp_sample = temp_sample.loc[temp_sample.Tissue==wc['tissue']]
    print(len(temp_sample.index))
    
    # merge this stuff in with the fastq df
    fastq_df = df.copy(deep=True)
    temp = fastq_df.merge(sample_df, on='plate', how='inner')
    
    # get the plate / subpool / sample info for this tissue
    plates = temp.plate().tolist()
    subpools = temp.subpool.tolist()
    samples = temp.tolist()


In [31]:
get_tissue_adatas(df, sample_df, wc, 'test')

1056
96
['igvf_013']
1
['Sublibrary_2', 'Sublibrary_3']
2
['016_B6J_10F_20', '017_B6J_10M_20', '018_B6J_10F_20', '019_B6J_10M_20', '020_B6J_10F_20', '021_B6J_10M_20', '024_B6J_10F_20', '025_B6J_10M_20', '066_NODJ_10F_20', '067_NODJ_10M_20', '070_NODJ_10F_20', '069_NODJ_10M_20', '072_NODJ_10F_20', '073_NODJ_10M_20', '074_NODJ_10F_20', '075_NODJ_10M_20', '026_AJ_10F_20', '029_AJ_10M_20', '028_AJ_10F_20', '031_AJ_10M_20', '032_AJ_10F_20', '033_AJ_10M_20', '034_AJ_10F_20', '035_AJ_10M_20', '076_PWKJ_10F_20', '077_PWKJ_10M_20', '078_PWKJ_10F_20', '079_PWKJ_10M_20', '080_PWKJ_10F_20', '083_PWKJ_10M_20', '082_PWKJ_10F_20', '085_PWKJ_10M_20', '036_129S1J_10F_20', '037_129S1J_10M_20', '038_129S1J_10F_20', '041_129S1J_10M_20', '040_129S1J_10F_20', '043_129S1J_10M_20', '044_129S1J_10F_20', '045_129S1J_10M_20', '086_CASTJ_10F_20', '087_CASTJ_10M_20', '090_CASTJ_10F_20', '091_CASTJ_10M_20', '092_CASTJ_10F_20', '093_CASTJ_10M_20', '094_CASTJ_10F_20', '095_CASTJ_10M_20', '058_WSBJ_10F_20', '057_WSBJ_

## 231006 more bc stuff

In [7]:
kit = 'WT_mega'
chemistry = 'v2'
bc = 3

In [8]:
df = get_bcs(bc, kit, chemistry)

In [9]:
df

Unnamed: 0,bc3,well
0,AACGTGAT,A1
1,AAACATCG,A2
2,ATGCCTAA,A3
3,AGTGGTCA,A4
4,ACCACTGT,A5
...,...,...
91,GAACAGGC,H8
92,GACAGTGC,H9
93,GAGTTAGC,H10
94,GATGAATC,H11


In [3]:
kit = 'WT_mega'
chemistry = 'v2'

In [6]:
bc_df = get_bc1_matches(kit, chemistry)

In [7]:
bc_df

Unnamed: 0,bc1_dt,well,bc1_randhex
0,CATTCCTA,A1,CATCATCC
1,CTTCATCA,A2,CTGCTTTG
2,CCTATATC,A3,CTAAGGGA
3,ACATTTAC,A4,GCTTATAG
4,ACTTAGCT,A5,TCTGATCC
...,...,...,...
91,ATTAGGCT,H8,GTGTGTGT
92,GCCTTTCA,H9,TATGCTTC
93,ATTCTAGG,H10,ATGGTGTT
94,CCTTACAT,H11,GAATAATG


In [68]:
fname = '../configs/test_2.tsv'

def parse_config(fname):
    df = pd.read_csv(fname, sep='\t')
    df['path'] = df.fastq.str.rsplit('/', n=2, expand=True)[0]+'/'
    df['path2'] = df.fastq.str.rsplit('/', n=1, expand=True)[0]+'/'
    df['r2_fastq'] = df.fastq.str.replace('_R1_', '_R2_')
    return df

In [76]:
def get_subpool_fastqs(wc, df, how, read=None):
    """
    Get list of fastqs from the same subpool. Can
    either return as a Python list of strings or a
    formatted string list read to pass to a shell cmd.

    Parameters:
        how (str): {'str', 'list'}. 'list' will return
            Python list of str var. 'str' will return
            Python string
    """
    temp = df.copy(deep=True)
    temp = temp.loc[(temp.plate==wc['plate'])&\
                    (temp.subpool==wc['subpool'])]

    if how == 'list':
        reads = [read for i in range(len(temp.index))]
        return expand(expand(config['raw']['fastq'],
                        zip,
                        sample=temp['sample'].tolist(),
                        lane=temp['lane'].tolist(),
                        allow_missing=True),
                        read=read,
                        plate=wc['plate'],
                        subpool=wc['subpool'])

    elif how == 'str':
        r1s = expand(expand(config['raw']['fastq'],
                        zip,
                        sample=temp['sample'].tolist(),
                        lane=temp['lane'].tolist(),
                        allow_missing=True),
                        read='R1',
                        plate=wc['plate'],
                        subpool=wc['subpool'])
        r2s = expand(expand(config['raw']['fastq'],
                        zip,
                        sample=temp['sample'].tolist(),
                        lane=temp['lane'].tolist(),
                        allow_missing=True),
                        read='R2',
                        plate=wc['plate'],
                        subpool=wc['subpool'])
        fastq_str = ''
        for r1, r2 in zip(r1s, r2s):
            fastq_str+=f' {r1} {r2}'
        return fastq_str

In [86]:
def get_df_info(wc, df, col):
    temp = df.copy(deep=True)
    temp = temp.loc[(temp.plate==wc['plate'])&\
                    (temp.subpool==wc['subpool'])&\
                    (temp['sample']==wc['sample'])&\
                    (temp.lane==wc['lane'])]
    assert len(temp.index) == 1
    return temp[col].values[0]

In [87]:
df = parse_config(fname)
wc = {'plate': 'igvf_013', 'subpool': 'Sublibrary_2', 'sample': 'S1', 'lane': 'L001', 'read': 'R1'}

In [88]:
get_df_info(wc, df, 'r2_fastq')

'/dfs7/samlab/seyedam/IGVF/igvf_013/nova1/Sublibrary_2_S1_L001_R2_001.fastq.gz'

In [78]:
get_subpool_fastqs(wc, df, 'list', read='R2')

['S1', 'S1', 'S1', 'S1']
['L001', 'L002', 'L003', 'L004']
R2
igvf_013
Sublibrary_2


['fastq/igvf_013_Sublibrary_2_S1_L001_R2_001.fastq.gz',
 'fastq/igvf_013_Sublibrary_2_S1_L002_R2_001.fastq.gz',
 'fastq/igvf_013_Sublibrary_2_S1_L003_R2_001.fastq.gz',
 'fastq/igvf_013_Sublibrary_2_S1_L004_R2_001.fastq.gz']

In [79]:
get_subpool_fastqs(wc, df, 'str')

['S1', 'S1', 'S1', 'S1']
['L001', 'L002', 'L003', 'L004']
None
igvf_013
Sublibrary_2


' fastq/igvf_013_Sublibrary_2_S1_L001_R1_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L001_R2_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L002_R1_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L002_R2_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L003_R1_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L003_R2_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L004_R1_001.fastq.gz fastq/igvf_013_Sublibrary_2_S1_L004_R2_001.fastq.gz'

In [23]:
df.r2_fastq.tolist()[:2]

['/dfs7/samlab/seyedam/IGVF/igvf_005/nova1/Sublibrary_2_S1_L001_R2_001.fastq.gz',
 '/dfs7/samlab/seyedam/IGVF/igvf_005/nova1/Sublibrary_2_S1_L002_R2_001.fastq.gz']