In [1]:
import numpy as np, seaborn as sns, matplotlib.pyplot as plt, pandas as pd

import sys
sys.path.append('/ahg/regevdata/projects/CRISPR-libraries/prj2/evolution/abe8e/src/')
import _config

Using data folder:
 /ahg/regevdata/projects/CRISPR-libraries/prj2/evolution/abe8e/data/


## global functions

In [2]:
def recursive_generate_option_combos(options, recurse_idx):
    """Enumerate every combination of values for the first ``recurse_idx + 1`` options.

    The name is kept for backward compatibility; the implementation is
    iterative (``itertools.product``), so the number of option keys is not
    limited by the interpreter's recursion depth.

    Args:
        options: Mapping from option name to an iterable of candidate values.
            Keys are taken in the mapping's iteration order.
        recurse_idx: Zero-based index of the last option key to include.
            Pass ``len(options) - 1`` to get the full grid.

    Returns:
        A list of lists. Each inner list holds one value per included option,
        in key order. The last included option varies fastest.

    Raises:
        IndexError: If ``recurse_idx`` does not index an existing option key
            (this includes any call with an empty ``options``).
    """
    from itertools import product

    keys = list(options.keys())
    if not 0 <= recurse_idx < len(keys):
        raise IndexError(
            f'recurse_idx {recurse_idx} is out of range for {len(keys)} option(s)'
        )

    # product() iterates the right-most iterable fastest, which matches the
    # ordering produced by appending each option's values to existing combos.
    value_lists = [options[key] for key in keys[:recurse_idx + 1]]
    return [list(combo) for combo in product(*value_lists)]



In [3]:
def get_name(row):
    """Render one row of option values as a ``--``-separated name fragment.

    Args:
        row: Object exposing ``.index`` (the option labels) and iterating over
            the matching values, e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis='columns')``.

    Returns:
        A string of ``label-value`` pieces joined by ``--``, in index order.
    """
    pieces = []
    for label, value in zip(row.index, row):
        pieces.append(f'{label}-{value}')
    return '--'.join(pieces)



In [4]:
def create_job_table(options, name):
    """Build the full option grid for a datagroup and save it as a CSV job table.

    Args:
        options: Mapping from option name to the list of values to sweep.
            One job row is created per combination of values.
        name: Datagroup name. Used as the prefix of every dataset id and in
            the output file name ``datagroup_<name>.csv``.

    Returns:
        A DataFrame with one column per option plus a ``dataset`` column
        holding ``<name>--<opt>-<val>--<opt>-<val>...`` for that row.

    Side effects:
        Writes the table to ``_config.DATA_DIR`` (the row index is written as
        the first, unnamed CSV column) and prints the number of jobs.
    """
    combos = recursive_generate_option_combos(options, len(options) - 1)
    df = pd.DataFrame(combos, columns=list(options.keys()))

    # Self-describing dataset id: datagroup name followed by every option/value pair.
    df['dataset'] = f'{name}--' + df.apply(get_name, axis='columns')

    df.to_csv(_config.DATA_DIR + f'datagroup_{name}.csv')
    print(f'Generated {len(df)} dataset jobs')
    return df



## generate full tables (too many to add random seeds to)

Usage: Pass the datagroup name to data_multi.py to generate the datasets. Do not pass these datagroup names to gen_modeling_exp.py.

In [48]:
# Datagroup 'simple': full-factorial grid over the options below.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2, 3],
    threshold=[5, 8, 10],
    read_len=[1, 50, 75, 100, 150, 200, 250],
    min_gt_frequency=[0, 0.0001, 0.001, 0.01],
    proposal_type=['smart'],
)
name = 'simple'

df = create_job_table(options, name)
df.head()

Generated 168 dataset jobs


In [6]:
# Datagroup 'simple_p1': same grid shape as 'simple' but with pace_num fixed to 1.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[1],
    threshold=[5, 8, 10],
    read_len=[1, 50, 75, 100, 150, 200, 250],
    min_gt_frequency=[0, 0.0001, 0.001, 0.01],
    proposal_type=['smart'],
)
name = 'simple_p1'

df = create_job_table(options, name)
df.head()

Generated 84 dataset jobs


Unnamed: 0,pace_num,threshold,read_len,min_gt_frequency,proposal_type,dataset
0,1,5,1,0.0,smart,simple_p1--pace_num-1--threshold-5--read_len-1...
1,1,5,1,0.0001,smart,simple_p1--pace_num-1--threshold-5--read_len-1...
2,1,5,1,0.001,smart,simple_p1--pace_num-1--threshold-5--read_len-1...
3,1,5,1,0.01,smart,simple_p1--pace_num-1--threshold-5--read_len-1...
4,1,5,50,0.0,smart,simple_p1--pace_num-1--threshold-5--read_len-5...


In [5]:
# Datagroup 'simple_rl': only read_len is swept; every other option has a single value.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2],
    threshold=[5],
    read_len=[1, 25, 50, 75, 100, 150, 200, 250, 300, 350, 400],
    min_gt_frequency=[0],
    proposal_type=['smart'],
)
name = 'simple_rl'

df = create_job_table(options, name)
df.head()

Generated 11 dataset jobs


Unnamed: 0,pace_num,threshold,read_len,min_gt_frequency,proposal_type,dataset
0,2,5,1,0,smart,simple_rl--pace_num-2--threshold-5--read_len-1...
1,2,5,25,0,smart,simple_rl--pace_num-2--threshold-5--read_len-2...
2,2,5,50,0,smart,simple_rl--pace_num-2--threshold-5--read_len-5...
3,2,5,75,0,smart,simple_rl--pace_num-2--threshold-5--read_len-7...
4,2,5,100,0,smart,simple_rl--pace_num-2--threshold-5--read_len-1...


In [52]:
# Datagroup 'varynoise': full grid that additionally sweeps the 'noise' option.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2, 3],
    threshold=[5, 8, 10],
    min_gt_frequency=[0, 0.0001, 0.001, 0.01],
    read_len=[1, 150],
    noise=[
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
        0.125, 0.15, 0.175, 0.2, 0.225, 0.25, 0.275, 0.3,
        0,
    ],
    proposal_type=['smart'],
)
name = 'varynoise'

df = create_job_table(options, name)
df.head()

Generated 912 dataset jobs


Unnamed: 0,pace_num,threshold,min_gt_frequency,read_len,noise,proposal_type,dataset
0,2,5,0.0,1,0.01,smart,varynoise--pace_num-2--threshold-5--min_gt_fre...
1,2,5,0.0,1,0.02,smart,varynoise--pace_num-2--threshold-5--min_gt_fre...
2,2,5,0.0,1,0.03,smart,varynoise--pace_num-2--threshold-5--min_gt_fre...
3,2,5,0.0,1,0.04,smart,varynoise--pace_num-2--threshold-5--min_gt_fre...
4,2,5,0.0,1,0.05,smart,varynoise--pace_num-2--threshold-5--min_gt_fre...


In [53]:
# Datagroup 'varyproposals': full grid that sweeps proposal_type over 'smart'
# and the 'x<N>' variants.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2, 3],
    threshold=[5, 8, 10],
    min_gt_frequency=[0, 0.0001, 0.001, 0.01],
    read_len=[1, 150],
    proposal_type=['smart'] + [
        f'x{suffix}'
        for suffix in (2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100)
    ],
)
name = 'varyproposals'

df = create_job_table(options, name)
df.head()

Generated 720 dataset jobs


Unnamed: 0,pace_num,threshold,min_gt_frequency,read_len,proposal_type,dataset
0,2,5,0.0,1,smart,varyproposals--pace_num-2--threshold-5--min_gt...
1,2,5,0.0,1,x2,varyproposals--pace_num-2--threshold-5--min_gt...
2,2,5,0.0,1,x3,varyproposals--pace_num-2--threshold-5--min_gt...
3,2,5,0.0,1,x4,varyproposals--pace_num-2--threshold-5--min_gt...
4,2,5,0.0,1,x5,varyproposals--pace_num-2--threshold-5--min_gt...


## generate subset of tables (small enough to add random seeds)

Usage: Choose a new datagroup name and specify a subset of one of the datagroups above. Pass the new datagroup name to gen_modeling_exp.py to augment it, then pass the resulting modelexp name to the modeling script fitness_from_reads_pt_multi.py.

In [7]:
# Datagroup 'varynoise-mgt0-rl1': subset of the 'varynoise' grid with
# min_gt_frequency restricted to [0] (full grid: [0, 0.0001, 0.001, 0.01])
# and read_len restricted to [1] (full grid: [1, 150]).
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2, 3],
    threshold=[5, 8, 10],
    min_gt_frequency=[0],
    read_len=[1],
    noise=[
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
        0.125, 0.15, 0.175, 0.2, 0.225, 0.25, 0.275, 0.3,
        0,
    ],
    proposal_type=['smart'],
)
name = 'varynoise-mgt0-rl1'

df = create_job_table(options, name)
df.head()

Generated 114 dataset jobs


Unnamed: 0,pace_num,threshold,min_gt_frequency,read_len,noise,proposal_type,dataset
0,2,5,0,1,0.01,smart,varynoise-mgt0-rl1--pace_num-2--threshold-5--m...
1,2,5,0,1,0.02,smart,varynoise-mgt0-rl1--pace_num-2--threshold-5--m...
2,2,5,0,1,0.03,smart,varynoise-mgt0-rl1--pace_num-2--threshold-5--m...
3,2,5,0,1,0.04,smart,varynoise-mgt0-rl1--pace_num-2--threshold-5--m...
4,2,5,0,1,0.05,smart,varynoise-mgt0-rl1--pace_num-2--threshold-5--m...


In [11]:
# Datagroup 'varynoisev2_p2': one pace_num/threshold setting, swept over the
# noise levels below and 50 'noiserep' values (0-49).
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2],
    threshold=[5],
    min_gt_frequency=[0],
    read_len=[1],
    noise=[
        0,
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
        0.125, 0.15, 0.175, 0.2, 0.225, 0.25,
    ],
    proposal_type=['smart'],
    noiserep=[*range(50)],
)
name = 'varynoisev2_p2'

df = create_job_table(options, name)
df.head()

Generated 850 dataset jobs


Unnamed: 0,pace_num,threshold,min_gt_frequency,read_len,noise,proposal_type,noiserep,dataset
0,2,5,0,1,0.0,smart,0,varynoisev2_p2--pace_num-2--threshold-5--min_g...
1,2,5,0,1,0.0,smart,1,varynoisev2_p2--pace_num-2--threshold-5--min_g...
2,2,5,0,1,0.0,smart,2,varynoisev2_p2--pace_num-2--threshold-5--min_g...
3,2,5,0,1,0.0,smart,3,varynoisev2_p2--pace_num-2--threshold-5--min_g...
4,2,5,0,1,0.0,smart,4,varynoisev2_p2--pace_num-2--threshold-5--min_g...


In [5]:
# Datagroup 'varyproposalsv2': only proposal_type is swept ('smart' plus the
# 'x<N>' variants); every other option has a single value.
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[2],
    threshold=[5],
    min_gt_frequency=[0],
    read_len=[1],
    proposal_type=['smart'] + [
        f'x{suffix}'
        for suffix in (2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100)
    ],
)
name = 'varyproposalsv2'

df = create_job_table(options, name)
df.head()

Generated 15 dataset jobs


Unnamed: 0,pace_num,threshold,min_gt_frequency,read_len,proposal_type,dataset
0,2,5,0,1,smart,varyproposalsv2--pace_num-2--threshold-5--min_...
1,2,5,0,1,x2,varyproposalsv2--pace_num-2--threshold-5--min_...
2,2,5,0,1,x3,varyproposalsv2--pace_num-2--threshold-5--min_...
3,2,5,0,1,x4,varyproposalsv2--pace_num-2--threshold-5--min_...
4,2,5,0,1,x5,varyproposalsv2--pace_num-2--threshold-5--min_...


In [5]:
# Datagroup 'simplev2': a single job (every option has exactly one value).
# Usage: option keys must match param keys in data_multi.py
options = dict(
    pace_num=[1],
    threshold=[5],
    read_len=[1],
    min_gt_frequency=[0],
    proposal_type=['smart'],
)
name = 'simplev2'

df = create_job_table(options, name)
df.head()

Generated 1 dataset jobs


Unnamed: 0,pace_num,threshold,read_len,min_gt_frequency,proposal_type,dataset
0,1,5,1,0,smart,simplev2--pace_num-1--threshold-5--read_len-1-...
