In [None]:
import numpy as np
# Count matrix describing the motif: one row per nucleotide and one column per
# motif position. build_positive_sequences normalizes each column to sum to 1 and
# uses it as the sampling probabilities over ['A', 'C', 'G', 'T'], so the rows
# are in A, C, G, T order.
motif = np.array([[   0,   2, 104, 104,   1,   2, 103, 102,   0,   0,  99, 105,   0,   0, 100, 102,   5,   3],
                  [   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   4,   0,   0,   2,   3,   0,   0,   3],
                  [ 105, 103,   1,   1, 104, 102,   2,   3, 104, 103,   2,   0, 105, 103,   0,   2,  97,  97],
                  [   0,   0,   0,   0,   0,   1,   0,   0,   1,   0,   0,   0,   0,   0,   2,   1,   3,   2]])

# Number of characters in every generated sequence; read by both
# build_positive_sequences and build_negative_sequences.
sequence_length = 30

def build_positive_sequences(number_positive_examples_=5000):
    """Sample sequences that embed the motif inside uniform background.

    The module-level ``motif`` count matrix is flanked with columns of ones
    (equal weight for every nucleotide) until it is ``sequence_length``
    columns wide. Each column is then normalized into a probability
    distribution over ``['A', 'C', 'G', 'T']`` and sampled independently.

    Args:
        number_positive_examples_: Number of sequences to generate.

    Returns:
        A list of ``number_positive_examples_`` strings, each of length
        ``sequence_length``.

    Raises:
        ValueError: If the motif has more columns than ``sequence_length``.
    """
    motif_length = motif.shape[1]
    padding = sequence_length - motif_length
    if padding < 0:
        raise ValueError(
            'motif length ({}) exceeds sequence_length ({})'.format(
                motif_length, sequence_length))

    # Split the background so the padded matrix is always exactly
    # sequence_length columns wide. Padding both sides with padding // 2 loses
    # a column when the padding is odd; the leftover column goes on the right.
    left_padding = padding // 2
    right_padding = padding - left_padding
    motif_plus_background = np.hstack([np.ones((4, left_padding)),
                                       motif,
                                       np.ones((4, right_padding))])

    # One draw of size number_positive_examples_ per position, giving an array
    # of shape (sequence_length, n); transposing yields one row per sequence.
    positive_examples = np.array([
        np.random.choice(
            ['A', 'C', 'G', 'T'],
            size=number_positive_examples_,
            p=motif_plus_background[:, position]
            / float(np.sum(motif_plus_background[:, position])))
        for position in range(sequence_length)
    ]).transpose()

    return [''.join(positive_example) for positive_example in positive_examples]

def build_negative_sequences(number_negative_examples_=995000):
    """Sample background-only sequences with equiprobable nucleotides.

    Every position of every sequence is drawn independently from
    ``['A', 'C', 'G', 'T']`` with probability 1/4 each.

    Args:
        number_negative_examples_: Number of sequences to generate.

    Returns:
        A list of ``number_negative_examples_`` strings, each of length
        ``sequence_length`` (module-level setting).
    """
    alphabet = ['A', 'C', 'G', 'T']
    uniform_probabilities = np.array([1, 1, 1, 1]) / 4.0

    # Draw all examples for one position at a time; each draw is one row.
    draws_by_position = []
    for _ in range(sequence_length):
        draws_by_position.append(
            np.random.choice(alphabet,
                             size=number_negative_examples_,
                             p=uniform_probabilities))

    # Flip to one row per sequence, then glue each row's letters together.
    letters_by_sequence = np.array(draws_by_position).transpose()
    return [''.join(letters) for letters in letters_by_sequence]

import tensorflow_probability as tfp 

tfd = tfp.distributions 

import pandas as pd
from collections import OrderedDict 

def build_data_frames(rate_vector_):
    """Build shuffled test/dev/train data frames of synthetic examples.

    Generates 5000 positive and 5000 negative sequences, draws one
    negative-binomial "observed depth" per sequence (a different ``probs``
    for each class), assembles everything into a single data frame, assigns
    random positions, shuffles the rows and splits them.

    Args:
        rate_vector_: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A tuple ``(df_test, df_dev, df_train)``. The test and dev frames each
        hold ``min(1000, int(0.1 * total_rows))`` rows; the train frame holds
        the rest. Columns: ``sequence``, ``class`` (1 = positive,
        0 = negative), ``distribution``, ``observed_depth``, ``position``.
    """
    sequences_positive_ = build_positive_sequences(5000)
    sequences_negative_ = build_negative_sequences(5000)

    distribution_positive = tfd.NegativeBinomial(total_count=1000, probs=0.05)
    distribution_negative = tfd.NegativeBinomial(total_count=1000, probs=0.10)

    # With scalar parameters, sample(n) returns a rank-1 result of length n, so
    # indexing it with [:, 0] fails. Flatten to a 1-D NumPy array instead.
    # NOTE(review): np.asarray assumes eager execution (samples convertible to
    # NumPy) - confirm against the TensorFlow mode this notebook runs in.
    observed_depths_positive_ = np.asarray(
        distribution_positive.sample(len(sequences_positive_))).reshape(-1)
    observed_depths_negative_ = np.asarray(
        distribution_negative.sample(len(sequences_negative_))).reshape(-1)

    df = pd.DataFrame(OrderedDict([
        ('sequence', np.concatenate((sequences_positive_, sequences_negative_))),
        ('class', [1]*len(sequences_positive_) + [0]*len(sequences_negative_)),
        ('distribution',
         [distribution_positive]*len(sequences_positive_)
         + [distribution_negative]*len(sequences_negative_)),
        ('observed_depth',
         np.concatenate((observed_depths_positive_, observed_depths_negative_)))
    ]))

    # add fake genomic positions
    df['position'] = np.random.permutation(len(df))

    # shuffle data
    df = df.sample(frac=1)

    # split data
    test_dev_sets_size = min(1000, int(0.1*len(df)))

    # make copies to avoid settingwithcopywarning: https://www.dataquest.io/blog/settingwithcopywarning
    df_test = df[:test_dev_sets_size].copy()
    df_dev = df[test_dev_sets_size:2*test_dev_sets_size].copy()
    df_train = df[2*test_dev_sets_size:].copy()

    return df_test, df_dev, df_train
