In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn import preprocessing
from tqdm import tqdm
from itertools import product
from sklearn.linear_model import LinearRegression
from GibsonPrediction.modeling_utils import hash_split
from GibsonPrediction.utils import get_fold_change

%matplotlib inline

In [2]:
# Parameters for plotting model results
pd.set_option("display.max_colwidth",100)
sns.set(style="ticks", color_codes=True)
plt.rcParams['font.weight'] = 'normal'
plt.rcParams['axes.labelweight'] = 'normal'
plt.rcParams['axes.labelpad'] = 5
plt.rcParams['axes.linewidth']= 2
plt.rcParams['xtick.labelsize']= 14
plt.rcParams['ytick.labelsize']= 14
plt.rcParams['xtick.major.size'] = 6
plt.rcParams['ytick.major.size'] = 6
plt.rcParams['xtick.minor.size'] = 3
plt.rcParams['ytick.minor.size'] = 3
plt.rcParams['xtick.minor.width'] = 1
plt.rcParams['ytick.minor.width'] = 1
plt.rcParams['xtick.major.width'] = 2
plt.rcParams['ytick.major.width'] = 2
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['axes.edgecolor'] = 'black'

In [3]:
def kmer_encoding(n):
  '''
  Generate all k-mers of length N and assign a unique one-hot vector
  of length 4 ** n to each k-mer.
  
  Returns a dictionary with k-mers as keys and one-hot vectors as values.
  '''
  encoding_dict = {}
  for i, kmer in enumerate(product(['A', 'C', 'G', 'T'], repeat=n)):
    encoding = [0] * 4 ** n
    encoding[i] = 1
    encoding_dict["".join(kmer)] = encoding
  return encoding_dict


def kmer_featurize(seq, k, kmer_encoding_func):
    '''
    Generate one-hot encoded positional k-mer features for a DNA sequence.
    '''
    encoding = kmer_encoding_func(k)
    features = []

    for i in range(len(seq) - k + 1):
        features.append(encoding[seq[i:i + k]])

    return np.array(features).flatten()


def one_hot_encode(seq):
    '''
    One-hot encode a DNA sequence"
    '''
    seq = seq.upper()
    if not set(seq) <= {'A', 'T', 'G', 'C'}:
       raise ValueError(f"Sequence contains non-ACTG character: {seq}")
    one_hot_dict = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
    one_hot_seq = []
    for nt in seq:
        one_hot_seq.append(one_hot_dict[nt])
    return np.array(one_hot_seq)

def pearson_r2(x,y):
    r_value, p_value = stats.pearsonr(x, y)
    return r_value**2

def valid_predict(train_df, valid_df, model, test_seq, obs_col, output_col='pred'):
    '''Predict mean ribosome load using model and test set UTRs'''
    
    # Scale the test set mean ribosome load
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_df[obs_col].values.reshape(-1,1))
    
    # Make predictions
    predictions = model.predict(test_seq).reshape(-1,1)
    
    # Inverse scaled predicted mean ribosome load and return in a column labeled 'pred'
    df_copy = valid_df.copy()
    df_copy.loc[:,output_col] = scaler.inverse_transform(predictions)
    return df_copy

def evaluate_model(model, train_x, valid_x, train_df, valid_df):
    print(f"Train R-squared: {model.score(train_x, train_df['scaled_log2_FC'])}")
    valid = valid_predict(train_df=train_df, valid_df=valid_df, model=model, obs_col='log2_FC',test_seq=valid_x)
    r = r2(valid_df['scaled_log2_FC'], valid['pred'])
    print(f"Validation R-squared: {r}")
    c1 = (0.3, 0.45, 0.69)
    c2 = 'r'
    g = sns.JointGrid(x='log2_FC', y="pred", data=valid, space=0, xlim=(1,10), ylim=(0,10), ratio=6, height=7)
    g.plot_joint(plt.scatter,s=20, color=c1, linewidth=0.2, alpha=0.1, edgecolor='white')
    f = g.fig
    ax = f.gca()
    ax.set_yticks(np.arange(-3,3.01, 1))
    # ax.set_yticklabels(range(-3, ),size=20)
    ax.set_xticks(np.arange(-3,3.01, 1))
    # ax.set_xticklabels(range(1,11),size=20)
    ax.set_ylim(-3,3)
    ax.set_xlim(-3,3)
    g.plot_marginals(sns.kdeplot,shade=c1, **{'linewidth':2, 'color':c1})
    g.set_axis_labels('Observed MRL', 'Predicted MRL', **{'size':22})

    # g.x = n_atg['rl'].values
    # g.y = n_atg['pred'].values
    g.plot_joint(plt.scatter, s=20, linewidth=0.2, alpha=0.2, color=c2, edgecolor='white')
    g.plot_marginals(sns.kdeplot, shade=c2, **{'linewidth':2, 'color':c2})
    f = g.fig



In [4]:
one_hot_encode("A")

array([[1, 0, 0, 0]])

In [5]:
df = pd.read_pickle("../data/20230414_pipeline_run/output/counts/PCR_large_rep1_count_df.pkl")
initial_count = len(df)
print(f"Number of sequences: {initial_count}")
print(df.head())

Number of sequences: 590709
                         sequence  input_count  assembled_count
0  TATGCGCTAAATACGGCTTTGAGCGGCACA            4                1
1  TGAGTTGGTGACGTGTGGTTGACGGGGGGA           63               57
2  AGTGTGGCGGGTGGGGGTGCAGCGGGCTAG           60               45
3  CGGCAACATCGAAGTGGAATATATGGAACG           15                8
4  GTCAAATAAATTATTTGGCAACCAGTCCTT           17               24


Only keep sequences with > 10 reads in the input or > 10 reads in assembled.

In [6]:
df = df[(df['input_count'] > 10) | (df['assembled_count'] > 10)]
print(f"{initial_count - len(df)} sequences removed due to low counts")
print(df.head())

247177 sequences removed due to low counts
                         sequence  input_count  assembled_count
1  TGAGTTGGTGACGTGTGGTTGACGGGGGGA           63               57
2  AGTGTGGCGGGTGGGGGTGCAGCGGGCTAG           60               45
3  CGGCAACATCGAAGTGGAATATATGGAACG           15                8
4  GTCAAATAAATTATTTGGCAACCAGTCCTT           17               24
8  GCACCCGCAGTCGCCTACCAAGGGTCGTCG           29               18


Calculate log FC and one-hot encoding.

In [7]:
df['fold_change'] = get_fold_change(df, 1, 1)
df['log2_FC'] = np.log2(df['fold_change'])
df['one_hot_sequence'] = df['sequence'].apply(one_hot_encode)
df.head()

Unnamed: 0,sequence,input_count,assembled_count,fold_change,log2_FC,one_hot_sequence
1,TGAGTTGGTGACGTGTGGTTGACGGGGGGA,63,57,0.91095,-0.134556,"[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0..."
2,AGTGTGGCGGGTGGGGGTGCAGCGGGCTAG,60,45,0.758009,-0.399713,"[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 0..."
3,CGGCAACATCGAAGTGGAATATATGGAACG,15,8,0.565417,-0.822612,"[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0..."
4,GTCAAATAAATTATTTGGCAACCAGTCCTT,17,24,1.396092,0.481394,"[[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1..."
8,GCACCCGCAGTCGCCTACCAAGGGTCGTCG,29,18,0.636618,-0.6515,"[[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0..."


In [8]:
df['3mer_features'] = df['sequence'].apply(kmer_featurize, 3, kmer_encoding)

TypeError: Value after * must be an iterable, not function

In [25]:
import numpy as np

# example input array
arr = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]])

# function to convert a k-mer to one-hot encoding
def kmer_to_one_hot(kmer):
    print(len(kmer))
    concatenated = kmer.flatten()
    print(f"concatenated shape: {concatenated.shape}")
    index = concatenated.dot(4**np.arange(4*len(kmer) - 1, -1, -1))
    print(f"index shape: {index.shape}")
    result = np.zeros(4**(len(kmer) * 4))
    result[index] = 1
    return result

# example usage
kmer = arr[1:4]  # [1,0,0,0], [0,0,0,1], [0,0,1,0]
result = kmer_to_one_hot(kmer)
print(len(result))

3
concatenated shape: (12,)
index shape: ()
16777216


In [19]:
(4**np.arange(3*4 - 1, -1, -1)).shape

(12,)

Split into train, validation, and test sets.

In [None]:
train_df, val_df, test_df = hash_split(df, split_col='sequence', percentages=(80, 10, 10))
print(train_df.head())
split_sizes = [len(i) for i in (train_df, val_df, test_df)]
print(split_sizes)
assert sum(split_sizes) == len(df)

In [None]:
scaler = preprocessing.StandardScaler()
scaler.fit(train_df['log2_FC'].values.reshape(-1,1))
for split_df in [train_df, val_df, test_df]:
    split_df['scaled_log2_FC'] = scaler.transform(split_df['log2_FC'].values.reshape(-1,1))
print(train_df.head())

In [None]:
print(train_df['one_hot_sequence'].values)
print(val_df['one_hot_sequence'].values.shape)

# train_one_hot = train_one_hot.reshape(471961, 120)
# val_one_hot = val_one_hot.reshape(59400, 120)

In [None]:
reg = LinearRegression().fit(train_one_hot, train_df['scaled_log2_FC'])

In [None]:
evaluate_model(model=reg, train_x=train_one_hot, valid_x=val_one_hot, train_df=train_df, valid_df=val_df)

In [None]:
train_dinuc = featurize(train_df, 2, col="sequence")
val_dinuc = featurize(val_df, 2, col="sequence")

train_trinuc = featurize(train_df, 3, col="sequence")
val_trinuc = featurize(val_df, 3, col="sequence")
print(len(train_dinuc[0]) * len(train_dinuc[0][0]))
print(len(train_trinuc[0]) * len(train_trinuc[0][0]))

In [None]:
train_dinuc = train_dinuc.reshape(471961, 464)
val_dinuc = val_dinuc.reshape(59400, 464)
train_trinuc = train_trinuc.reshape(471961, 1792)
val_trinuc = val_trinuc.reshape(59400, 1792)

In [None]:
reg = LinearRegression().fit(train_trinuc, train_df['scaled_log2_FC'])

In [None]:
evaluate_model(model=reg, train_x=train_trinuc, valid_x=val_trinuc, train_df=train_df, valid_df=val_df)