In [1]:
import numpy as np
import pandas as pd
from skbio.sequence import Sequence
from sklearn.linear_model import LassoCV, LinearRegression

# Data pre-processing

In [2]:
# Load the header-less, tab-separated sequence table.  Ids are kept as strings;
# each sequence string is upper-cased and wrapped in a scikit-bio Sequence
# while the file is being parsed.
sequences = pd.read_table(
  "data/3U_sequences_final.txt",
  header=None,
  names=["id", "sequence"],
  dtype={"id": str},
  converters={"sequence": lambda raw: Sequence(raw.upper())},
)

In [3]:
# Both parameter tables share one layout: no header row, four columns, and an
# id column that must stay a string.
deg_rate_read_options = dict(
  header=None,
  names=["id", "log2_deg_rate", "log2_x0", "onset_time"],
  dtype={"id": str},
)

deg_rate_a_plus = pd.read_table("data/3U.models.3U.40A.seq1022_param.txt",
                                **deg_rate_read_options)
deg_rate_a_minus = pd.read_table("data/3U.models.3U.00A.seq1022_param.txt",
                                 **deg_rate_read_options)

# The two tables are expected to list the same ids in the same row order.
assert (deg_rate_a_plus["id"] == deg_rate_a_minus["id"]).all()

In [4]:
def count_kmers(sequences, k):
  """Tabulate k-mer counts for every sequence.

  Parameters
  ----------
  sequences : pandas.DataFrame
    Needs an "id" column and a "sequence" column whose values provide a
    `kmer_frequencies(k)` method returning a mapping of k-mer -> count.
  k : int
    k-mer length forwarded to `kmer_frequencies`.

  Returns
  -------
  pandas.DataFrame
    One row per sequence (indexed by the "id" column) and one column per
    k-mer reported for at least one sequence.  Values are integers stored
    with a sparse dtype whose fill value is 0; k-mers a sequence did not
    report are 0.
  """
  frequency_tables = [sequence.kmer_frequencies(k)
                      for sequence in sequences["sequence"]]

  # Nullable Int64 lets the k-mers a sequence lacks show up as <NA> first;
  # they are turned into 0 before the cast to plain int.
  dense_counts = (
    pd.DataFrame(data=frequency_tables, index=sequences["id"], dtype="Int64")
    .fillna(value=0)
    .astype(int)
  )

  return dense_counts.astype(pd.SparseDtype(int, fill_value=0))

In [5]:
# Keep only the sequences whose id also appears in the degradation-rate table.
has_deg_rate = sequences["id"].isin(deg_rate_a_plus["id"])
sequences_for_fit = sequences[has_deg_rate]

In [6]:
# One count table per k-mer length (k = 3, 4, 5, 6, 7), joined column-wise so
# that every sequence ends up with a single feature row.
per_length_counts = [count_kmers(sequences_for_fit, k) for k in range(3, 8)]
kmer_counts_for_fit = pd.concat(per_length_counts, axis=1)

In [7]:
def count_specific_kmers(sequences, kmers):
  """Count the occurrences of the given k-mers in every sequence.

  Occurrences are counted with overlap (a window sliding one position at a
  time), so "AAAA" contains "AAA" twice.  This matches the overlapping counts
  produced by `kmer_frequencies` in `count_kmers`; a plain `.count(kmer)` on
  the sequence would skip overlapping matches and yield features that are
  inconsistent with the ones used for fitting.

  Parameters
  ----------
  sequences : pandas.DataFrame
    Needs an "id" column and a "sequence" column; each sequence is converted
    with `str()` before counting.
  kmers : iterable
    The k-mers to count, in output column order.  May mix lengths.

  Returns
  -------
  pandas.DataFrame
    One row per sequence (indexed by the "id" column), one column per entry
    of `kmers`, integer counts stored with a sparse dtype (fill value 0).

  Raises
  ------
  ValueError
    If any k-mer is empty.
  """
  kmer_strings = [str(kmer) for kmer in kmers]
  if any(len(kmer) == 0 for kmer in kmer_strings):
    raise ValueError("k-mers must not be empty")
  kmer_lengths = {len(kmer) for kmer in kmer_strings}

  data = np.zeros((len(sequences.index), len(kmer_strings)), dtype=int)
  for sequence_index, sequence in enumerate(sequences["sequence"]):
    text = str(sequence)

    # Tally every overlapping window once per requested length instead of
    # rescanning the sequence for each individual k-mer.
    window_counts = {}
    for length in kmer_lengths:
      for start in range(len(text) - length + 1):
        window = text[start:start + length]
        window_counts[window] = window_counts.get(window, 0) + 1

    for kmer_index, kmer in enumerate(kmer_strings):
      data[sequence_index, kmer_index] = window_counts.get(kmer, 0)

  df = pd.DataFrame(data=data,
                    index=sequences["id"],
                    columns=kmers,
                    dtype=pd.SparseDtype(int, fill_value=0))

  return df

In [8]:
# kmer_counts = count_specific_kmers(sequences, kmer_counts_for_fit.columns)

# Linear Regression

In [9]:
# scikit-learn pairs the rows of X and y purely by position and ignores the
# pandas index.  The rows of kmer_counts_for_fit follow the order of the
# sequence file, whereas the degradation rates follow the order of the
# parameter file, so the targets are looked up by sequence id to guarantee
# that every feature row is paired with its own rate.
X = kmer_counts_for_fit
log2_deg_rate_by_id = (
  deg_rate_a_minus.loc[deg_rate_a_minus["id"] != "EMPTY"]
  .set_index("id")["log2_deg_rate"]
)
assert log2_deg_rate_by_id.index.is_unique, \
  "duplicate ids in the degradation-rate table"
ids_without_rate = X.index.difference(log2_deg_rate_by_id.index)
assert ids_without_rate.empty, \
  f"no degradation rate for ids: {list(ids_without_rate)}"
y = log2_deg_rate_by_id.loc[X.index]
model = LinearRegression().fit(X, y)

# Lasso

In [10]:
# X = kmer_counts
# y = np.exp2(deg_rate_a_minus["log2_deg_rate"][deg_rate_a_minus["id"] != "EMPTY"])
# model = LassoCV(cv=10, n_jobs=-1).fit(X, y)