In [None]:
import numpy as np
import pandas as pd
import torch
from scipy.sparse import coo_matrix, vstack
from torch.utils.data import random_split, DataLoader

import util
from linear_regression import Model
from neural_net import Net
from preprocessing import read_all_data, read_original_predictions
from util import match_parens, test_network, train_network

# Data pre-processing

In [None]:
# Load data

# Input files, listed in the positional order they are handed to read_all_data.
input_paths = (
    "data/ss_out.txt",
    "data/3U_sequences_final.txt",
    "data/3U.models.3U.40A.seq1022_param.txt",
    "data/3U.models.3U.00A.seq1022_param.txt",
)
df = read_all_data(*input_paths)
df

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()
df

In [None]:
# One-hot encode the sequences

# The split is expected to yield one character per column with an extra column
# at each end (TODO confirm); the first and last columns are discarded.
per_position = df["sequence"].str.split("", expand=True)
per_position = per_position.iloc[:, 1:-1].add_prefix("sequence_")

# One sparse indicator column per (position, character) combination.
sequences = pd.get_dummies(per_position, sparse=True)

In [None]:
# Merge into main DataFrame

# The raw "sequence" column is kept: later cells still read it.
df = pd.concat([sequences, df], axis=1)
df

In [None]:
# One-hot encode the secondary structure of each sequence

# Flatten whatever match_parens returns for each structure string into a single
# sparse row, then stack the rows into one matrix (one row per DataFrame row).
flattened_structures = [
    coo_matrix(match_parens(structure).reshape(-1))
    for structure in df["secondary_structure"]
]
all_pairs_matrices = vstack(flattened_structures)

# Columns are numbered from 1 and carry the "secondary_structure_" prefix.
structure_columns = pd.RangeIndex(1, all_pairs_matrices.shape[1] + 1)
secondary_structures = pd.DataFrame.sparse.from_spmatrix(
    all_pairs_matrices, index=df.index, columns=structure_columns
).add_prefix("secondary_structure_")

In [None]:
# Merge into main DataFrame

# The raw "secondary_structure" column stays next to its encoded columns.
df = pd.concat([df, secondary_structures], axis=1)
df

In [None]:
# Build the network input with shape (rows, 4, positions) as float32.
# Only the one-hot columns match "^sequence_"; the raw "sequence" column has no
# trailing underscore and is therefore excluded.
one_hot_columns = df.filter(regex="^sequence_", axis="columns")

# The reshape below assumes exactly four one-hot columns per sequence position.
# Floor division alone would hide a missing column and could silently group the
# remaining columns under the wrong positions, so fail loudly instead.
n_one_hot_columns = one_hot_columns.shape[1]
assert n_one_hot_columns % 4 == 0, (
    f"expected 4 one-hot columns per position, got {n_one_hot_columns} columns in total"
)

sequences_tensor = one_hot_columns.to_numpy()
sequences_tensor = sequences_tensor.reshape(-1, n_one_hot_columns // 4, 4)
# Move the four indicator channels ahead of the position axis.
sequences_tensor = sequences_tensor.transpose(0, 2, 1)
sequences_tensor = sequences_tensor.astype(np.float32)
sequences_tensor

In [None]:
# Pair each encoded sequence with its two float32 targets, in the order
# (log2_deg_rate_a_plus, log2_deg_rate_a_minus).
rate_columns = ["log2_deg_rate_a_plus", "log2_deg_rate_a_minus"]
degradation_rates = df[rate_columns].to_numpy().astype(np.float32)
all_data = list(zip(sequences_tensor, degradation_rates))

# Neural network

In [None]:
# Two separate networks; later cells train each one on a different rate column.
a_plus_net, a_minus_net = Net(), Net()
a_minus_net

In [None]:
# Hold out 10% of the samples for testing. The split is drawn from an explicitly
# seeded generator so the same samples land in the test set on every run, which
# keeps the reported test errors comparable between runs.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 0

train_size = int(TRAIN_FRACTION * len(all_data))
# Whatever is left after rounding down goes to the test set.
test_size = len(all_data) - train_size
train_data, test_data = random_split(
    all_data,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Train the A+ network on the first rate column only, wrapped as a 1-element array.
a_plus_train_samples = [
    (sequence, np.array([rates[0]])) for sequence, rates in train_data
]
train_loader = DataLoader(a_plus_train_samples, batch_size=4, shuffle=True, num_workers=2)
# The trailing 5 is presumably the number of epochs -- TODO confirm in util.train_network.
train_network(a_plus_net, train_loader, 5)

In [None]:
# Train the A- network on the second rate column only, wrapped as a 1-element array.
a_minus_train_samples = [
    (sequence, np.array([rates[1]])) for sequence, rates in train_data
]
train_loader = DataLoader(a_minus_train_samples, batch_size=4, shuffle=True, num_workers=2)
# The trailing 5 is presumably the number of epochs -- TODO confirm in util.train_network.
train_network(a_minus_net, train_loader, 5)

In [None]:
# Evaluate the A+ network on the held-out samples, using the first rate column.
a_plus_test_samples = [
    (sequence, np.array([rates[0]])) for sequence, rates in test_data
]
test_loader = DataLoader(a_plus_test_samples, batch_size=4, shuffle=False, num_workers=2)
a_plus_mse = test_network(a_plus_net, test_loader)
a_plus_mse

In [None]:
# Evaluate the A- network on the held-out samples, using the second rate column.
a_minus_test_samples = [
    (sequence, np.array([rates[1]])) for sequence, rates in test_data
]
test_loader = DataLoader(a_minus_test_samples, batch_size=4, shuffle=False, num_workers=2)
a_minus_mse = test_network(a_minus_net, test_loader)
a_minus_mse

# Linear Regression

In [None]:
# Load the two stored linear models from their .mat files.
a_plus_model_path = "data/run_linear_3U_40A_dg_BEST.out.mat"
a_minus_model_path = "data/run_linear_3U_00Am1_dg_BEST.out.mat"

model_a_plus = Model.load(a_plus_model_path)
model_a_minus = Model.load(a_minus_model_path)

In [None]:
# Run the A- linear model on the raw sequences and take log2 of its output.
a_minus_raw_prediction = model_a_minus.predict(df["sequence"])
prediction_a_minus = np.log2(a_minus_raw_prediction)
prediction_a_minus

In [None]:
# Run the A+ linear model on the raw sequences and take log2 of its output.
a_plus_raw_prediction = model_a_plus.predict(df["sequence"])
prediction_a_plus = np.log2(a_plus_raw_prediction)
prediction_a_plus

In [None]:
# Gather both linear-model predictions next to df's index values, ordered by id.
# Only the first row of each transposed prediction array is used.
prediction_df = pd.DataFrame(
    {
        "id": df.index,
        "a_minus": prediction_a_minus.T[0],
        "a_plus": prediction_a_plus.T[0],
    }
).sort_values(by=["id"])

In [None]:
# Only the second and third returned values are kept; the next cell uses them as
# the fill value and lower bound for the A- and A+ predictions respectively.
_, a_minus_clip, a_plus_clip = read_original_predictions("data/models_full_dg.txt")

In [None]:
# Replace missing predictions with the clip value and raise anything below it
# up to that value.
#
# The result is assigned back to the column rather than calling
# fillna/clip with inplace=True on prediction_df[...]: that form is chained
# assignment, which warns on recent pandas and leaves prediction_df untouched
# once Copy-on-Write is enabled.
prediction_df["a_minus"] = (
    prediction_df["a_minus"].fillna(a_minus_clip).clip(lower=a_minus_clip)
)

prediction_df["a_plus"] = (
    prediction_df["a_plus"].fillna(a_plus_clip).clip(lower=a_plus_clip)
)

In [None]:
# The error cells that follow compare values by position, so the sorted
# prediction ids have to line up with df's index.
ids_line_up = prediction_df["id"] == df.index
assert ids_line_up.all()

In [None]:
# util.mse of the A- linear predictions against df's log2_deg_rate_a_minus column.
a_minus_predicted = prediction_df["a_minus"].to_numpy()
a_minus_reference = df["log2_deg_rate_a_minus"].to_numpy()
linear_mse_a_minus = util.mse(a_minus_predicted, a_minus_reference)
linear_mse_a_minus

In [None]:
# util.mse of the A+ linear predictions against df's log2_deg_rate_a_plus column.
a_plus_predicted = prediction_df["a_plus"].to_numpy()
a_plus_reference = df["log2_deg_rate_a_plus"].to_numpy()
linear_mse_a_plus = util.mse(a_plus_predicted, a_plus_reference)
linear_mse_a_plus
