In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.sparse import coo_matrix, vstack
from sklearn.model_selection import train_test_split
from torch import from_numpy
from torch.utils.data import DataLoader, TensorDataset

import util
from linear_regression import Model
from neural_net import ResidualDegrade
from preprocessing import one_hot_encode_sequences, read_all_data, \
    read_original_predictions
from util import match_parens, test_network, train_network

# Data pre-processing

In [None]:
# Load the raw inputs. read_all_data receives the four data-file paths
# positionally, in the order listed below; the result is shown as the
# cell output.

df = read_all_data(
    "data/ss_out.txt",
    "data/3U_sequences_final.txt",
    "data/3U.models.3U.40A.seq1022_param.txt",
    "data/3U.models.3U.00A.seq1022_param.txt",
)
df

In [None]:
# Discard every row that contains at least one missing value. This mutates
# df in place, so all later cells see the filtered frame.
df.dropna(axis=0, how="any", inplace=True)
df

In [None]:
# One-hot encode the secondary structure of each sequence.
# For every structure string, the match_parens result is flattened to a
# single row and stored sparsely; the rows are then stacked into one matrix.

all_pairs_matrices = vstack([
    coo_matrix(match_parens(structure).reshape(-1))
    for structure in df["secondary_structure"]
])

# Wrap the stacked matrix in a sparse DataFrame that shares df's index and
# whose columns are numbered from 1.
secondary_structures = pd.DataFrame.sparse.from_spmatrix(
    all_pairs_matrices,
    index=df.index,
    columns=pd.RangeIndex(start=1, stop=all_pairs_matrices.shape[1] + 1),
)
secondary_structures

In [None]:
# Encode the "sequence" column with the project's one-hot helper.
# NOTE(review): later cells pass slices of this to torch.from_numpy, so it is
# presumably a NumPy array rather than a torch tensor — confirm in preprocessing.
sequences_tensor = one_hot_encode_sequences(df.loc[:, "sequence"])
sequences_tensor

# Neural network

In [None]:
# Two separate ResidualDegrade instances, one for each of the a_plus and
# a_minus targets trained in the cells below.
a_plus_net, a_minus_net = ResidualDegrade(), ResidualDegrade()
a_minus_net

In [None]:
# Hold out 10% of the samples for testing. The encoded sequences and both
# targets go through a single train_test_split call so that the three arrays
# are shuffled with the same permutation and stay row-aligned.
# Each target column is converted to float32 and reshaped to a column vector.
# random_state pins the shuffle: without it every Restart & Run All produces a
# different split, and the test-set errors reported further down are not
# comparable between runs.
(seq_train, seq_test,
 rate_a_plus_train, rate_a_plus_test,
 rate_a_minus_train, rate_a_minus_test) = train_test_split(
    sequences_tensor,
    df["log2_deg_rate_a_plus"].to_numpy(np.float32).reshape(-1, 1),
    df["log2_deg_rate_a_minus"].to_numpy(np.float32).reshape(-1, 1),
    train_size=0.9,
    random_state=0,
)

In [None]:
# Fit the a_plus network on the training split: mini-batches of 4 with
# shuffling enabled, Adam at learning rate 1e-3, MSE loss.
# The literal 5 is train_network's third positional argument
# (presumably the number of epochs — confirm in util).
train_loader = DataLoader(
    dataset=TensorDataset(torch.from_numpy(seq_train),
                          torch.from_numpy(rate_a_plus_train)),
    batch_size=4,
    shuffle=True,
)
optimizer = torch.optim.Adam(params=a_plus_net.parameters(), lr=1e-3)
train_network(a_plus_net, train_loader, 5, nn.MSELoss(), optimizer)

In [None]:
# Fit the a_minus network on the training split with the same settings as the
# a_plus run: mini-batches of 4 with shuffling enabled, Adam at learning rate
# 1e-3, MSE loss. train_loader and optimizer are rebound here.
# The literal 5 is train_network's third positional argument
# (presumably the number of epochs — confirm in util).
train_loader = DataLoader(
    dataset=TensorDataset(torch.from_numpy(seq_train),
                          torch.from_numpy(rate_a_minus_train)),
    batch_size=4,
    shuffle=True,
)
optimizer = torch.optim.Adam(params=a_minus_net.parameters(), lr=1e-3)
train_network(a_minus_net, train_loader, 5, nn.MSELoss(), optimizer)

In [None]:
# Evaluate the a_plus network on the held-out split, in fixed order
# (no shuffling), and display whatever test_network returns
# (presumably the mean squared error, going by the variable name).
test_loader = DataLoader(
    dataset=TensorDataset(torch.from_numpy(seq_test),
                          torch.from_numpy(rate_a_plus_test)),
    batch_size=4,
    shuffle=False,
)
a_plus_mse = test_network(a_plus_net, test_loader)
a_plus_mse

In [None]:
# Evaluate the a_minus network on the held-out split, in fixed order
# (no shuffling), and display whatever test_network returns
# (presumably the mean squared error, going by the variable name).
# test_loader is rebound here.
test_loader = DataLoader(
    dataset=TensorDataset(torch.from_numpy(seq_test),
                          torch.from_numpy(rate_a_minus_test)),
    batch_size=4,
    shuffle=False,
)
a_minus_mse = test_network(a_minus_net, test_loader)
a_minus_mse

# Linear Regression

In [None]:
# Load the a_plus and a_minus linear models from their .mat files
# (a_plus first, then a_minus).
model_a_plus, model_a_minus = (
    Model.load("data/run_linear_3U_40A_dg_BEST.out.mat"),
    Model.load("data/run_linear_3U_00Am1_dg_BEST.out.mat"),
)

In [None]:
# Base-2 log of the a_minus linear model's predictions for every sequence.
# np.log2 yields NaN / -inf for any non-positive prediction; the fill-and-clip
# cell further down deals with those values.
prediction_a_minus = np.log2(
    model_a_minus.predict(df["sequence"])
)
prediction_a_minus

In [None]:
# Base-2 log of the a_plus linear model's predictions for every sequence.
# np.log2 yields NaN / -inf for any non-positive prediction; the fill-and-clip
# cell further down deals with those values.
prediction_a_plus = np.log2(
    model_a_plus.predict(df["sequence"])
)
prediction_a_plus

In [None]:
# Collect both linear-model predictions next to the id they belong to
# (df's index), then order the rows by id. `.T[0]` picks the first row of the
# transposed prediction array.
prediction_df = pd.DataFrame(
    {
        "id": df.index,
        "a_minus": prediction_a_minus.T[0],
        "a_plus": prediction_a_plus.T[0],
    }
).sort_values(by="id")

In [None]:
# read_original_predictions is unpacked into three values; only the last two
# (the a_minus and a_plus clip values, used below as fill value and lower
# bound) are needed. The first is bound to a throwaway name rather than `_`:
# in IPython/Jupyter `_` holds the previous cell output, and assigning to it
# stops IPython from updating `_`, `__` and `___` for the rest of the session.
_unused_predictions, a_minus_clip, a_plus_clip = read_original_predictions("data/models_full_dg.txt")

In [None]:
# Replace missing predictions with the clip value, then floor every prediction
# at that same value.
# The result is assigned back to the column instead of calling
# fillna/clip(..., inplace=True) on prediction_df["..."]: that chained in-place
# form acts on a temporary Series under pandas Copy-on-Write (FutureWarning in
# pandas 2.2, default behaviour from 3.0) and would leave prediction_df
# unchanged, letting NaN / -inf values reach the MSE computation.
prediction_df["a_minus"] = (
    prediction_df["a_minus"].fillna(a_minus_clip).clip(lower=a_minus_clip)
)

prediction_df["a_plus"] = (
    prediction_df["a_plus"].fillna(a_plus_clip).clip(lower=a_plus_clip)
)

In [None]:
# Sanity check: the sorted prediction ids must match df's index position by
# position, because the MSE cells below compare the two frames as plain arrays.
assert prediction_df["id"].eq(df.index).all()

In [None]:
# Error of the linear model on the a_minus target: util.mse is given the
# predictions and the measured log2 rates as NumPy arrays, compared row by row.
linear_mse_a_minus = util.mse(
    prediction_df["a_minus"].to_numpy(),
    df["log2_deg_rate_a_minus"].to_numpy(),
)
linear_mse_a_minus

In [None]:
# Error of the linear model on the a_plus target: util.mse is given the
# predictions and the measured log2 rates as NumPy arrays, compared row by row.
linear_mse_a_plus = util.mse(
    prediction_df["a_plus"].to_numpy(),
    df["log2_deg_rate_a_plus"].to_numpy(),
)
linear_mse_a_plus
