In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate

from neural_net_estimators import ResidualDegradeEstimator
from preprocessing import one_hot_encode_sequences, read_sequence_ids

In [2]:
# Estimator class to evaluate and the keyword arguments it is constructed with.
ESTIMATOR = ResidualDegradeEstimator
PARAMS = dict(
    stage4_conv_channels=198,
    stage3_pool_kernel_size=8,
    stage2_conv_kernel_size=3,
    stage1_conv_kernel_size=7,
    stage1_conv_channels=97,
)

# Column of the degradation-rate table used as the regression target.
DEG_MODEL = "a_minus"

# Cross-validation settings.
JOBS = 3     # parallel workers (also used as the pre-dispatch limit)
FOLDS = 10   # number of cross-validation folds
EPOCHS = 5   # forwarded to the estimator's fit() as `training_epochs`

# Data pre-processing

In [3]:
# Load the sequence table and index it by sequence id.
# NOTE(review): the effect of slice(20, -20) is defined inside read_sequence_ids;
# presumably it trims 20 positions from each end of every sequence -- confirm.
sequence_ids = read_sequence_ids(
    "data/3U_sequences_final.txt",
    slice(20, -20),
).set_index("id")
sequence_ids

Unnamed: 0_level_0,sequence
id,Unnamed: 1_level_1
S1_H_T1,GATCAAATGCTAAAGAAAATATTGGTTTTAGTAATAATCTCTATGC...
S1_H_T2,ATCTGGTAAATTAGGTTGATTTCTGGTTATGGAAAAAGCGCGAAAA...
S1_H_T3,TAACTGAGCCTTATGATTATGACATTTGACTGAAGTATTTGTTTTT...
S1_H_T4,CTTGTGTACGACGAACTCAGAAGCCGCAAATAGGAGACTGTTTTCA...
S1_H_T5,ATTTAAGATGTTTTCTCACGTTTGTATTCGCTTTTAATTAGGATGC...
...,...
S0_M_T1318,GTGGGCGGTGTGGACAGCGTGTCTGAGAGCACTGGCAGCATCCTCA...
S0_M_T1319,GCTCGTTAACAGCTGCTGTAACTAGTCTGGCCTACAATAGTGTGAT...
S0_M_T1320,GCTCGTTAACAGCTGCTGTAACTAGTCTGGCCTACAATAGTGTGAT...
S0_M_T1321,AAGGGATGGTCCACATCAGAAAACTCACTAAATGTCATGTTAGAAT...


In [4]:
# Read the header-less, tab-separated rate table indexed by id, discard the
# "EMPTY" row, and move both rate columns to log2 scale.
# NOTE(review): a rate of exactly 0 would become -inf here, and -inf is not
# treated as missing by a later dropna() -- confirm the input has no zeros.
deg_rates = np.log2(
    pd.read_table(
        "data/5U.seq0119.txt",
        header=None,
        names=["id", "a_minus", "a_plus"],
        dtype={"id": str},
        index_col="id",
    ).drop(index="EMPTY")
)
deg_rates

Unnamed: 0_level_0,a_minus,a_plus
id,Unnamed: 1_level_1,Unnamed: 2_level_1
S0_M_T1,-1.451859,-1.835832
S0_M_T10,-1.454664,-2.454348
S0_M_T1001,-1.234907,-1.607485
S0_M_T1002,-1.456286,-1.821993
S0_M_T1005,-1.759014,-2.230668
...,...,...
S3_H_T999,-1.439637,-2.204499
S3_H_T9992,-1.241441,-1.860766
S3_H_T9993,-1.366652,-2.126140
S3_H_T9996,-1.433779,-2.138346


In [5]:
# Attach the log2 rates to the sequences by id (DataFrame.join defaults to a
# left join on the index) and keep only rows without missing values.
df = sequence_ids.join(deg_rates).dropna()
df

Unnamed: 0_level_0,sequence,a_minus,a_plus
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S1_H_T4,CTTGTGTACGACGAACTCAGAAGCCGCAAATAGGAGACTGTTTTCA...,-0.942751,-1.936773
S1_H_T5,ATTTAAGATGTTTTCTCACGTTTGTATTCGCTTTTAATTAGGATGC...,-1.275856,-1.928951
S1_H_T6,AGTAACGTGTACCTGCAGGTGAAGTCGATCTCTAACCTGGTGTGGA...,-0.910231,-1.616127
S1_H_T9,TAGACATGTCATCTGCTGTTTAAACACATACACATCCTGGTTTGAC...,-1.190536,-1.963307
S1_H_T12,ATGAAATATTCGTAAGATCTTCCAGATGCACTAAATCTATTCATAA...,-1.851053,-2.076432
...,...,...,...
S0_M_T1310,AGAGCATCATTCCTTTCTATCTGCTGCCAGAGCCACGGTGCCATTT...,-1.122680,-2.139236
S0_M_T1311,GGCTATGAGACCGAGCAAGACACCATCACCAGCAAAGTCCCCACGG...,-1.147905,-1.450912
S0_M_T1313,CACGAATTCAGTTCCCACGCCAAACCAGACCCCATGGCCTGCTCAG...,-1.121895,-1.792157
S0_M_T1314,CACGAATTCAGTTCCCACGCCAAACCAGACCCCATGGCCTGCTCAG...,-1.090127,-1.768764


In [6]:
# Model input: the one-hot encoding of the "sequence" column.
X = one_hot_encode_sequences(df.loc[:, "sequence"])
X

array([[[0., 0., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 1., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 1., ..., 0., 1., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 1., 1.]],

       ...,

       [[0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 1., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 1., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[1., 1., 0., ..., 1., 0., 1.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [7]:
# Regression target: the column named by DEG_MODEL, as a float32 array.
y = df.loc[:, DEG_MODEL].to_numpy(dtype=np.float32)
y

array([-0.94275075, -1.2758561 , -0.9102307 , ..., -1.1218946 ,
       -1.0901271 , -1.8148185 ], dtype=float32)

# NN evaluation

In [8]:
# Cross-validate a freshly constructed estimator on (X, y), scoring each fold
# with negated MSE and R^2, using JOBS parallel workers.
# NOTE(review): an integer `cv` yields un-shuffled, contiguous folds for a
# continuous target, so folds follow the row order of `df` -- confirm this is
# intended (otherwise pass a shuffled KFold with a fixed random_state).
# NOTE(review): `fit_params` is deprecated in scikit-learn 1.4 in favour of
# `params`; rename when the environment is upgraded.
cv_results = cross_validate(
    ESTIMATOR(**PARAMS),
    X,
    y,
    cv=FOLDS,
    scoring=("neg_mean_squared_error", "r2"),
    fit_params=dict(training_epochs=EPOCHS),
    n_jobs=JOBS,
    pre_dispatch=JOBS,
    verbose=10,
)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 17.3min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 34.6min
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed: 63.6min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed: 63.6min finished


In [9]:
# Per-fold test scores for the "neg_mean_squared_error" scorer (negated MSE, so closer to 0 is better).
cv_results["test_neg_mean_squared_error"]

array([-0.03860554, -0.0372845 , -0.04533102, -0.03811284, -0.03746803,
       -0.03980242, -0.03771404, -0.03929851, -0.03924485, -0.04017861])

In [10]:
# Per-fold test scores for the "r2" scorer.
cv_results["test_r2"]

array([0.36511939, 0.3793291 , 0.26029824, 0.37685461, 0.38779035,
       0.35871659, 0.36911601, 0.34613556, 0.3528617 , 0.35308495])

In [11]:
# Negated MSE averaged over all folds.
cv_results["test_neg_mean_squared_error"].mean()

-0.03930403739213943

In [12]:
# R^2 averaged over all folds.
cv_results["test_r2"].mean()


0.35493064982958406