In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from linear_regression import Model
from preprocessing import read_all_data, read_original_predictions, read_sequence_ids

# Data pre-processing

In [2]:
# Build the combined table from the four input files and order its rows by
# index label; ending the cell on `df` renders the resulting frame.
df = read_all_data(
    "data/ss_out.txt",
    "data/3U_sequences_final.txt",
    "data/3U.models.3U.40A.seq1022_param.txt",
    "data/3U.models.3U.00A.seq1022_param.txt",
).sort_index()
df

Unnamed: 0_level_0,sequence,secondary_structure,free_energy,secondary_structure_prob,log2_deg_rate_a_plus,log2_x0_a_plus,onset_time_a_plus,log2_deg_rate_a_minus,log2_x0_a_minus,onset_time_a_minus
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0_M_T1,TGTCCCCGGGTCTTCCAACGGACTGGCGTTGCCCCGGTTCACTGGG...,.(.((((((((....(((((......)))))((((((....)))))...,-45.92,".(.((((((((.,,,{{..(|||{((|{..,{{||||,,,.}))))...",,,,,,
S0_M_T10,AGATTTTTGGTTCAATATGCTCCTTGAGTGGAGTCTTAGTGATTGC...,........(((((.....(((((......)))))...(((((..((...,-31.17,"........(((((,.,..(((((......))))}...{({({..({...",-2.7469,2.7887,1.0,-2.1721,2.5964,1.0
S0_M_T100,ACCCGGCGCCGCTCGACCCGGAGCGAGGAGTTGACCCGGAGCGAGG...,....((((((.((((..((((..(((....)))..))))..))))....,-41.95,"....((((((.((((..((((.,({(....})).,))))..)}}),...",,,,,,
S0_M_T1000,ATGAGGGCTGGAATTTGCATTGAAACACTGGTCCAGTCGCTGTGTA...,.....(((((((...((........))....)))))))..(((......,-23.18,"...,((((((((.,.({........}}..,,))))))).,|((,,....",,,,,,
S0_M_T1001,CCTTAGTGCCCTTAAAATAATGATTTAAGCATTTTACTGTATGTAT...,....(.((((((((((.(((.(......((.(((((((((.(.......,-30.84,"....{.(((((((((({(((.{......((.((((((((,.{.......",,,,,,
...,...,...,...,...,...,...,...,...,...,...
S3_H_T9995,TTGTAGCTGTCAATTGTATTTAATATACTTTTTTGTCTTTTTAATT...,((((..(((.((((((.((((((...........(((............,-18.50,",{(((((((.,(({{((((.....}}}}......(((............",,,,,,
S3_H_T9996,AAAACACCACTACATATGTTTCTCATAAGCGCAACTGTAGTGTTAT...,((((((((((((((..(((..((....)).)))..))))))).......,-19.40,"((((((((((((((..(((..,.....}}.)})..))))))).......",-2.5808,3.4966,1.0,-2.3105,3.3307,1.0
S3_H_T9997,AGGATTTTTTTTTTCACCAATGCTCTTTAATACACACTTGCCTATA...,.(((..((((((((.......((................))........,-20.95,",((,((((((((({...,,,,({..............,,)}....)...",,,,,,
S3_H_T9998,GGTGCTTCAAAGAGTGATTACCCACTAACTAATGAACCCAGACTGT...,((((..(((.....))).))))..((((..((((((((((((.(((...,-28.53,"((((..(((.....))).))))..((((..(((((((((,((.(((...",,,,,,


In [3]:
# `df["sequence"]` holds truncated sequences at this point, but the full ones
# are needed. Re-read them keyed by `id`; the column assignment below aligns
# on the index, so every row of `df` receives the sequence with the same id.
complete_sequences = (
    read_sequence_ids("data/3U_sequences_final.txt")
    .set_index("id")
)
df["sequence"] = complete_sequences["sequence"]
df

Unnamed: 0_level_0,sequence,secondary_structure,free_energy,secondary_structure_prob,log2_deg_rate_a_plus,log2_x0_a_plus,onset_time_a_plus,log2_deg_rate_a_minus,log2_x0_a_minus,onset_time_a_minus
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0_M_T1,GGAGATCTGAGTTCAAGGATTGTCCCCGGGTCTTCCAACGGACTGG...,.(.((((((((....(((((......)))))((((((....)))))...,-45.92,".(.((((((((.,,,{{..(|||{((|{..,{{||||,,,.}))))...",,,,,,
S0_M_T10,GGAGATCTGAGTTCAAGGATAGATTTTTGGTTCAATATGCTCCTTG...,........(((((.....(((((......)))))...(((((..((...,-31.17,"........(((((,.,..(((((......))))}...{({({..({...",-2.7469,2.7887,1.0,-2.1721,2.5964,1.0
S0_M_T100,GGAGATCTGAGTTCAAGGATACCCGGCGCCGCTCGACCCGGAGCGA...,....((((((.((((..((((..(((....)))..))))..))))....,-41.95,"....((((((.((((..((((.,({(....})).,))))..)}}),...",,,,,,
S0_M_T1000,GGAGATCTGAGTTCAAGGATATGAGGGCTGGAATTTGCATTGAAAC...,.....(((((((...((........))....)))))))..(((......,-23.18,"...,((((((((.,.({........}}..,,))))))).,|((,,....",,,,,,
S0_M_T1001,GGAGATCTGAGTTCAAGGATCCTTAGTGCCCTTAAAATAATGATTT...,....(.((((((((((.(((.(......((.(((((((((.(.......,-30.84,"....{.(((((((((({(((.{......((.((((((((,.{.......",,,,,,
...,...,...,...,...,...,...,...,...,...,...
S3_H_T9995,GGAGATCTGAGTTCAAGGATTTGTAGCTGTCAATTGTATTTAATAT...,((((..(((.((((((.((((((...........(((............,-18.50,",{(((((((.,(({{((((.....}}}}......(((............",,,,,,
S3_H_T9996,GGAGATCTGAGTTCAAGGATAAAACACCACTACATATGTTTCTCAT...,((((((((((((((..(((..((....)).)))..))))))).......,-19.40,"((((((((((((((..(((..,.....}}.)})..))))))).......",-2.5808,3.4966,1.0,-2.3105,3.3307,1.0
S3_H_T9997,GGAGATCTGAGTTCAAGGATAGGATTTTTTTTTTCACCAATGCTCT...,.(((..((((((((.......((................))........,-20.95,",((,((((((((({...,,,,({..............,,)}....)...",,,,,,
S3_H_T9998,GGAGATCTGAGTTCAAGGATGGTGCTTCAAAGAGTGATTACCCACT...,((((..(((.....))).))))..((((..((((((((((((.(((...,-28.53,"((((..(((.....))).))))..((((..(((((((((,((.(((...",,,,,,


# Linear Regression

In [4]:
# Load the two saved models from their .mat files: one is bound to
# `model_a_plus`, the other to `model_a_minus`.
# NOTE(review): presumably these are already-fitted linear-regression models
# (the class comes from `linear_regression`) — confirm against `Model.load`.
model_a_plus = Model.load("data/run_linear_3U_40A_dg_BEST.out.mat")
model_a_minus = Model.load("data/run_linear_3U_00Am1_dg_BEST.out.mat")

In [5]:
# Run `model_a_minus` on `df["sequence"]` and take log2 of the predictions.
# NOTE(review): the recorded output shows a warning for this cell — presumably
# log2 was applied to non-positive predictions. The NaN fill and clipping
# applied to `prediction_df` further down look like the intended handling —
# confirm.
prediction_a_minus = np.log2(model_a_minus.predict(df["sequence"]))

  """Entry point for launching an IPython kernel.


In [6]:
# Run `model_a_plus` on `df["sequence"]` and take log2 of the predictions.
# NOTE(review): the recorded output shows a warning for this cell — presumably
# log2 was applied to non-positive predictions. The NaN fill and clipping
# applied to `prediction_df` further down look like the intended handling —
# confirm.
prediction_a_plus = np.log2(model_a_plus.predict(df["sequence"]))

  """Entry point for launching an IPython kernel.


In [7]:
# Collect both log2 predictions in one table indexed by sequence id.
# `.T[0]` takes the first row of each transposed prediction array —
# presumably the predictions are (n, 1) column vectors; TODO confirm.
prediction_df = pd.DataFrame(
    {
        "id": df.index,
        "a_minus": prediction_a_minus.T[0],
        "a_plus": prediction_a_plus.T[0],
    }
).set_index("id")

## Compare against original predictions

In [8]:
# Read the original predictions to compare against, together with one clip
# value per column (`a_minus_clip`, `a_plus_clip`) that is used below to fill
# and floor our own predictions.
compare_df, a_minus_clip, a_plus_clip = read_original_predictions(
    "data/models_full_dg.txt"
)

In [9]:
# Replace missing predictions with the clip value and floor every prediction
# at that same value (presumably mirroring how the original predictions were
# post-processed — confirm against `read_original_predictions`).
#
# The result is assigned back to the column on purpose: calling `fillna` /
# `clip` with `inplace=True` on `prediction_df[col]` is a chained in-place
# operation on a column selection. It emits a warning in pandas 2.x and does
# not update `prediction_df` at all once Copy-on-Write is active.
prediction_df["a_minus"] = (
    prediction_df["a_minus"].fillna(a_minus_clip).clip(lower=a_minus_clip)
)
prediction_df["a_plus"] = (
    prediction_df["a_plus"].fillna(a_plus_clip).clip(lower=a_plus_clip)
)

In [10]:
# Check that our `a_minus` predictions match the original ones within an
# absolute tolerance of 1e-4 (relative tolerance disabled); NaNs in the same
# position count as equal.
# NOTE(review): np.allclose compares the underlying arrays element by element,
# so this presumably relies on both frames having the same row order — confirm.
np.allclose(
    prediction_df["a_minus"], compare_df["a_minus"],
    rtol=0, atol=1e-4, equal_nan=True,
)

True

In [11]:
# Check that our `a_plus` predictions match the original ones within an
# absolute tolerance of 1e-4 (relative tolerance disabled); NaNs in the same
# position count as equal.
# NOTE(review): np.allclose compares the underlying arrays element by element,
# so this presumably relies on both frames having the same row order — confirm.
np.allclose(
    prediction_df["a_plus"], compare_df["a_plus"],
    rtol=0, atol=1e-4, equal_nan=True,
)

True

## Compare against "ground truth"

In [12]:
# Keep only the rows that have no missing value in any column. In the tables
# displayed above the gaps appear in the rate / x0 / onset-time columns, so
# this presumably leaves the sequences with reference degradation rates.
available_deg_rates = df.dropna(axis=0, how="any")
available_deg_rates

Unnamed: 0_level_0,sequence,secondary_structure,free_energy,secondary_structure_prob,log2_deg_rate_a_plus,log2_x0_a_plus,onset_time_a_plus,log2_deg_rate_a_minus,log2_x0_a_minus,onset_time_a_minus
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0_M_T10,GGAGATCTGAGTTCAAGGATAGATTTTTGGTTCAATATGCTCCTTG...,........(((((.....(((((......)))))...(((((..((...,-31.17,"........(((((,.,..(((((......))))}...{({({..({...",-2.746900,2.7887,1.0,-2.17210,2.5964,1.0
S0_M_T1006,GGAGATCTGAGTTCAAGGATTAGATAGAGATCATCTTTACAGTTCC...,...(((((((((((...((((.((((...))))..))))...))))...,-26.55,".,,({(((({((((...((((.((((...))))..))))...))))...",-2.495200,3.5146,1.0,-1.94970,3.1963,1.0
S0_M_T1009,GGAGATCTGAGTTCAAGGATTAGTTATTGTGTGTTGCTAATCATTG...,.....(((((((((((.(((.((..(((((.(((..((((((((((...,-33.05,"...,,(((((((((((.{((.{(..(((((.(((..((((((((((...",-2.550700,2.7105,1.0,-1.51500,2.8747,1.0
S0_M_T1013,GGAGATCTGAGTTCAAGGATTGATTCTAGTATATAATATTTTTGTC...,.......................((((.((....))))))...((....,-20.70,".......,{,,............((((.((....))))))..,((....",-2.327900,3.7761,1.0,-1.89040,3.2967,1.0
S0_M_T1014,GGAGATCTGAGTTCAAGGATTTCTAGACTTTCCAAGTATGTTGTCT...,.............(((..(((((((((......(((((......))...,-22.82,"......,,,,...|||,.,(((((,,{{((.{((((((......))...",-1.623200,1.6160,4.8,-2.09580,2.1356,1.0
...,...,...,...,...,...,...,...,...,...,...
S3_H_T9985,GGAGATCTGAGTTCAAGGATGTCCTTATTTACATGTTTCATTGAGC...,.............((((..(((((.........(((((((.((((....,-18.22,",....,,,,...,((((,.((({{.........(((((((.((((....",-2.027200,2.6826,1.0,-1.59040,2.5908,1.0
S3_H_T9987,GGAGATCTGAGTTCAAGGATTCAATGGTTACAGGTTTCAAACATTC...,(((((((((((((((((.......((..(((((........)))))...,-30.56,(((((((((((((((((.......{{..(((((........)))))...,-2.589200,2.7555,1.0,-2.05310,2.5282,1.0
S3_H_T9989,GGAGATCTGAGTTCAAGGATTGAAAGCACAGAGGGGCTGAGATTCT...,.(((((((..((((.(((...........))).)))).)))))))....,-24.03,".((((({(.,((((,(((.,,...,....))).)))).))))))),...",0.023414,1.2948,4.5,-0.75861,1.4902,3.0
S3_H_T9990,GGAGATCTGAGTTCAAGGATAATTAAAGAGAGAGAGAGACGGAGAA...,.........((.(((((((((.((....((.(((.....))).)))...,-27.22,".........{(.(((((((((.,{....((.(((.....))).)),...",-2.463100,1.5812,1.0,-2.71790,1.2301,1.0


In [13]:
# Mean squared error between the reference ("ground truth") log2 degradation
# rates and our `a_minus` predictions for the same ids.
# Rows and column are selected with a single `.loc[rows, col]` call instead of
# chained `.loc[rows][col]`, which avoids materializing an intermediate frame
# with every column; the selected values and their order are unchanged.
mean_squared_error(
    y_true=available_deg_rates["log2_deg_rate_a_minus"],
    y_pred=prediction_df.loc[available_deg_rates.index, "a_minus"],
)

0.3703773304161375

In [14]:
# Mean squared error between the reference ("ground truth") log2 degradation
# rates and our `a_plus` predictions for the same ids.
# Rows and column are selected with a single `.loc[rows, col]` call instead of
# chained `.loc[rows][col]`, which avoids materializing an intermediate frame
# with every column; the selected values and their order are unchanged.
mean_squared_error(
    y_true=available_deg_rates["log2_deg_rate_a_plus"],
    y_pred=prediction_df.loc[available_deg_rates.index, "a_plus"],
)

0.6929771096312785