# Self-Supervised RNA Representation Learning

## Create Dataset

In [1]:
import os
from RNARepLearn.datasets import GFileDataset, GFileDatasetUTR
from RNARepLearn.utils import random_train_val_test_loaders

Avalaible threads: 48


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Rfam family accessions to load from the sample directory.
rfams = [
    "RF00006",
    "RF00007",
    "RF00008",
    "RF00010",
    "RF00011",
]
# Mini-batch size handed to the data loaders built from this dataset.
batch_size = 64
rfam = GFileDataset("../example_data/rfam_14.08_sample", rfams)

In [3]:
# Random 0.8 / 0.1 / 0.1 split of the Rfam sample, wrapped in data loaders.
train, val, test = random_train_val_test_loaders(
    rfam,
    0.8,
    0.1,
    0.1,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=0,
)

Training:	1538
Test:	192
Validation:	193


# Load Model

In [4]:
from RNARepLearn.models import Encoder_Decoder_Model, Seq_Struc_GNN
from RNARepLearn.modules import  AttentionDecoder
from torch_geometric.nn import GCNConv

# Encoder built on GCNConv. In the model printout further down, the first layer
# maps 4 -> 128 channels and later layers 128 -> 128.
# NOTE(review): the third argument (6) is presumably the number of layers — confirm.
gnn = Seq_Struc_GNN(
    4,
    128,
    6,
    GCNConv,
)

# NOTE(review): presumably decodes the 128-wide encoder output into 4 classes
# (one per nucleotide) — confirm against AttentionDecoder.
decoder = AttentionDecoder(
    128,
    4,
)

# Untrained encoder/decoder pair used for the pretraining demo below.
dummy_model = Encoder_Decoder_Model(
    gnn,
    decoder,
)

# Self-Supervised Pretraining

In [5]:
from RNARepLearn.train import MaskedTraining
# Set up masked self-supervised pretraining for the untrained model.
training = MaskedTraining(
    dummy_model,
    10,    # number of epochs (the training log counts "Epoch n/10")
    15,    # NOTE(review): presumably the masking percentage — confirm
    None,  # NOTE(review): meaning not visible in this notebook — confirm
    print_out=True,
)

Learning rate: 0.002
Creating new SummaryWriter
Model: 
Encoder_Decoder_Model(
  (model): Sequential(
    (0): Seq_Struc_GNN(
      (body): Sequential(
        (0): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(4, 128, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(4, 128)
      )
        (1): <function relu at 0x14833c207f60>
        (2): <function dropout at 0x14833c207b00>
        (3): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(128, 128, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(128, 128)
      )
        (4): <function relu at 0x14833c207f60>
        (5): <function dropout at 0x14833c207b00>
        (6): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(128, 128, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(128, 128)
      )
        (7): <function relu at 0x14833c207f60>
        (8): <functi

In [8]:
# Run the masked pretraining on the Rfam training loader; `val` is passed as the
# second loader (the log reports separate "Val" losses per epoch).
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it — re-run the notebook top to bottom before relying on the saved log.
training.run(train, val)

Training running on device: cuda:0
[Epoch    1/  10] [Batch   10/  24] Loss:  8.41e+01 Nucleotide-Loss:  1.38e+00 Edge-Loss:  8.27e+01  Memory  428238336
[Epoch    1/  10] [Batch   20/  24] Loss:  1.70e+01 Nucleotide-Loss:  1.39e+00 Edge-Loss:  1.57e+01  Memory  444457984
[Epoch    1/  10] Epoch-Loss:  5.06e+01 Epoch-Loss Val:  1.17e+01 Nucleotide-Loss Val:  1.39e+00 Edge-Loss Val:  1.03e+01  Memory  574652928
[Epoch    2/  10] [Batch    6/  24] Loss:  1.28e+01 Nucleotide-Loss:  1.38e+00 Edge-Loss:  1.14e+01  Memory  562494976
[Epoch    2/  10] [Batch   16/  24] Loss:  1.12e+01 Nucleotide-Loss:  1.38e+00 Edge-Loss:  9.78e+00  Memory  589882880
[Epoch    2/  10] Epoch-Loss:  1.20e+01 Epoch-Loss Val:  1.04e+01 Nucleotide-Loss Val:  1.38e+00 Edge-Loss Val:  9.00e+00  Memory  539070976
[Epoch    3/  10] [Batch    2/  24] Loss:  1.02e+01 Nucleotide-Loss:  1.39e+00 Edge-Loss:  8.81e+00  Memory  493769216
[Epoch    3/  10] [Batch   12/  24] Loss:  1.01e+01 Nucleotide-Loss:  1.38e+00 Edge-Loss

# Load pretrained model(s)

In [5]:
import gin
import torch_geometric
import torch
# Register the torch_geometric building blocks with gin under short names,
# presumably so the gin.config files of the pretrained models can refer to them.
# NOTE(review): gin is expected to reject registering the same name twice, so
# this cell should run only once per kernel session — confirm.
for _layer_name in (
    "GCNConv",
    "TransformerConv",
    "ChebConv",
    "GraphConv",
    "GatedGraphConv",
    "TAGConv",
    "GATConv",
):
    gin.external_configurable(getattr(torch_geometric.nn, _layer_name), _layer_name)

# This one lives in torch_geometric.transforms rather than torch_geometric.nn.
# The trailing semicolon keeps the returned class out of the cell output.
gin.external_configurable(
    torch_geometric.transforms.AddLaplacianEigenvectorPE,
    "AddLaplacianEigenvectorPE",
);

torch_geometric.nn.conv.gat_conv.GATConv

In [16]:
# Candidate pretrained model names; the chosen name is used as the
# sub-directory the config and weights are read from.
model_selection = [
    'GCNConv',
    'RPINet',
    'GatedGraphConv',
    'GCN_LSTM',
]
# Change the index to pick a different entry.
model_name = model_selection[0]

In [17]:
# Directory holding the gin config and the weights of the selected model.
pretrained_dir = os.path.join("../example_data/pretrained_models/trained_unlabeled/", model_name)

# Parse the stored gin config before instantiating the model.
# NOTE(review): Encoder_Decoder_Model() is called without arguments here,
# presumably because gin supplies them from this config — confirm.
gin.parse_config_file(os.path.join(pretrained_dir, "gin.config"))
model = Encoder_Decoder_Model()

# map_location="cpu" lets a checkpoint saved from GPU tensors load on a machine
# without CUDA; load_state_dict copies the values into the model's own
# parameters, so the model stays on whatever device it was created on.
# torch.load unpickles the file: only load checkpoints from a trusted source.
state_dict = torch.load(os.path.join(pretrained_dir, "final_model"), map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

# Downstream task: Translation efficiency
# Change to a task-specific decoder

In [35]:
from RNARepLearn.modules import TE_Decoder

# Keep the pretrained encoder: take the first child of the wrapped container
# (the model printout lists the Seq_Struc_GNN at index 0).
encoder = next(model.model.children())

# Rebinding batch_size here also changes the value seen by later cells.
batch_size = 128

# NOTE(review): the decoder is built from the batch size and 512; 512 matches
# the encoder layer width in the model printout — confirm the argument meanings.
decoder = TE_Decoder(batch_size, 512)

# Rebind `model` to the pretrained encoder paired with the new decoder.
model = Encoder_Decoder_Model(encoder, decoder)

# Load mean-ribosome-load labeled data

In [36]:
# Load the UTR dataset for the GSM3130443 designed library.
# NOTE(review): "ribosomoe" in the path looks like a typo of "ribosome"; it is
# left untouched because it has to match the directory name on disk.
ds = GFileDatasetUTR(
    "../example_data/mean_ribosomoe_load",
    ["GSM3130443_designed_lib"],
)
# Display the dataset repr as the cell output.
ds


GFileDatasetUTR(100014)

In [37]:
# Random 0.8 / 0.1 / 0.1 split of the UTR dataset; this rebinds the
# train/val/test loaders created earlier for the Rfam sample.
train, val, test = random_train_val_test_loaders(
    ds,
    0.8,
    0.1,
    0.1,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=0,
)

Training:	80011
Test:	10001
Validation:	10002


# Translation efficiency (TE) prediction

In [38]:
from RNARepLearn.train import TETraining
# Set up training for the translation-efficiency task; this rebinds `training`,
# which previously held the MaskedTraining instance.
training = TETraining(
    model,
    10,    # NOTE(review): presumably the number of epochs — confirm
    None,  # NOTE(review): meaning not visible in this notebook — confirm
)

Learning rate: 2e-05
Creating new SummaryWriter
Model: 
Encoder_Decoder_Model(
  (model): Sequential(
    (0): Seq_Struc_GNN(
      (body): Sequential(
        (0): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(4, 512, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(4, 512)
      )
        (1): <function relu at 0x14833c207f60>
        (2): <function dropout at 0x14833c207b00>
        (3): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(512, 512, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(512, 512)
      )
        (4): <function relu at 0x14833c207f60>
        (5): <function dropout at 0x14833c207b00>
        (6): Sep_Seq_Struc_Layer(
        (seq_op): CNN_Seq(
          (body): Conv1d(512, 512, kernel_size=(9,), stride=(1,), padding=same)
        )
        (struc_op): GCNConv(512, 512)
      )
        (7): <function relu at 0x14833c207f60>
        (8): <functi