In [18]:
import os
import gc
import torch
import pickle
import importlib
from importlib import reload
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import dataloader
from dataloader import RNA_Dataset, RNA_Sub_Dataset, LenMatchBatchSampler, DeviceDataLoader

import model
from model import RNA_Model

import metrics
from metrics import MAE, loss

from fastai.vision.data import  DataLoaders
from fastai.vision.all import Learner, GradientClip

In [7]:
fname = 'example0'
PATH = '/scratch/lemercier/WIP_data/'
OUT = './'
bs = 256
num_workers = 2
nfolds = 4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
seq_pred = pd.read_csv(os.path.join(PATH,'test_sequences.csv'))
public = seq_pred.query('future==0')
sub = pd.read_csv(os.path.join(PATH,'sample_submission.csv'))

In [20]:
res = []

# prepare data
ds_sub = RNA_Sub_Dataset(public)
ds_len = RNA_Sub_Dataset(public, mask_only=True)
sampler = torch.utils.data.SequentialSampler(ds_len)
len_sampler = LenMatchBatchSampler(sampler, batch_size=bs,
            drop_last=False)
dl_sub = DeviceDataLoader(torch.utils.data.DataLoader(ds_sub, 
           batch_sampler=len_sampler, num_workers=num_workers), device)
gc.collect()

# load model
model = RNA_Model()
model.load_state_dict(torch.load('/scratch/lemercier/model.pth'))
model.eval()

model = model.to(device)

for batch in tqdm(dl_sub):   

    with torch.no_grad():
        pred = model.forward(batch[0])

        b = pred[1].shape[0]
        reac = pred[1][batch[1]['mask'][:,:pred[1].shape[1]]].reshape(b,-1,2).detach().cpu().numpy()

        # fill results
        id_mins = batch[0]['id_min'].detach().cpu().numpy()
        id_maxs = batch[0]['id_max'].detach().cpu().numpy()
        
        for i in range(b):
            id_ = np.arange(id_mins[i], id_maxs[i]+1)
            data = np.stack([id_, reac[i,:,1], reac[i,:,0]]).T
            res.append(pd.DataFrame(data, columns=['id','reactivity_DMS_MaP','reactivity_2A3_MaP']))

gc.collect()

100%|██████████████████████████████████████████████████████████████| 1312/1312 [03:30<00:00,  6.23it/s]


0

In [21]:
res_ = pd.concat(res)
df = res_.astype({'id': 'int'})
df

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
0,0,-0.007659,0.086491
1,1,-0.010308,0.077308
2,2,-0.010237,0.071179
3,3,0.023897,0.017810
4,4,0.036157,0.031405
...,...,...,...
172,59440666,0.134237,0.491965
173,59440667,0.133345,0.496647
174,59440668,0.118865,0.465841
175,59440669,0.114309,0.493537


In [22]:
full_sub = pd.read_csv(os.path.join(PATH,'sample_submission.csv'))

In [23]:
full_sub.id.max()

269796670

In [24]:
df.id.iloc[5]

5

In [25]:
new_res = pd.concat([df, full_sub.iloc[df.id.iloc[-1]+1:]])

In [26]:
new_res.shape

(269796671, 3)

In [27]:
new_res.to_parquet('/scratch/lemercier/mysubmissionname.parquet', index=False)

In [71]:
new_res.iloc[df.id.iloc[-1]]

id                    5.944067e+07
reactivity_DMS_MaP    4.477028e-01
reactivity_2A3_MaP    4.526522e-01
Name: 176, dtype: float64

In [28]:
check = pd.read_parquet('/scratch/lemercier/mysubmissionname.parquet')

In [29]:
check

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
0,0,-0.007659,0.086491
1,1,-0.010308,0.077308
2,2,-0.010237,0.071179
3,3,0.023897,0.017810
4,4,0.036157,0.031405
...,...,...,...
269796666,269796666,0.000000,0.000000
269796667,269796667,0.000000,0.000000
269796668,269796668,0.000000,0.000000
269796669,269796669,0.000000,0.000000
