# Inference notebook 


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pickle
import shutil
import math
import pandas as pd
import gc
import os




In [3]:
test = pd.read_parquet('test_sequences.parquet')
print(test.shape)
test.head()

(1343823, 5)


Unnamed: 0,id_min,id_max,sequence_id,sequence,future
0,0,176,eee73c1836bc,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,0
1,177,353,d2a929af7a97,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,0
2,354,530,d39a4425ff45,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,0
3,531,707,1fc41e92d553,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,0
4,708,884,1d0826fb892f,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,0


In [5]:
test_sequences = test.sequence.to_numpy()
encoding_dict = {'A':1, 'C': 2, 'G': 3, 'U': 4}
encoding_dict

{'A': 1, 'C': 2, 'G': 3, 'U': 4}

In [8]:
DEBUG = False


In [9]:
max_len = 457 
test_sequences_encoded = []
for seq in test_sequences:
    test_sequences_encoded.append(
        np.concatenate([np.asarray([encoding_dict[x] for x in seq]), np.zeros((max_len - len(seq)))]).astype(np.float32))

In [10]:
test_ds = tf.data.Dataset.from_tensor_slices(test_sequences_encoded)
batch_size = 256
if DEBUG:
    test_ds = test_ds.take(8)
    batch_size = 2
#test_ds = test_ds.take(10000)

test_ds = test_ds.padded_batch(batch_size, padding_values=(0.0), padded_shapes=([max_len]), drop_remainder=False)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)
batch = next(iter(test_ds))
batch.shape

TensorShape([256, 457])

In [11]:
class transformer_block(tf.keras.layers.Layer):
    def __init__(self, dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=dim//num_heads)
        self.ffn = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(feed_forward_dim, activation="relu"),
                tf.keras.layers.Dense(dim),
            ]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.supports_masking = True

    def call(self, inputs, training, mask):
        att_mask = tf.expand_dims(mask, axis=-1)
        att_mask = tf.repeat(att_mask, repeats=tf.shape(att_mask)[1], axis=-1)

        attn_output = self.att(inputs, inputs, attention_mask = att_mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class positional_encoding_layer(tf.keras.layers.Layer):
    def __init__(self, num_vocab=5, maxlen=500, hidden_dim=384):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.pos_emb = self.positional_encoding(maxlen-1, hidden_dim)
        self.supports_masking = True

    def call(self, x):
        maxlen = tf.shape(x)[-2]
        x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.hidden_dim, tf.float32)))
        return x + self.pos_emb[:maxlen, :]

    def positional_encoding(self, maxlen, hidden_dim):
        depth = hidden_dim/2
        positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
        depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
        angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
        angle_rads = tf.linalg.matmul(positions, angle_rates)
        pos_encoding = tf.concat(
          [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
          axis=-1)
        return pos_encoding


In [14]:
X_max_len = 457
num_vocab = 5

def get_model(hidden_dim = 384, max_len = 206):
    inp = tf.keras.Input([None])
    x = inp

    x = tf.keras.layers.Embedding(num_vocab, hidden_dim, mask_zero=True)(x)
    x = positional_encoding_layer(num_vocab=num_vocab, maxlen=500, hidden_dim=hidden_dim)(x)

    x = transformer_block(hidden_dim, 6, hidden_dim*4)(x)
    x = transformer_block(hidden_dim, 6, hidden_dim*4)(x)
    x = transformer_block(hidden_dim, 6, hidden_dim*4)(x)
    x = transformer_block(hidden_dim, 6, hidden_dim*4)(x)


    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(2)(x)

    model = tf.keras.Model(inp, x)
    return model

In [15]:
model = get_model(hidden_dim = 192,max_len = max_len)
model.load_weights('model_weights/model_epoch_3.h5') 
#asendada õige failinimega
model(batch)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 192)         960       
                                                                 
 positional_encoding_layer_  (None, None, 192)         0         
 1 (positional_encoding_lay                                      
 er)                                                             
                                                                 
 transformer_block_12 (tran  (None, None, 192)         444864    
 sformer_block)                                                  
                                                                 
 transformer_block_13 (tran  (None, None, 192)         444864    
 sformer_block)                                            

In [19]:
test_samples = []
for data in test_ds.take(500):
    test_samples.append(data)

test_subset = tf.data.Dataset.from_tensor_slices(test_samples)

preds = model.predict(test_subset)



In [17]:
print(preds)

NameError: name 'preds' is not defined

In [20]:
preds_processed = []
for i, pred in enumerate(preds):
    preds_processed.append(pred[:len(test_sequences[i])])
concat_preds = np.concatenate(preds_processed)

In [35]:
submission = pd.DataFrame({'id':np.arange(0, len(concat_preds), 1), 'reactivity_DMS_MaP':concat_preds[:,1], 'reactivity_2A3_MaP':concat_preds[:,0]})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
0,0,0.132235,0.156113
1,1,0.121759,0.157653
2,2,0.115223,0.167442
3,3,0.495984,0.259817
4,4,0.50286,0.278011


In [38]:
submission.tail()

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
22655995,22655995,0.49543,0.219686
22655996,22655996,0.323599,0.138582
22655997,22655997,0.52419,0.211837
22655998,22655998,0.533429,0.215171
22655999,22655999,0.327038,0.138473


In [27]:
269796671-22656000

247140671

In [25]:
import shutil

# Replace 'source_file.csv' with the name of your source CSV file
# Replace 'destination_file.csv' with the name of the new copy
shutil.copyfile('submission.csv', 'sub1copy.csv')

'sub1copy.csv'

In [37]:
submission.shape()

TypeError: 'tuple' object is not callable

In [40]:
import csv

with open('submission.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for i in range(247140671):
        writer.writerow([{i + 22656000},0.533429,0.533429])

KeyboardInterrupt: 

In [34]:
submission.tail()

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
22655995,22655995,0.49543,0.219686
22655996,22655996,0.323599,0.138582
22655997,22655997,0.52419,0.211837
22655998,22655998,0.533429,0.215171
22655999,22655999,0.327038,0.138473


In [31]:
submission.head()

Unnamed: 0,id,reactivity_DMS_MaP,reactivity_2A3_MaP
0,0,0.132235,0.156113
1,1,0.121759,0.157653
2,2,0.115223,0.167442
3,3,0.495984,0.259817
4,4,0.50286,0.278011


In [32]:
file.close()

In [None]:


# Read the CSV file and insert an index column
with open('submission.csv', 'r', newline='') as infile:
    reader = csv.reader(infile, delimiter='\t')
    next(reader)  # Skip header if exists

    # Create a list of rows with an added index column
    rows_with_index = [[i + 1] + row for i, row in enumerate(reader)]

# Write modified rows with index to a new CSV file
with open('submission.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(rows_with_index)