In [None]:
# Stanford RNA 3D Folding Competition Notebook
# https://www.kaggle.com/code/olaflundstrom/stanford-rna-3d-folding-competition-notebook

In [2]:
# 基本ライブラリ
import os 

# EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, BatchNormalization, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Fix the TensorFlow and NumPy global seeds so repeated runs are comparable
tf.random.set_seed(42)
np.random.seed(42)

In [7]:
# Locations of the competition CSV files (relative to the notebook)
TRAIN_SEQ_PATH = "./data/train_sequences.csv"
TRAIN_LABELS_PATH = "./data/train_labels.csv"
VALID_SEQ_PATH = "./data/validation_sequences.csv"
VALID_LABELS_PATH = "./data/validation_labels.csv"
TEST_SEQ_PATH = "./data/test_sequences.csv"
SAMPLE_SUB_PATH = "./data/sample_submission.csv"

# Read every table; missing label values are replaced with 0 as they are loaded
train_sequences_df = pd.read_csv(TRAIN_SEQ_PATH)
train_labels_df = pd.read_csv(TRAIN_LABELS_PATH).fillna(0)
valid_sequences_df = pd.read_csv(VALID_SEQ_PATH)
valid_labels_df = pd.read_csv(VALID_LABELS_PATH).fillna(0)
test_sequences_df = pd.read_csv(TEST_SEQ_PATH)
sample_submission_df = pd.read_csv(SAMPLE_SUB_PATH)

# Report the (rows, columns) shape of each loaded table
for caption, frame in (
    ("Train Sequences Shape:\t", train_sequences_df),
    ("Train Labels Shape:\t", train_labels_df),
    ("Validation Sequences Shape:\t", valid_sequences_df),
    ("Validation labels Shape:\t", valid_labels_df),
    ("Test Sequences Shape:\t", test_sequences_df),
):
    print(caption, frame.shape)

# Preview the first rows of the training tables
print("\nTrain sequences Head:")
display(train_sequences_df.head())
print("\nTrain Labels Head:")
display(train_labels_df.head())


Train Sequences Shape:	 (844, 5)
Train Labels Shape:	 (137095, 6)
Validation Sequences Shape:	 (12, 5)
Validation labels Shape:	 (2515, 123)
Test Sequences Shape:	 (12, 5)

Train sequences Head:


Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC,1995-01-26,"THE SARCIN-RICIN LOOP, A MODULAR RNA",>1SCL_1|Chain A|RNA SARCIN-RICIN LOOP|Rattus n...
1,1RNK_A,GGCGCAGUGGGCUAGCGCCACUCAAAAGGCCCAU,1995-02-27,THE STRUCTURE OF AN RNA PSEUDOKNOT THAT CAUSES...,>1RNK_1|Chain A|RNA PSEUDOKNOT|null\nGGCGCAGUG...
2,1RHT_A,GGGACUGACGAUCACGCAGUCUAU,1995-06-03,24-MER RNA HAIRPIN COAT PROTEIN BINDING SITE F...,>1RHT_1|Chain A|RNA (5'-R(P*GP*GP*GP*AP*CP*UP*...
3,1HLX_A,GGGAUAACUUCGGUUGUCCC,1995-09-15,P1 HELIX NUCLEIC ACIDS (DNA/RNA) RIBONUCLEIC ACID,>1HLX_1|Chain A|RNA (5'-R(*GP*GP*GP*AP*UP*AP*A...
4,1HMH_E,GGCGACCCUGAUGAGGCCGAAAGGCCGAAACCGU,1995-12-07,THREE-DIMENSIONAL STRUCTURE OF A HAMMERHEAD RI...,">1HMH_1|Chains A, C, E|HAMMERHEAD RIBOZYME-RNA..."



Train Labels Head:


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,1SCL_A_1,G,1,13.76,-25.974001,0.102
1,1SCL_A_2,G,2,9.31,-29.638,2.669
2,1SCL_A_3,G,3,5.529,-27.813,5.878
3,1SCL_A_4,U,4,2.678,-24.900999,9.793
4,1SCL_A_5,G,5,1.827,-20.136,11.793


In [None]:
# Data preprocessing

# Sequence encoding (A: 1, C: 2, G: 3, U: 4); 0 is reserved for any other symbol.
# The cytosine key must be uppercase 'C' to match the uppercase sequences in the data.
nucleotide_map = {'A': 1, 'C': 2, 'G': 3, 'U': 4}

def encode_sequence(seq):
    """
    Encode an RNA sequence as a list of integers using nucleotide_map.

    Args:
        seq: iterable of single-character nucleotide symbols (e.g. "GGGUGC").

    Returns:
        A list with one integer per symbol: A=1, C=2, G=3, U=4, and 0 for
        any symbol that is not a key of nucleotide_map.
    """
    return [nucleotide_map.get(ch, 0) for ch in seq]

# Add an integer-encoded copy of the 'sequence' column to each of the three tables
for frame in (train_sequences_df, valid_sequences_df, test_sequences_df):
    frame['encoded'] = frame['sequence'].apply(encode_sequence)

In [None]:
# Label data processing
def process_labels(labels_df):
    """
    Group the rows of a labels DataFrame by target and collect their coordinates.

    Each 'ID' value has the form "<target_id>_<resid>" (e.g. "1SCL_A_1"); the
    target id may itself contain underscores, so only the last underscore is
    used as the separator.

    Args:
        labels_df: DataFrame with the columns 'ID', 'x_1', 'y_1' and 'z_1'.

    Returns:
        A dict mapping each target_id to a float32 array of shape
        (number_of_residues, 3), with rows ordered by residue number.

    Raises:
        ValueError: if the text after the last underscore of an ID is not an integer.
    """
    residues_by_target = {}
    rows = zip(labels_df['ID'], labels_df['x_1'], labels_df['y_1'], labels_df['z_1'])
    for full_id, x, y, z in rows:
        # Split on the LAST underscore: "1SCL_A_12" -> ("1SCL_A", "12")
        target_id, _, resid_text = str(full_id).rpartition('_')
        resid = int(resid_text)
        residues_by_target.setdefault(target_id, []).append((resid, (x, y, z)))

    label_dict = {}
    for target_id, residues in residues_by_target.items():
        # Order residues by residue number before stacking (stable for duplicates)
        residues.sort(key=lambda item: item[0])
        label_dict[target_id] = np.array(
            [xyz for _, xyz in residues], dtype=np.float32
        )
    return label_dict
        
# Run: build the per-target coordinate lookups for the training and validation labels
train_labels_dict_df, valid_labels_dict_df = (
    process_labels(train_labels_df),
    process_labels(valid_labels_df),
)

In [None]:
# Dataset creation and padding

def create_dataset(sequences_df, labels_dict):
    """
    Pair each encoded sequence with the coordinates of the same target.

    Rows whose 'target_id' is not a key of labels_dict are skipped.

    Args:
        sequences_df: DataFrame with the columns 'target_id' and 'encoded'.
        labels_dict: mapping from target_id to that target's coordinate array.

    Returns:
        X: list of encoded sequences,
        y: list of coordinate arrays,
        target_ids: list of target ids,
        all three in the row order of sequences_df.
    """
    X, y, target_ids = [], [], []
    # Iterate over the two needed columns directly instead of building a Series per row
    for tid, encoded in zip(sequences_df['target_id'], sequences_df['encoded']):
        if tid in labels_dict:
            X.append(encoded)
            y.append(labels_dict[tid])
            target_ids.append(tid)
    return X, y, target_ids

# Run: build the training and validation datasets
X_train, y_train, train_ids = create_dataset(train_sequences_df, train_labels_dict_df)
X_valid, y_valid, valid_ids = create_dataset(valid_sequences_df, valid_labels_dict_df)

# Determine the maximum sequence length from the training dataset
max_len = max(len(seq) for seq in X_train)
print("Maximum sequence length (train):", max_len)

# Pad the sequences to max_len with trailing zeros.
# truncating='post' is set explicitly: the Keras default ('pre') would drop the
# START of any sequence longer than max_len, so position i of the padded sequence
# would no longer correspond to residue i of its coordinate array.
X_train_pad = pad_sequences(
    X_train, maxlen=max_len, padding='post', truncating='post', value=0
)
X_valid_pad = pad_sequences(
    X_valid, maxlen=max_len, padding='post', truncating='post', value=0
)

# Coordinate padding helper
def pad_coordinates(coord_array, max_len):
    """
    Pad or trim a 2-D coordinate array to exactly max_len rows.

    Args:
        coord_array: array whose first axis is the residue axis; it must be
            2-D because the padding is specified for two axes.
        max_len: number of rows the result must have.

    Returns:
        An array with max_len rows and the same number of columns as the
        input. Shorter inputs get zero rows appended at the end; longer
        inputs keep only their first max_len rows, so the output shape is
        the same for every input.
    """
    n_residues = coord_array.shape[0]
    if n_residues >= max_len:
        # Trim the tail so over-long inputs do not break the fixed output shape
        return coord_array[:max_len]
    # Append zero rows after the last residue; the column axis is left untouched
    pad_width = ((0, max_len - n_residues), (0, 0))
    return np.pad(coord_array, pad_width, mode='constant', constant_values=0)
    
