<a href="https://colab.research.google.com/github/nanpolend/machine-learning/blob/master/Kaggle_Stanford_RNA_3D_Folding_%E2%80%94_%E5%AE%8C%E7%BE%8E%E7%89%88(%E7%B9%81%E9%AB%94%E4%B8%AD%E6%96%87%E5%AE%8C%E6%95%B4%E7%89%88).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
files.upload()  # Upload kaggle.json (the Kaggle API token) through the Colab file picker
# Copy the uploaded token into ~/.kaggle/ for the `kaggle` CLI used in later cells.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: readable/writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [13]:
!pip install scikit-learn -U



In [3]:
# 範例：下載 stanford-rna-3d-folding 資料集
!kaggle competitions download -c stanford-rna-3d-folding -p /content/data
!unzip /content/data/stanford-rna-3d-folding.zip -d /content/data

Archive:  /content/data/stanford-rna-3d-folding.zip
  inflating: /content/data/MSA/17RA_A.MSA.fasta  
  inflating: /content/data/MSA/1A1T_B.MSA.fasta  
  inflating: /content/data/MSA/1A4T_A.MSA.fasta  
  inflating: /content/data/MSA/1A51_A.MSA.fasta  
  inflating: /content/data/MSA/1A60_A.MSA.fasta  
  inflating: /content/data/MSA/1A9N_Q.MSA.fasta  
  inflating: /content/data/MSA/1AFX_A.MSA.fasta  
  inflating: /content/data/MSA/1ANR_A.MSA.fasta  
  inflating: /content/data/MSA/1AQO_A.MSA.fasta  
  inflating: /content/data/MSA/1ATO_A.MSA.fasta  
  inflating: /content/data/MSA/1ATV_A.MSA.fasta  
  inflating: /content/data/MSA/1ATW_A.MSA.fasta  
  inflating: /content/data/MSA/1AUD_B.MSA.fasta  
  inflating: /content/data/MSA/1B36_A.MSA.fasta  
  inflating: /content/data/MSA/1BAU_B.MSA.fasta  
  inflating: /content/data/MSA/1BGZ_A.MSA.fasta  
  inflating: /content/data/MSA/1BIV_A.MSA.fasta  
  inflating: /content/data/MSA/1BVJ_A.MSA.fasta  
  inflating: /content/data/MSA/1BZ2_A.MSA.fasta 

In [9]:
import pandas as pd
# Load the training and test inputs from their own files. sample_submission.csv
# is only the submission template and must not be used as train/test data.
# NOTE(review): file names follow the competition's data listing
# (train_sequences.csv / test_sequences.csv, labels in train_labels.csv) —
# confirm against the unzipped contents of /content/data.
train = pd.read_csv('/content/data/train_sequences.csv')
test = pd.read_csv('/content/data/test_sequences.csv')
# Data processing and model training go here.

In [None]:
# 假設生成 submission.csv
submission.to_csv('/content/sample_submission.csv', index=False)

# 提交結果
!kaggle competitions submit -c titanic -f /content/sample_submission.csv -m "My model submission"

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import math

# ==========================================================
# Positional Encoding definition
# ==========================================================
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold sines and
    odd feature columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine table is trimmed to d_model // 2 columns before assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module across devices but is not a parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.
        """
        return x + self.pe[:, :x.size(1)]

# ==========================================================
# Swin Transformer Block
# ==========================================================
class SwinTransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network and a LayerNorm.

    Note that, despite the name, no window partitioning or shifting is
    performed: ``window_size`` is only stored on the module and attention is
    computed over the whole sequence. The block has no residual connections;
    each stage replaces its input.

    Args:
        d_model: Feature dimension of the input and output.
        nhead: Number of attention heads.
        window_size: Stored as ``self.window_size``; not used by ``forward``.
        dropout: Dropout probability for attention and the feed-forward net.
    """

    def __init__(self, d_model=256, nhead=8, window_size=7, dropout=0.1):
        super().__init__()
        self.window_size = window_size
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, attention_mask):
        """Apply attention, feed-forward and normalisation.

        Args:
            x: Batch-first tensor ``(batch, seq, d_model)``.
            attention_mask: ``(batch, seq)`` mask that is truthy (True / non-zero)
                at real positions and falsy at padding. Boolean, integer and
                float masks are all accepted.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # `~` is only defined for boolean/integer tensors and is a bitwise NOT
        # on integers, so normalise to bool before inverting into the
        # "True means ignore" convention that key_padding_mask expects.
        key_padding_mask = ~attention_mask.to(torch.bool)
        x, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.ffn(x)
        return self.norm(x)

# ==========================================================
# Conformer Block with Multi-Scale Convolution
# ==========================================================
class ResConcatMultiScaleConformerBlock(nn.Module):
    """Multi-scale 1-D convolutions, then self-attention, feed-forward and LayerNorm.

    The input is passed through one ``Conv1d`` per kernel size. The branch
    outputs are concatenated on the feature axis and projected back to
    ``d_model`` by ``conv_proj`` so that the attention layer (which is built
    for ``d_model`` features) receives a tensor of the width it expects.

    Args:
        d_model: Feature dimension of the input and output.
        nhead: Number of attention heads.
        conv_kernel_sizes: Non-empty iterable of convolution kernel sizes.
        dropout: Dropout probability for attention and the feed-forward net.

    Raises:
        ValueError: If ``conv_kernel_sizes`` is empty.
    """

    def __init__(self, d_model, nhead, conv_kernel_sizes=(5, 15, 31), dropout=0.1):
        super().__init__()
        # Immutable default + local copy: the caller's sequence is never shared.
        conv_kernel_sizes = tuple(conv_kernel_sizes)
        if not conv_kernel_sizes:
            raise ValueError("conv_kernel_sizes must contain at least one kernel size")

        self.conv_layers = nn.ModuleList([
            nn.Conv1d(d_model, d_model, kernel_size=ks, padding=ks // 2) for ks in conv_kernel_sizes
        ])
        # Fuses the concatenated branches (d_model * n_kernels) back to d_model.
        self.conv_proj = nn.Linear(d_model * len(conv_kernel_sizes), d_model)
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x, attention_mask):
        """Run the block.

        Args:
            x: Batch-first tensor ``(batch, seq, d_model)``.
            attention_mask: ``(batch, seq)`` mask that is truthy at real
                positions and falsy at padding (bool, int or float).

        Returns:
            Tensor with the same shape as ``x``.
        """
        seq_len = x.size(1)
        channels_first = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq)

        # With padding = ks // 2 an even kernel yields seq_len + 1 outputs;
        # cropping to seq_len keeps every branch aligned for concatenation.
        conv_outs = [
            conv(channels_first)[..., :seq_len].transpose(1, 2) for conv in self.conv_layers
        ]
        x = self.conv_proj(torch.cat(conv_outs, dim=-1))

        # key_padding_mask uses "True means ignore"; normalise to bool first
        # because `~` is undefined for float tensors.
        key_padding_mask = ~attention_mask.to(torch.bool)
        x, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.ffn(x)
        return self.norm(x)

# ==========================================================
# RNA Swin + Conformer hybrid model
# ==========================================================
class RNA_SwinConformer(nn.Module):
    """Stack of alternating Swin and multi-scale Conformer blocks with a per-position head.

    The input is linearly embedded from 3 features to ``d_model``, positional
    encodings are added, and ``num_layers`` (Swin block, Conformer block) pairs
    are applied in sequence. The embedded input and the output of every pair
    are concatenated on the feature axis, projected back to ``d_model`` and fed
    to a small MLP head that emits 5 values per position.

    Args:
        num_layers: Number of (Swin, Conformer) block pairs.
        d_model: Hidden feature dimension.
        nhead: Number of attention heads in every block.
        window_size: Forwarded to each ``SwinTransformerBlock``.
        conv_kernel_sizes: Kernel sizes forwarded to each Conformer block.
        dropout: Dropout probability forwarded to every block.
    """

    def __init__(self, num_layers=8, d_model=256, nhead=8, window_size=7, conv_kernel_sizes=(5, 15, 31), dropout=0.1):
        super().__init__()
        # Immutable default + local copy so no list object is shared between instances.
        conv_kernel_sizes = tuple(conv_kernel_sizes)

        # Each position is expected to carry 3 input features.
        self.embedding = nn.Linear(3, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        self.swin_blocks = nn.ModuleList([
            SwinTransformerBlock(d_model, nhead, window_size, dropout) for _ in range(num_layers)
        ])
        self.conformer_blocks = nn.ModuleList([
            ResConcatMultiScaleConformerBlock(d_model, nhead, conv_kernel_sizes, dropout) for _ in range(num_layers)
        ])

        # num_layers block outputs plus the embedded input are concatenated in forward().
        self.final_proj = nn.Linear(d_model * (num_layers + 1), d_model)
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 128),
            nn.GELU(),
            nn.Linear(128, 5)  # Assuming 5 structure categories
        )

    def forward(self, x, attention_mask):
        """Compute per-position outputs.

        Args:
            x: Tensor ``(batch, seq, 3)``.
            attention_mask: ``(batch, seq)`` mask that is truthy at real
                positions and falsy at padding (bool, int or float).

        Returns:
            Tensor ``(batch, seq, 5)``.
        """
        # The blocks invert the mask with `~`, which is undefined for float
        # tensors and bitwise on integers; convert once at the entry point.
        attention_mask = attention_mask.to(torch.bool)

        x = self.embedding(x)
        x = self.pos_encoder(x)

        features = [x]  # Embedded input plus the output of every block pair

        for swin_block, conformer_block in zip(self.swin_blocks, self.conformer_blocks):
            x = swin_block(x, attention_mask)
            x = conformer_block(x, attention_mask)
            features.append(x)

        x = torch.cat(features, dim=-1)  # (batch, seq, d_model * (num_layers + 1))
        x = self.final_proj(x)

        return self.head(x)

# ==========================================================
# RNA Dataset class (CSV format)
# ==========================================================
class RNADataset(Dataset):
    """CSV-backed dataset yielding fixed-length encoded sequence/structure pairs.

    The CSV is read with pandas and must provide ``sequence`` and ``structure``
    string columns. Each string is encoded character-by-character with
    ``ord``, truncated to ``max_len`` and right-padded with zeros.

    Args:
        data_path: Path of the CSV file.
        max_len: Length every returned tensor is truncated/padded to.
    """

    def __init__(self, data_path, max_len=1000):
        self.data = pd.read_csv(data_path)  # Assuming CSV format
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def _encode(self, text):
        """Return ``text`` as float32 code points, truncated/zero-padded to ``max_len``."""
        codes = np.array([ord(char) for char in text[:self.max_len]], dtype=np.float32)
        return np.pad(codes, (0, self.max_len - len(codes)), 'constant', constant_values=0)

    def __getitem__(self, idx):
        """Return ``(sequence, structure, attention_mask)`` for row ``idx``.

        Returns:
            sequence: float32 tensor ``(max_len,)``.
            structure: float32 tensor ``(max_len,)``.
            attention_mask: bool tensor ``(max_len,)``; True at real sequence
                positions, False at padding.
        """
        row = self.data.iloc[idx]
        sequence = row['sequence']
        structure = row['structure']

        # Each string is padded on its own length so both tensors are always
        # exactly max_len long, even when the two columns differ in length.
        sequence_input = self._encode(sequence)
        structure_input = self._encode(structure)

        # The valid length must be taken from the unpadded sequence; measuring
        # it after padding would always give max_len and mask nothing.
        valid_len = min(len(sequence), self.max_len)
        attention_mask = np.zeros(self.max_len, dtype=bool)
        attention_mask[:valid_len] = True

        return torch.tensor(sequence_input), torch.tensor(structure_input), torch.tensor(attention_mask)

# ==========================================================
# Training and Testing Setup
# ==========================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNA_SwinConformer().to(device)

# Replace with your actual file paths
# NOTE(review): the earlier cells unpack the data under /content/data and the
# dataset class reads 'sequence' and 'structure' columns — confirm that these
# paths exist and that the files really contain both columns.
train_dataset = RNADataset('/kaggle/input/stanford-rna-3d-folding/train.csv')  # Path to your train.csv
test_dataset = RNADataset('/kaggle/input/stanford-rna-3d-folding/test.csv')    # Path to your test.csv

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# NOTE(review): two mismatches between the dataset and the model remain and
# must be resolved before this loop can run end to end:
#   * the dataset yields (batch, seq) inputs while the model embeds 3 features
#     per position, i.e. expects (batch, seq, 3);
#   * the targets are ord() code points, but CrossEntropyLoss needs class
#     indices in [0, num_classes).

# Train the model
model.train()
num_epochs = 10  # Number of training epochs
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for inputs, targets, attention_mask in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # The attention layers need a boolean mask (True = real position).
        attention_mask = attention_mask.to(device).bool()

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask)  # (batch, seq, num_classes)

        # CrossEntropyLoss expects (N, C) logits with integer (N,) targets.
        # Indexing with the mask flattens batch/seq and drops padded positions
        # so that padding does not contribute to the loss.
        loss = criterion(outputs[attention_mask], targets[attention_mask].long())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # One line per epoch (mean batch loss) instead of one line per batch.
    print(f"Epoch {epoch}, Loss: {running_loss / max(num_batches, 1)}")

# Evaluate the model
model.eval()
with torch.no_grad():
    # The dataset always yields (sequence, structure, mask); the structure
    # tensor is not needed for inference.
    for inputs, _, attention_mask in test_loader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device).bool()
        outputs = model(inputs, attention_mask)
        # Post-processing (e.g., decoding) can be added here
