In [None]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# Import feature code

In [None]:

import sys, os

# Put the notebook's working directory on the import path so a local
# rna_features.py next to the notebook can be imported.
ROOT = os.path.abspath(".")
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

# Resolve `rna` to whichever API the local rna_features module exposes.
# Attempts, in order:
#   1. a class named RNAFeatures  -> instance
#   2. a class named rna_features -> instance
#   3. the module itself (module-level functions)
# The fallback chain is deliberately best-effort, but every fallback now
# reports WHY the previous attempt failed, so a broken constructor is no
# longer silently masked by a different API being picked up.
try:
    # Preferred: class in snake_case file
    from rna_features import RNAFeatures as RF
    rna = RF()
    print("Using RNAFeatures class API")
except Exception as camel_case_error:
    print(f"RNAFeatures class API unavailable: {camel_case_error!r}")
    try:
        from rna_features import rna_features as RF
        rna = RF()
        print("Using rna_features class API")
    except Exception as snake_case_error:
        print(f"rna_features class API unavailable: {snake_case_error!r}")
        # Fall back to function API. If the module itself cannot be
        # imported, this line raises and the cell fails loudly.
        import rna_features as rna
        print("Using module-level function API")


# Load data

In [None]:

DATA_PATH = "sequences.csv"

# Load the raw table; it must provide a "sequence" and a "label" column.
df = pd.read_csv(DATA_PATH)
assert {"sequence","label"}.issubset(df.columns), df.columns

# Rows missing either field cannot be used for supervised training.
df = df.dropna(subset=["sequence","label"]).reset_index(drop=True)

# Always encode labels as contiguous integer codes 0..n_classes-1.
# Previously only non-numeric labels were encoded, so numeric labels such as
# {1, 2} or {-1, 1} were passed through unchanged; the Keras losses used
# further down (binary / sparse categorical cross-entropy) need 0-based
# class indices. Labels that already are 0..n-1 keep the same values,
# because category codes follow the sorted order of the distinct labels.
label_categories = df["label"].astype("category")
LABEL_CLASSES = list(label_categories.cat.categories)  # code i -> original label
df["label"] = label_categories.cat.codes

df.head()

# Train/validation split

In [None]:
# Raw inputs: labels as an array, sequences coerced to plain Python strings.
y = df["label"].values
X_text = list(df["sequence"].astype(str))

# Hold out 20% of the rows for validation. Stratifying on the labels keeps
# the class proportions the same in both partitions, and the fixed seed
# makes the split repeatable.
X_text_train, X_text_val, y_train, y_val = train_test_split(
    X_text,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

# Show the partition sizes as (n_train, n_val).
tuple(map(len, (X_text_train, X_text_val)))

# Extract RNA features (k-mers)

In [None]:
K = 3  # k-mer length; try 2 or 3 (3 is common)

# Both the class instance and the module fallback are expected to expose
# `kmer_matrix`. The old if/else had two identical branches, so the check
# only decided where the AttributeError came from; fail once, clearly.
if not hasattr(rna, "kmer_matrix"):
    raise AttributeError(
        f"{rna!r} has no 'kmer_matrix'; adapt this cell to the API that "
        "rna_features actually provides"
    )

# Same extraction settings for both partitions.
# NOTE(review): assumes kmer_matrix returns (column_names, matrix) for
# return_format="matrix" -- confirm against rna_features.
cols, X_train = rna.kmer_matrix(X_text_train, k=K, normalize=True, return_format="matrix")
val_cols, X_val = rna.kmer_matrix(X_text_val, k=K, normalize=True, return_format="matrix")

# The validation column names used to be discarded. If the extractor ever
# derives its columns from the sequences it is given, train and validation
# features would be silently misaligned, so verify the layout matches.
if list(val_cols) != list(cols):
    raise ValueError(
        "k-mer columns differ between the training and validation matrices; "
        "features would be misaligned"
    )

X_train = np.asarray(X_train, dtype=np.float32)
X_val   = np.asarray(X_val,   dtype=np.float32)

print("Feature shape:", X_train.shape, "columns:", len(cols))


# Basic feature selection

In [None]:
# Standardize, then keep the (at most) 256 features with the highest
# mutual information with the label.

MI_SEED = 42  # same value as the random_state of the train/validation split


def _seeded_mutual_info(X, y):
    """Score features by mutual information using a fixed random seed.

    mutual_info_classif is stochastic unless `random_state` is given, so
    without the seed the set of selected columns can change between runs.
    Returns the array of per-feature scores expected by SelectKBest.
    """
    return mutual_info_classif(X, y, random_state=MI_SEED)


# with_mean=False: scale each feature to unit variance without centering it.
scaler = StandardScaler(with_mean=False)
selector = SelectKBest(score_func=_seeded_mutual_info, k=min(256, X_train.shape[1]))

# Fit on train only, so no validation statistics leak into the transforms.
X_train_std = scaler.fit_transform(X_train)
X_train_sel = selector.fit_transform(X_train_std, y_train)

# Apply the already-fitted transforms to the validation set.
X_val_std = scaler.transform(X_val)
X_val_sel = selector.transform(X_val_std)

X_train_sel.shape, X_val_sel.shape


# Build a small neural network (Keras)

In [None]:


# Seed the random number generators before building the network so that
# weight initialisation, dropout masks and batch shuffling are repeatable
# under Restart & Run All. Bit-for-bit equality is still not guaranteed on
# every hardware/backend combination.
TRAIN_SEED = 42
if hasattr(keras.utils, "set_random_seed"):
    # Seeds Python's `random`, NumPy and the backend in one call.
    keras.utils.set_random_seed(TRAIN_SEED)
else:
    # Older TensorFlow releases without the helper.
    np.random.seed(TRAIN_SEED)
    tf.random.set_seed(TRAIN_SEED)

input_dim = X_train_sel.shape[1]
num_classes = len(np.unique(y_train))

# Exactly two classes -> one sigmoid unit with binary cross-entropy;
# otherwise one softmax unit per class with sparse categorical
# cross-entropy (integer class labels).
is_binary = num_classes == 2

model = keras.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1 if is_binary else num_classes, activation="sigmoid" if is_binary else "softmax"),
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy" if is_binary else "sparse_categorical_crossentropy",
    metrics=["AUC"] if is_binary else ["accuracy"],
)

# Stop once val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
cb = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor="val_loss")
]

history = model.fit(
    X_train_sel, y_train,
    validation_data=(X_val_sel, y_val),
    epochs=100,
    batch_size=64,
    callbacks=cb,
    verbose=1
)


In [None]:
# Score the validation set once; how the output is read depends on the head.
y_prob = model.predict(X_val_sel)

if num_classes != 2:
    # Multi-class: one probability column per class -> take the most likely.
    y_pred = y_prob.argmax(axis=1)
else:
    # Binary: a single sigmoid column -> flatten it, threshold at 0.5 and
    # additionally report ROC-AUC on the raw probabilities.
    y_prob = y_prob.ravel()
    y_pred = (y_prob >= 0.5).astype(int)
    print("ROC-AUC:", roc_auc_score(y_val, y_prob))

# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_val, y_pred))


# Evaluate