# Off-target Prediction: SVM + LSTM
This notebook loads `sample_dna_sequences.csv`, one-hot encodes each DNA sequence, and trains a linear SVM and a small LSTM to predict the off-target label.

In [None]:
# !pip install numpy pandas scikit-learn tensorflow matplotlib

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

# Simple one-hot for A,T,C,G: maps each base to its column index.
vocab = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

def one_hot_seq(seq, max_len=30):
    """Return a fixed-size one-hot encoding of a DNA sequence.

    Args:
        seq: String of nucleotide symbols. Matching against ``vocab`` is
            case-insensitive, so ``"acgt"`` encodes the same as ``"ACGT"``.
        max_len: Number of rows in the output. Longer sequences are
            truncated; shorter ones are right-padded with all-zero rows.

    Returns:
        A ``float32`` array of shape ``(max_len, len(vocab))``. Row ``i`` has
        a single 1.0 in the column of the i-th base. Symbols that are not in
        ``vocab`` (for example the ambiguity code ``N``) are left as all-zero
        rows instead of raising ``KeyError``.
    """
    x = np.zeros((max_len, len(vocab)), dtype=np.float32)
    for i, ch in enumerate(seq[:max_len]):
        # Upper-case per character so the position index stays aligned with
        # the original string even for exotic characters.
        col = vocab.get(ch.upper())
        if col is not None:
            x[i, col] = 1.0
    return x

# Read the labelled sequences and one-hot encode every row.
df = pd.read_csv('../datasets/sample_dna_sequences.csv')
X = np.stack(list(map(one_hot_seq, df['sequence'].values)))  # (N, L, 4)
y = df['label_offtarget'].values

# The SVM expects a 2-D design matrix, so collapse each (L, 4) encoding
# into a single feature row before the stratified 80/20 split.
X_flat = X.reshape(X.shape[0], -1)
X_train, X_test, y_train, y_test = train_test_split(
    X_flat,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a linear-kernel SVM and report its performance on the held-out rows.
svm = SVC(kernel='linear').fit(X_train, y_train)
pred = svm.predict(X_test)
print("SVM accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


In [None]:
# LSTM model
import tensorflow as tf
from tensorflow.keras import layers, models

# The LSTM consumes the un-flattened one-hot tensors, shape (N, L, 4).
X_full = X
X_tr, X_te, y_tr, y_te = train_test_split(
    X_full,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# A single recurrent layer feeding one sigmoid unit for the binary label.
model = models.Sequential([
    layers.Input(shape=X_full.shape[1:]),
    layers.LSTM(64),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 20% of the training rows are held out by Keras for per-epoch validation.
hist = model.fit(
    X_tr,
    y_tr,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
    verbose=1,
)

# Score on the test split, which was not passed to fit().
loss, acc = model.evaluate(X_te, y_te, verbose=0)
print("LSTM accuracy:", acc)
