In [19]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf, gc
from tensorflow import keras
from tensorflow.keras import layers
import joblib
from scipy import sparse
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# load datasets
n_samples = 1125678
embedding_dim = 768
sarcasm_feats = pd.read_csv("/content/drive/My Drive/cs3244/features/sarcasm_features.csv")
word_vectorizer = joblib.load("/content/drive/My Drive/cs3244/features/word_vectorizer.pkl")
char_vectorizer = joblib.load("/content/drive/My Drive/cs3244/features/char_vectorizer.pkl")
embeddings = np.memmap("/content/drive/My Drive/cs3244/features/embeddings_memmap.npy",dtype='float32',mode='r',shape=(n_samples, embedding_dim))

In [22]:
sarcasm_feats['text'] = sarcasm_feats['text'].fillna('')
# transform text using pre-fitted TF-IDF vectorizers
X_word = word_vectorizer.transform(sarcasm_feats['text'])
X_char = char_vectorizer.transform(sarcasm_feats['text'])

In [23]:
y = sarcasm_feats['label'].values

idx_all = np.arange(len(y))
idx_tr, idx_va, y_tr, y_va = train_test_split(idx_all, y, test_size = 0.2, random_state = 42, stratify = y)

In [24]:
# transform embeddings so collab don't crash :(
pca_components = 100    # reduce to 100 dimensions
batch_size_pca = 1024   # batch size for incremental PCA

ipca = IncrementalPCA(n_components = pca_components, batch_size = batch_size_pca)

for start in range(0, n_samples, batch_size_pca):
    end = min(start + batch_size_pca, n_samples)
    ipca.partial_fit(embeddings[start:end])

embeddings_pca = np.zeros((n_samples, pca_components), dtype = np.float32)
for start in range(0, n_samples, batch_size_pca):
    end = min(start + batch_size_pca, n_samples)
    embeddings_pca[start:end] = ipca.transform(embeddings[start:end])

In [25]:
X_sarcasm = sarcasm_feats.drop(columns = ['label', 'text']).select_dtypes(include = np.number).values

scaler = StandardScaler()
scaler.fit(X_sarcasm[idx_tr])
X_sarcasm_scaled = scaler.transform(X_sarcasm)

In [26]:
# LSTM config to stay within RAM limits
SEED                 = 42
LSTM_MAX_VOCAB       = 20000
LSTM_MAX_LEN         = 48
EMB_DIM_FINAL        = 96
LSTM_UNITS_FINAL     = 48
FUSION_DENSE_FINAL   = 192
DROP_SEQ_FINAL       = 0.30
DROP_FUSION_FINAL    = 0.30
SVD_WORD_COMP        = 64
SVD_CHAR_COMP        = 64
BATCH_SIZE           = 64
EPOCHS               = 1
SUBSET_FOR_FIT       = min(100_000, len(idx_tr))

# ensure integer labels
if np.issubdtype(np.array(y).dtype, np.number):
    y_all = y.astype(int)
else:
    _le = LabelEncoder()
    y_all = _le.fit_transform(y)
NUM_CLASSES = int(np.unique(y_all).size)

In [27]:
# fit tokenizer & SVDs on a small subset only
rng = np.random.default_rng(SEED)
if len(idx_tr) <= SUBSET_FOR_FIT:
    fit_sub = idx_tr
else:
    fit_sub = rng.choice(idx_tr, size=SUBSET_FOR_FIT, replace = False)

# run tokenizer on subset texts
_tok = keras.preprocessing.text.Tokenizer(num_words = LSTM_MAX_VOCAB, lower = True, oov_token = "<OOV>")
_tok.fit_on_texts(sarcasm_feats.loc[fit_sub, "text"].fillna("").tolist())

def _to_seq_batch(texts, max_len = LSTM_MAX_LEN):
    seqs = _tok.texts_to_sequences(texts)
    return keras.preprocessing.sequence.pad_sequences(seqs, maxlen = max_len, padding = "post", truncating = "post")

# SVDs on subset (sparse -> low mem)
svd_w = TruncatedSVD(n_components = SVD_WORD_COMP, random_state = SEED)
svd_c = TruncatedSVD(n_components = SVD_CHAR_COMP, random_state = SEED)
svd_w.fit(X_word[fit_sub])
svd_c.fit(X_char[fit_sub])

# dimensions for model inputs
if X_sarcasm_scaled.size:
    NUM_DIM = X_sarcasm_scaled.shape[1]
else:
    NUM_DIM = 0

EMB_DIM_REDUCED = embeddings_pca.shape[1]

In [28]:
# streaming Sequence
class HybridSequence(keras.utils.Sequence):
    def __init__(self, indices, batch_size, shuffle = False, class_weight = None):
        self.indices = np.array(indices, dtype = np.int64)
        self.batch_size = int(batch_size)
        self.shuffle = bool(shuffle)
        self.rng = np.random.default_rng(SEED)
        self.class_weight = class_weight
        self.on_epoch_end()

    def __len__(self):
        return math.ceil(len(self.indices) / self.batch_size)

    def on_epoch_end(self):
        if self.shuffle:
            self.rng.shuffle(self.indices)

    def __getitem__(self, idx):
        start = idx * self.batch_size
        end = min(start + self.batch_size, len(self.indices))
        sl = self.indices[start:end]

        # A) text -> sequences (batch)
        texts = sarcasm_feats.loc[sl, "text"].fillna("").tolist()
        seq_batch = _to_seq_batch(texts).astype("int32")

        # B) TF-IDF -> SVD (batch; cast to float32)
        w_svd_batch = svd_w.transform(X_word[sl]).astype("float32")
        c_svd_batch = svd_c.transform(X_char[sl]).astype("float32")

        # C) numeric (already scaled)
        if NUM_DIM > 0:
            num_batch = X_sarcasm_scaled[sl].astype("float32")
        else:
            num_batch = None

        # D) external embeddings (already IPCA-reduced)
        emb_batch = embeddings_pca[sl].astype("float32")

        # labels
        y_batch = y_all[sl].astype("int32")

        inputs = (seq_batch, w_svd_batch, c_svd_batch, emb_batch) if NUM_DIM == 0 \
                 else (seq_batch, w_svd_batch, c_svd_batch, emb_batch, num_batch)

        if self.class_weight is not None:
            sw = np.asarray([self.class_weight[int(k)] for k in y_batch], dtype = "float32")
            return (inputs, y_batch, sw)  # sample weights supported by Sequence
        else:
            return (inputs, y_batch)

# per-class weights from train labels (used as sample weights)
counts = np.bincount(y_all[idx_tr])
class_weight_map = {i: float(counts.max()/max(c,1)) for i, c in enumerate(counts)}

# build sequences
train_seq = HybridSequence(idx_tr, batch_size = BATCH_SIZE, shuffle = True,  class_weight = class_weight_map)
val_seq   = HybridSequence(idx_va, batch_size = BATCH_SIZE, shuffle = False, class_weight = None)


In [29]:
# build hybrid model
inp_seq  = layers.Input(shape = (LSTM_MAX_LEN,), name = "seq_input")
inp_wsvd = layers.Input(shape = (SVD_WORD_COMP,), name = "tfidf_word_svd")
inp_csvd = layers.Input(shape = (SVD_CHAR_COMP,), name = "tfidf_char_svd")
inp_emb  = layers.Input(shape = (EMB_DIM_REDUCED,), name = "ext_emb_ipca")
if NUM_DIM > 0:
    inp_num = layers.Input(shape = (NUM_DIM,), name = "numeric")
else:
    inp_num = None

# A: sequence tower
xa = layers.Embedding(
    input_dim = min(LSTM_MAX_VOCAB, len(_tok.word_index) + 2),
    output_dim = EMB_DIM_FINAL,
    mask_zero = True,
)(inp_seq)
xa = layers.Bidirectional(layers.LSTM(LSTM_UNITS_FINAL, return_sequences = True))(xa)
xa = layers.GlobalMaxPool1D()(xa)
xa = layers.Dropout(DROP_SEQ_FINAL)(xa)

# B1/B2: TF-IDF SVD towers
b1 = layers.Dense(128, activation = "relu")(inp_wsvd); b1 = layers.Dropout(0.20)(b1)
b2 = layers.Dense(128, activation = "relu")(inp_csvd); b2 = layers.Dropout(0.20)(b2)

# C: numeric
if inp_num is not None:
    cn = layers.Dense(64, activation = "relu")(inp_num); cn = layers.Dropout(0.10)(cn)
else:
    cn = None

# D: external embeddings tower
de = layers.Dense(128, activation = "relu")(inp_emb); de = layers.Dropout(0.20)(de)

# fuse
parts = [xa, b1, b2, de] + ([cn] if cn is not None else [])
z = layers.Concatenate()(parts)
z = layers.Dense(FUSION_DENSE_FINAL, activation = "relu")(z)
z = layers.Dropout(DROP_FUSION_FINAL)(z)
out = layers.Dense(NUM_CLASSES, activation = "softmax")(z)

inputs = [inp_seq, inp_wsvd, inp_csvd, inp_emb] + ([inp_num] if NUM_DIM > 0 else [])
model = keras.Model(inputs = inputs, outputs = out)
model.compile(optimizer = keras.optimizers.Adam(1e-3),loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [30]:
# train
es = keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True, monitor = "val_loss")
history = model.fit(
    train_seq,
    validation_data = val_seq,
    epochs = EPOCHS,
    verbose = 1
)

[1m14071/14071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1573s[0m 111ms/step - accuracy: 0.7174 - loss: 0.5485 - val_accuracy: 0.7487 - val_loss: 0.5045


In [31]:
model.save("/content/drive/MyDrive/cs3244/models/lstm_model.keras")

In [32]:
# evaluate and derive metrics
probs = model.predict(val_seq, verbose = 0)
preds = probs.argmax(axis = 1)
true  = y_all[idx_va][:len(preds)]  # ensure same length

print("Accuracy:", round(accuracy_score(true, preds), 4))
print("Weighted F1:", round(f1_score(true, preds, average = "weighted"), 4))
print("\nClassification report:\n", classification_report(true, preds, digits = 4))

Accuracy: 0.7487
Weighted F1: 0.7483

Classification report:
               precision    recall  f1-score   support

           0     0.7313    0.7864    0.7578    112568
           1     0.7689    0.7110    0.7388    112568

    accuracy                         0.7487    225136
   macro avg     0.7501    0.7487    0.7483    225136
weighted avg     0.7501    0.7487    0.7483    225136



In [33]:
# individual predictions on validation
probs = model.predict(val_seq, verbose=0)
preds = probs.argmax(axis=1)
true  = y_all[idx_va][:len(preds)]
val_indices = idx_va[:len(preds)]

import numpy as np, pandas as pd
conf = probs[np.arange(len(preds)), preds]

df_val = pd.DataFrame({
    "row_id": val_indices,
    "text":  sarcasm_feats.loc[val_indices, "text"].astype(str).values,
    "true":  true,
    "pred":  preds,
    "pred_confidence": np.round(conf, 4),
})

# split into misclassified & correctly classified
mis = df_val[df_val["true"] != df_val["pred"]].copy()
cor = df_val[df_val["true"] == df_val["pred"]].copy()

print(f"Val size: {len(df_val)} | Misclassified: {len(mis)} | Correct: {len(cor)}")


N = 10

# show misclassified (top-N by confidence)
print("\nMisclassified (top-N by predicted confidence)")
print(mis.sort_values("pred_confidence", ascending=False)
         .head(N)[["row_id","true","pred","pred_confidence","text"]]
         .to_string(index=False))

# show correctly classified (top-N by confidence)
print("\nCorrectly classified (top-N by predicted confidence)")
print(cor.sort_values("pred_confidence", ascending=False)
         .head(N)[["row_id","true","pred","pred_confidence","text"]]
         .to_string(index=False))



Val size: 225136 | Misclassified: 56582 | Correct: 168554

Misclassified (top-N by predicted confidence)
 row_id  true  pred  pred_confidence                                                                                                                                                                                                                                                                                                                 text
 332738     0     1           1.0000                                                                     ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!ALLRIGHT Y'ALL!
 905260     1     0           0.9975                                                                                                                                                                       

In [34]:
# hyperparameter tuning (run only as needed)

def build_from_cfg(cfg):
    inp_seq  = layers.Input((LSTM_MAX_LEN,))
    inp_wsvd = layers.Input((SVD_WORD_COMP,))
    inp_csvd = layers.Input((SVD_CHAR_COMP,))
    inp_emb  = layers.Input((EMB_DIM_REDUCED,))
    if NUM_DIM > 0:
      inp_num = layers.Input(shape = (NUM_DIM,), name = "numeric")
    else:
      inp_num = None

    # sequence tower
    xa = layers.Embedding(min(LSTM_MAX_VOCAB, len(_tok.word_index) + 2), cfg["emb_dim"], mask_zero = True)(inp_seq)
    xa = layers.Bidirectional(layers.LSTM(cfg["lstm_units"], return_sequences = True))(xa)
    xa = layers.GlobalMaxPool1D()(xa); xa = layers.Dropout(cfg["drop_seq"])(xa)

    # tf-idf towers
    b1 = layers.Dense(128, activation = "relu")(inp_wsvd); b1 = layers.Dropout(0.2)(b1)
    b2 = layers.Dense(128, activation = "relu")(inp_csvd); b2 = layers.Dropout(0.2)(b2)

    # external embeddings
    de = layers.Dense(128, activation = "relu")(inp_emb);  de = layers.Dropout(0.2)(de)

    parts = [xa, b1, b2, de]
    if inp_num is not None:
        cn = layers.Dense(64, activation = "relu")(inp_num); cn = layers.Dropout(0.1)(cn)
        parts.append(cn)

    z = layers.Concatenate()(parts)
    z = layers.Dense(cfg["fusion"], activation = "relu")(z); z = layers.Dropout(cfg["drop_fusion"])(z)
    out = layers.Dense(NUM_CLASSES, activation = "softmax")(z)

    model_i = keras.Model([inp_seq, inp_wsvd, inp_csvd, inp_emb] + ([inp_num] if NUM_DIM>0 else []), out)
    model_i.compile(optimizer = keras.optimizers.Adam(cfg["lr"]),
                    loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])
    return model_i

# picking 5-7 configs to compare between
candidates = [
    {"emb_dim": 96,  "lstm_units": 48, "fusion": 192, "drop_seq": 0.30, "drop_fusion": 0.30, "lr": 1e-3},
    {"emb_dim": 96,  "lstm_units": 64, "fusion": 256, "drop_seq": 0.30, "drop_fusion": 0.30, "lr": 5e-4},
    {"emb_dim": 128, "lstm_units": 64, "fusion": 256, "drop_seq": 0.40, "drop_fusion": 0.30, "lr": 5e-4},
    {"emb_dim": 96,  "lstm_units": 48, "fusion": 256, "drop_seq": 0.35, "drop_fusion": 0.35, "lr": 7.5e-4},
    {"emb_dim": 128, "lstm_units": 48, "fusion": 192, "drop_seq": 0.30, "drop_fusion": 0.40, "lr": 3e-4},
    {"emb_dim": 96,  "lstm_units": 64, "fusion": 192, "drop_seq": 0.45, "drop_fusion": 0.45, "lr": 5e-4},
    {"emb_dim": 96,  "lstm_units": 64, "fusion": 256, "drop_seq": 0.30, "drop_fusion": 0.30, "lr": 3e-4},
]

results = []
for i, cfg in enumerate(candidates, 1):
    print(f"\nIteration {i}: {cfg}")
    m = build_from_cfg(cfg)
    es = keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True, monitor = "val_loss")
    _ = m.fit(train_seq, validation_data = val_seq, epochs = EPOCHS, callbacks = [es], verbose = 1)

    probs = m.predict(val_seq, verbose = 0)
    preds = probs.argmax(axis = 1); true = y_all[idx_va][:len(preds)]
    acc = accuracy_score(true, preds); f1 = f1_score(true, preds, average = "weighted")
    print(f"Val Acc = {acc:.4f}  Val F1 = {f1:.4f}")
    results.append({"iter": i, **cfg, "val_acc": acc, "val_f1": f1})

    # free GPU/CPU memory before next trial
    tf.keras.backend.clear_session(); del m; gc.collect()

# sorted summary
results = sorted(results, key=lambda r: r["val_f1"], reverse=True)
print("\nTuning summary (best first)")
for r in results: print(r)



Iteration 1: {'emb_dim': 96, 'lstm_units': 48, 'fusion': 192, 'drop_seq': 0.3, 'drop_fusion': 0.3, 'lr': 0.001}
[1m14071/14071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1580s[0m 112ms/step - accuracy: 0.7177 - loss: 0.5480 - val_accuracy: 0.7495 - val_loss: 0.5045
Val Acc = 0.7495  Val F1 = 0.7492

Iteration 2: {'emb_dim': 96, 'lstm_units': 64, 'fusion': 256, 'drop_seq': 0.3, 'drop_fusion': 0.3, 'lr': 0.0005}
[1m14071/14071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2157s[0m 153ms/step - accuracy: 0.7135 - loss: 0.5521 - val_accuracy: 0.7482 - val_loss: 0.5086
Val Acc = 0.7482  Val F1 = 0.7480

Iteration 3: {'emb_dim': 128, 'lstm_units': 64, 'fusion': 256, 'drop_seq': 0.4, 'drop_fusion': 0.3, 'lr': 0.0005}
[1m14071/14071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2379s[0m 169ms/step - accuracy: 0.7129 - loss: 0.5528 - val_accuracy: 0.7484 - val_loss: 0.5065
Val Acc = 0.7484  Val F1 = 0.7484

Iteration 4: {'emb_dim': 96, 'lstm_units': 48, 'fusion': 256, 'drop_seq': 0