# DL-GenAI PROJECT — TF-IDF + LightGBM
# Name  : Abhishek Saha
# Roll  : 23f1001572
# Model : TF-IDF + LightGBM

# imports

In [None]:
import os, re, html, time
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, log_loss
import joblib
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
import wandb

from scipy.sparse import hstack


# Authenticate with Weights & Biases using a key taken from the environment
# (for example a notebook secret exported as WANDB_API_KEY).
# SECURITY: an API key used to be written in plain text on this line. Any key
# that has been committed or shared this way must be treated as leaked and
# revoked in the W&B account settings.
_wandb_api_key = os.environ.get("WANDB_API_KEY")
if _wandb_api_key:
    wandb.login(key=_wandb_api_key)
else:
    # Best effort: the run continues without an explicit login.
    print("WANDB_API_KEY is not set; skipping W&B login.")

## Configuration

In [None]:
class CFG:
    """Run settings: input files, split, TF-IDF sizes, LightGBM knobs, outputs."""

    # --- input files ---
    TRAIN_PATH = "/kaggle/input/2025-sep-dl-gen-ai-project/train.csv"
    TEST_PATH = "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv"
    SAMPLE_SUB = "/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv"

    # --- hold-out split ---
    RANDOM_SEED = 42
    TEST_SIZE = 0.1

    # --- TF-IDF vocabularies (word n-grams and character n-grams) ---
    MAX_FEATURES_WORD = 40000
    MAX_FEATURES_CHAR = 20000
    NGRAM_RANGE_WORD = (1, 2)
    NGRAM_RANGE_CHAR = (3, 5)

    # --- LightGBM ---
    LR = 0.05
    NUM_LEAVES = 127
    N_ESTIMATORS = 2000
    EARLY_STOPPING_ROUNDS = 50

    # --- outputs and experiment tracking ---
    OUTPUT_DIR = "./model3_outputs"
    WAND_PROJECT = "23f1001572-t32025"
    RUN_NAME = "model3-tfidf-lgbm"


# From here on the name CFG refers to an instance; every setting above is
# still read through normal attribute lookup on the class.
CFG = CFG()
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

# Label columns, in the order used for every prediction matrix.
TARGET_COLS = [
    "anger",
    "fear",
    "joy",
    "sadness",
    "surprise",
]

## Data Loader

In [None]:
# Read the training and test tables from the paths configured in CFG.
train, test = (pd.read_csv(csv_path) for csv_path in (CFG.TRAIN_PATH, CFG.TEST_PATH))

print("Train shape:", train.shape)
print("Test shape :", test.shape)
# Last expression of the cell: shows the first two training rows.
train.head(2)

## Preprocessing function

In [None]:
# Generic contraction suffixes, expanded by plain substring replacement on
# lowercased text. NOTE: "'s" is always read as " is", so possessives
# ("john's") are expanded the same way.
contraction_map = {"n't":" not", "'re":" are", "'s":" is", "'d":" would", "'ll":" will", "'ve":" have", "'m":" am"}

# Negations whose stem changes when expanded. Without this table the generic
# "n't" rule produces "ca not" / "wo not" / "sha not".
_IRREGULAR_CONTRACTIONS = {"can't": "can not", "won't": "will not", "shan't": "shall not"}
_IRREGULAR_RE = re.compile(r"\b(?:can't|won't|shan't)")

# Typographic apostrophes are mapped to ASCII so the contraction rules match.
_APOSTROPHES = str.maketrans({"\u2018": "'", "\u2019": "'"})

# Patterns compiled once instead of on every call.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_DISALLOWED_RE = re.compile(r"[^a-z0-9\s\.\,\!\?\']")
# Three or more repeats of one non-digit character. Digits are excluded so
# that numbers such as "1000" are not truncated to "100".
_ELONGATION_RE = re.compile(r"(\D)\1{2,}")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text):
    """Normalise one raw text value into a lowercase, cleaned string.

    Steps, in order: HTML-unescape and lowercase; map curly apostrophes to
    ASCII; expand contractions; drop URLs and @mentions; replace every
    character other than a-z, 0-9, whitespace and . , ! ? ' with a space;
    squash runs of three or more identical non-digit characters down to two;
    collapse whitespace and strip.

    Args:
        text: The raw value; anything `pd.isna` reports as missing is accepted.

    Returns:
        The cleaned string, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""
    s = html.unescape(str(text)).lower().translate(_APOSTROPHES)
    # Irregular forms first, otherwise the generic "n't" rule would mangle them.
    s = _IRREGULAR_RE.sub(lambda m: _IRREGULAR_CONTRACTIONS[m.group(0)], s)
    for k, v in contraction_map.items():
        s = s.replace(k, v)
    s = _URL_RE.sub(" ", s)
    s = _MENTION_RE.sub(" ", s)
    s = _DISALLOWED_RE.sub(" ", s)
    s = _ELONGATION_RE.sub(r"\1\1", s)   # "coooool" -> "cool", "!!!!" -> "!!"
    s = _WHITESPACE_RE.sub(" ", s).strip()
    return s

# Add the cleaned text column to both tables.
for frame in (train, test):
    frame["clean_text"] = frame["text"].apply(preprocess_text)

# Last expression of the cell: three random cleaned training texts.
train["clean_text"].sample(3).tolist()

## Train/Validation Split

In [None]:
# Shuffled hold-out split; both parts get a fresh 0..n-1 index.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train,
        test_size=CFG.TEST_SIZE,
        random_state=CFG.RANDOM_SEED,
        shuffle=True,
    )
)

print("Train:", train_df.shape, "Val:", val_df.shape)

## TF-IDF vectorizer

In [None]:
def _stack_tfidf(texts):
    """Return [word TF-IDF | char TF-IDF] features for `texts` as one CSR matrix.

    Uses the module-level `tfidf_word` and `tfidf_char`, which must already
    be fitted.
    """
    return hstack(
        [tfidf_word.transform(texts), tfidf_char.transform(texts)],
        format="csr",
    )


_train_texts = train_df["clean_text"].values

# fit_transform fits the vocabulary and builds the training matrix in a single
# pass over the training texts, instead of fit() followed by transform().
print("Fitting TF-IDF (word)...")
tfidf_word = TfidfVectorizer(max_features=CFG.MAX_FEATURES_WORD, ngram_range=CFG.NGRAM_RANGE_WORD, analyzer="word")
_train_word = tfidf_word.fit_transform(_train_texts)

print("Fitting TF-IDF (char)...")
tfidf_char = TfidfVectorizer(max_features=CFG.MAX_FEATURES_CHAR, ngram_range=CFG.NGRAM_RANGE_CHAR, analyzer="char")
_train_char = tfidf_char.fit_transform(_train_texts)

print("Transforming datasets...")
# hstack returns COO unless told otherwise. Asking for CSR once here gives the
# row-oriented layout that is then reused for every per-label fit and predict.
X_train = hstack([_train_word, _train_char], format="csr")
X_val = _stack_tfidf(val_df["clean_text"].values)
X_test = _stack_tfidf(test["clean_text"].values)

print("Shapes -> X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

In [None]:
# Persist both fitted vectorizers next to the other run outputs.
for pkl_name, vectorizer in (("tfidf_word.pkl", tfidf_word), ("tfidf_char.pkl", tfidf_char)):
    joblib.dump(vectorizer, os.path.join(CFG.OUTPUT_DIR, pkl_name))
print("Saved vectorizers to", CFG.OUTPUT_DIR)

In [None]:
# Experiment tracking is optional: USE_WANDB only becomes True when the run
# was opened successfully, and later cells check it before logging.
USE_WANDB = False
try:
    wandb.init(
        project=CFG.WAND_PROJECT,
        name=CFG.RUN_NAME,
        config={
            "model": "tfidf_lgbm",
            "max_features_word": CFG.MAX_FEATURES_WORD,
            "max_features_char": CFG.MAX_FEATURES_CHAR,
            "lr": CFG.LR,
            "num_leaves": CFG.NUM_LEAVES,
            "n_estimators": CFG.N_ESTIMATORS,
        },
    )
except Exception as err:
    # Deliberate best effort: training goes ahead without tracking.
    print("W&B init failed or offline. Continuing without W&B. Err:", err)
else:
    USE_WANDB = True


## Train one LightGBM per target label

In [None]:
# One binary LightGBM booster is trained per label column. Probabilities for
# the training and validation rows are collected column by column, in
# TARGET_COLS order.
models = {}
val_preds_proba = np.zeros((len(val_df), len(TARGET_COLS)))
train_preds_proba = np.zeros((len(train_df), len(TARGET_COLS)))

# The booster settings do not depend on the label, so they are built once.
params = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=CFG.LR,
    num_leaves=CFG.NUM_LEAVES,
    verbosity=-1,
    seed=CFG.RANDOM_SEED,
    boosting_type="gbdt",
    feature_pre_filter=False,
)

for i, col in enumerate(TARGET_COLS):
    print(f"\n=== Training label {col} ({i+1}/{len(TARGET_COLS)}) ===")
    y_train, y_val = train_df[col].values, val_df[col].values

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    # Fresh callback objects for every label.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=CFG.N_ESTIMATORS,
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(CFG.EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(100),
        ],
    )

    models[col] = model
    model.save_model(os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt"))

    # Score both splits at the iteration selected by early stopping.
    val_proba = model.predict(X_val, num_iteration=model.best_iteration)
    train_proba = model.predict(X_train, num_iteration=model.best_iteration)
    val_preds_proba[:, i], train_preds_proba[:, i] = val_proba, train_proba

    # Validation metrics at the default 0.5 cut-off.
    f1 = f1_score(y_val, (val_proba > 0.5).astype(int), zero_division=0)
    loss = log_loss(y_val, val_proba, labels=[0,1])

    print(f"Label {col} | val f1@0.5 = {f1:.4f} | val logloss = {loss:.4f}")

    if USE_WANDB:
        wandb.log({f"val_f1_{col}": f1, f"val_logloss_{col}": loss, "label": col})


## Threshold tuning on validation set 

In [None]:
# For every label, scan 81 evenly spaced cut-offs in [0.1, 0.9] and keep the
# first one with the highest validation F1. If no cut-off scores above zero,
# the label keeps threshold 0.5 with F1 0.0.
threshold_grid = np.linspace(0.1,0.9,81)
best_thresholds = []
per_label_f1 = {}
for i, col in enumerate(TARGET_COLS):
    y_true = val_df[col].values
    col_proba = val_preds_proba[:, i]
    grid_f1 = [
        f1_score(y_true, (col_proba > t).astype(int), zero_division=0)
        for t in threshold_grid
    ]
    top = int(np.argmax(grid_f1))   # argmax returns the first maximum
    if grid_f1[top] > 0.0:
        best_t, best_f1 = float(threshold_grid[top]), grid_f1[top]
    else:
        best_t, best_f1 = 0.5, 0.0
    best_thresholds.append(best_t)
    per_label_f1[col] = best_f1

macro_val_f1 = np.mean(list(per_label_f1.values()))
print("Thresholds:", best_thresholds)
print("Per-label best F1 on validation:", per_label_f1)
print("Macro F1 on validation:", macro_val_f1)

if USE_WANDB:
    wandb.log({"val_macro_f1": macro_val_f1})
    for col, thr in zip(TARGET_COLS, best_thresholds):
        wandb.log({f"best_thr_{col}": thr, f"best_f1_{col}": per_label_f1[col]})


In [None]:
# Snapshot of the run settings.
# CFG is normally an *instance* whose settings all live on its class, so
# `CFG.__dict__` is empty and would store no configuration at all. Read the
# public attributes from the class instead, then overlay anything that was
# set on the instance itself.
_cfg_class = CFG if isinstance(CFG, type) else type(CFG)
config_snapshot = {
    key: value
    for key, value in vars(_cfg_class).items()
    if not key.startswith("_")
}
if _cfg_class is not CFG:
    config_snapshot.update(vars(CFG))

# Manifest that ties together the saved boosters, vectorizers, tuned
# thresholds (in TARGET_COLS order) and the configuration used for this run.
joblib.dump({
    "models": {col: os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt") for col in TARGET_COLS},
    "vectorizers": {
        "word": os.path.join(CFG.OUTPUT_DIR, "tfidf_word.pkl"),
        "char": os.path.join(CFG.OUTPUT_DIR, "tfidf_char.pkl"),
    },
    "thresholds": best_thresholds,
    "config": config_snapshot
}, os.path.join(CFG.OUTPUT_DIR, "model3_artifact.pkl"))

print("Saved artifacts to:", CFG.OUTPUT_DIR)

# Close the tracking run once everything has been logged.
if USE_WANDB:
    wandb.finish()

## Submission

In [None]:
print("Predicting on test set...")
# Each booster is reloaded from the file written during training, and its
# test probabilities become one column, in TARGET_COLS order.
proba_columns = []
for col in TARGET_COLS:
    model = lgb.Booster(model_file=os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt"))
    proba_columns.append(model.predict(X_test, num_iteration=model.best_iteration))
test_probas = np.column_stack(proba_columns)

# The per-label thresholds broadcast across the columns.
best_thresholds = np.array(best_thresholds)
test_preds_bin = (test_probas > best_thresholds).astype(int)

submission = pd.DataFrame({
    "id": test["id"],
    **{col: test_preds_bin[:, j] for j, col in enumerate(TARGET_COLS)},
})
submission_path = os.path.join(CFG.OUTPUT_DIR, "submission.csv")
submission.to_csv(submission_path, index=False)
print("Saved submission:", submission_path)
# Last expression of the cell: preview of the submission table.
submission.head()