# DL-GENAi PROJECT
# Name  : Abhishek Saha
# Roll  : 23f1001572
# Model : TF-IDF + SGD Classifier

# imports

In [1]:
import os, re, html, time
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, log_loss
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import SGDClassifier
import wandb

from scipy.sparse import hstack


!wandb login 20d9b18a55f275c39d05bf53e51e8b328aeffff5

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


## Configuration

In [3]:
class CFG:
    TRAIN_PATH = "/kaggle/input/2025-sep-dl-gen-ai-project/train.csv"
    TEST_PATH  = "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv"
    SAMPLE_SUB = "/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv"
    RANDOM_SEED = 42
    TEST_SIZE = 0.1

    # TF-IDF
    MAX_FEATURES_WORD = 40000
    MAX_FEATURES_CHAR = 20000
    NGRAM_RANGE_WORD = (1,2)
    NGRAM_RANGE_CHAR = (3,5)

    # LightGBM
    LR = 0.05
    NUM_LEAVES = 127
    N_ESTIMATORS = 2000
    EARLY_STOPPING_ROUNDS = 50

    OUTPUT_DIR = "./model3_outputs"
    WAND_PROJECT = "23f1001572-t32025"
    RUN_NAME = "model3-tfidf-lgbm"

CFG = CFG()
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)
TARGET_COLS = ["anger","fear","joy","sadness","surprise"]

## Data Loader

In [4]:
train = pd.read_csv(CFG.TRAIN_PATH)
test  = pd.read_csv(CFG.TEST_PATH)

print("Train shape:", train.shape)
print("Test shape :", test.shape)
train.head(2)

Train shape: (6827, 8)
Test shape : (1707, 2)


Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,emotions
0,0,the dentist that did the work apparently did a...,1,0,0,1,0,['anger' 'sadness']
1,1,i'm gonna absolutely ~~suck~~ be terrible duri...,0,1,0,1,0,['fear' 'sadness']


## Preprocessing function

In [5]:
contraction_map = {"n't":" not", "'re":" are", "'s":" is", "'d":" would", "'ll":" will", "'ve":" have", "'m":" am"}

def preprocess_text(text):
    if pd.isna(text):
        return ""
    s = html.unescape(str(text)).lower()
    for k,v in contraction_map.items():
        s = s.replace(k, v)
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = re.sub(r"@\w+", " ", s)
    s = re.sub(r"[^a-z0-9\s\.\,\!\?\']", " ", s)
    s = re.sub(r"(.)\1{2,}", r"\1\1", s)   # coooool -> coool -> cool? keeps double
    s = re.sub(r"\s+", " ", s).strip()
    return s

train["clean_text"] = train["text"].apply(preprocess_text)
test["clean_text"]  = test["text"].apply(preprocess_text)

# quick sample
train["clean_text"].sample(3).tolist()

['no, but can we.. look, i do not have time for this can you just spare a couple of minutes i just need to get something off my chest i do not care, ok?',
 'i have to close my eyes and breath.',
 'i could not get loose and ended up pulling so hard that i pulled my tooth completely out.']

## Train/Validation Split

In [6]:
train_df, val_df = train_test_split(
    train,
    test_size=CFG.TEST_SIZE,
    random_state=CFG.RANDOM_SEED,
    shuffle=True
)
train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)

print("Train:", train_df.shape, "Val:", val_df.shape)

Train: (6144, 9) Val: (683, 9)


## TF-IDF vectorizer

In [7]:
print("Fitting TF-IDF (word)...")
tfidf_word = TfidfVectorizer(max_features=CFG.MAX_FEATURES_WORD, ngram_range=CFG.NGRAM_RANGE_WORD, analyzer="word")
tfidf_word.fit(train_df["clean_text"].values)

print("Fitting TF-IDF (char)...")
tfidf_char = TfidfVectorizer(max_features=CFG.MAX_FEATURES_CHAR, ngram_range=CFG.NGRAM_RANGE_CHAR, analyzer="char")
tfidf_char.fit(train_df["clean_text"].values)

print("Transforming datasets...")
X_train = hstack([
    tfidf_word.transform(train_df["clean_text"].values),
    tfidf_char.transform(train_df["clean_text"].values)
])
X_val = hstack([
    tfidf_word.transform(val_df["clean_text"].values),
    tfidf_char.transform(val_df["clean_text"].values)
])
X_test = hstack([
    tfidf_word.transform(test["clean_text"].values),
    tfidf_char.transform(test["clean_text"].values)
])

print("Shapes -> X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

Fitting TF-IDF (word)...
Fitting TF-IDF (char)...
Transforming datasets...
Shapes -> X_train: (6144, 60000) X_val: (683, 60000) X_test: (1707, 60000)


In [8]:
joblib.dump(tfidf_word, os.path.join(CFG.OUTPUT_DIR, "tfidf_word.pkl"))
joblib.dump(tfidf_char, os.path.join(CFG.OUTPUT_DIR, "tfidf_char.pkl"))
print("Saved vectorizers to", CFG.OUTPUT_DIR)

Saved vectorizers to ./model3_outputs


In [9]:
USE_WANDB = False
try:
    wandb.init(
        project=CFG.WAND_PROJECT,
        name=CFG.RUN_NAME,
        config={
            "model": "tfidf_lgbm",
            "max_features_word": CFG.MAX_FEATURES_WORD,
            "max_features_char": CFG.MAX_FEATURES_CHAR,
            "lr": CFG.LR,
            "num_leaves": CFG.NUM_LEAVES,
            "n_estimators": CFG.N_ESTIMATORS
        }
    )
    USE_WANDB = True
except Exception as e:
    print("W&B init failed or offline. Continuing without W&B. Err:", e)
    USE_WANDB = False


[34m[1mwandb[0m: Currently logged in as: [33mabhisheksaha[0m ([33mabhisheksahaiitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Train one LightGBM per target label

In [10]:
models = {}
val_preds_proba = np.zeros((len(val_df), len(TARGET_COLS)))
train_preds_proba = np.zeros((len(train_df), len(TARGET_COLS)))

for i, col in enumerate(TARGET_COLS):
    print(f"\n=== Training label {col} ({i+1}/{len(TARGET_COLS)}) ===")
    y_train = train_df[col].values
    y_val   = val_df[col].values

    
    clf = SGDClassifier(
        loss="log_loss",
        alpha=1e-4,
        max_iter=1,
        warm_start=True,
        learning_rate="optimal",
        random_state=CFG.RANDOM_SEED
    )

    EPOCHS = 20
    for epoch in range(EPOCHS):
        clf.partial_fit(X_train, y_train, classes=[0,1])

        val_proba = clf.predict_proba(X_val)[:, 1]
        val_pred_bin = (val_proba > 0.5).astype(int)
        f1 = f1_score(y_val, val_pred_bin, zero_division=0)

        if USE_WANDB:
            wandb.log({f"{col}_epoch_f1": f1, f"{col}_epoch": epoch})

    # final predictions
    val_proba = clf.predict_proba(X_val)[:, 1]
    train_proba = clf.predict_proba(X_train)[:, 1]

    val_preds_proba[:, i] = val_proba
    train_preds_proba[:, i] = train_proba
    models[col] = clf

    print(f"Label {col} | Final F1 = {f1:.4f}")

    # save model
    model_fname = os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt")
    joblib.dump(clf, model_fname)



=== Training label anger (1/5) ===
Label anger | Final F1 = 0.2979

=== Training label fear (2/5) ===
Label fear | Final F1 = 0.8659

=== Training label joy (3/5) ===
Label joy | Final F1 = 0.6640

=== Training label sadness (4/5) ===
Label sadness | Final F1 = 0.7084

=== Training label surprise (5/5) ===
Label surprise | Final F1 = 0.7356


## Threshold tuning on validation set 

In [11]:
best_thresholds = []
per_label_f1 = {}
for i, col in enumerate(TARGET_COLS):
    best_f1 = 0.0
    best_t = 0.5
    for t in np.linspace(0.1,0.9,81):
        pbin = (val_preds_proba[:, i] > t).astype(int)
        f1 = f1_score(val_df[col].values, pbin, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t = float(t)
    best_thresholds.append(best_t)
    per_label_f1[col] = best_f1

macro_val_f1 = np.mean(list(per_label_f1.values()))
print("Thresholds:", best_thresholds)
print("Per-label best F1 on validation:", per_label_f1)
print("Macro F1 on validation:", macro_val_f1)

if USE_WANDB:
    wandb.log({"val_macro_f1": macro_val_f1})
    for i, col in enumerate(TARGET_COLS):
        wandb.log({f"best_thr_{col}": best_thresholds[i], f"best_f1_{col}": per_label_f1[col]})


Thresholds: [0.30000000000000004, 0.5, 0.37, 0.37, 0.35]
Per-label best F1 on validation: {'anger': 0.7131782945736435, 'fear': 0.8659286592865928, 'joy': 0.7733333333333333, 'sadness': 0.7744874715261959, 'surprise': 0.7688564476885645}
Macro F1 on validation: 0.779156841281666


In [12]:
joblib.dump({
    "models": {col: os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt") for col in TARGET_COLS},
    "vectorizers": {
        "word": os.path.join(CFG.OUTPUT_DIR, "tfidf_word.pkl"),
        "char": os.path.join(CFG.OUTPUT_DIR, "tfidf_char.pkl"),
    },
    "thresholds": best_thresholds,
    "config": CFG.__dict__
}, os.path.join(CFG.OUTPUT_DIR, "model3_artifact.pkl"))

print("Saved artifacts to:", CFG.OUTPUT_DIR)

if USE_WANDB:
    wandb.finish()

Saved artifacts to: ./model3_outputs


0,1
anger_epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
anger_epoch_f1,█▅▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
best_f1_anger,▁
best_f1_fear,▁
best_f1_joy,▁
best_f1_sadness,▁
best_f1_surprise,▁
best_thr_anger,▁
best_thr_fear,▁
best_thr_joy,▁

0,1
anger_epoch,19.0
anger_epoch_f1,0.29787
best_f1_anger,0.71318
best_f1_fear,0.86593
best_f1_joy,0.77333
best_f1_sadness,0.77449
best_f1_surprise,0.76886
best_thr_anger,0.3
best_thr_fear,0.5
best_thr_joy,0.37


## Submission

In [14]:
print("Predicting on test set...")
test_probas = np.zeros((X_test.shape[0], len(TARGET_COLS)))

for i, col in enumerate(TARGET_COLS):
    clf = joblib.load(os.path.join(CFG.OUTPUT_DIR, f"lgb_{col}.txt"))
    test_proba = clf.predict_proba(X_test)[:, 1]
    test_probas[:, i] = test_proba

best_thresholds = np.array(best_thresholds)
test_preds_bin = (test_probas > best_thresholds).astype(int)

submission = pd.DataFrame({
    "id": test["id"],
    "anger":   test_preds_bin[:, 0],
    "fear":    test_preds_bin[:, 1],
    "joy":     test_preds_bin[:, 2],
    "sadness": test_preds_bin[:, 3],
    "surprise":test_preds_bin[:, 4]
})


submission.to_csv("submission.csv", index=False)
submission.head()


Predicting on test set...


Unnamed: 0,id,anger,fear,joy,sadness,surprise
0,0,1,1,1,0,1
1,1,0,0,0,0,0
2,2,1,1,0,0,0
3,3,0,1,0,0,0
4,4,0,1,0,0,1
