# TFIDF + Elastic Net

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from tqdm import tqdm
import os
import pickle

# Load data
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv('../data/test.csv')

# Preprocessing
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

labels = list(sorted(set(full_train_labels)))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

y_train = [label2id[label] for label in train_df['label']]
y_dev = [label2id[label] for label in dev_df['label']]

# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2,6),
    max_features=50000
)

tfidf.fit(full_train_texts)

X_train = tfidf.transform(train_df['text'])
X_dev = tfidf.transform(dev_df['text'])
X_test = tfidf.transform(test_df['text'])

# Train the best model fully
best_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
    C=1,
    l1_ratio= 0.3
    # **best_params
)
print("Fitting model")
best_model.fit(X_train, y_train)

# Evaluate again on Dev Set
print("Evaluation model")
dev_preds = best_model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score (retrained best model): {dev_macro_f1:.4f}")
# C=1, L1_ratio=0.7 -> 0.4172
# C=1, L1_ratio=0.5 -> 0.4222
# C=0.5, L1_ratio=0.5 -> 0.4024
# C=1, L1_ratio=0.3 -> 0.4261


Fitting model
Evaluation model
Validation Macro F1 Score (retrained best model): 0.4261


# TFIDF + XGBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb
from tqdm import tqdm
import os

# Load data
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv('../data/test.csv')

# Preprocessing
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

labels = list(sorted(set(full_train_labels)))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

y_train = [label2id[label] for label in train_df['label']]
y_dev = [label2id[label] for label in dev_df['label']]

# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2,6),
    max_features=50000
)

print("Vectorizing data")
tfidf.fit(full_train_texts)

X_train = tfidf.transform(train_df['text'])
X_dev = tfidf.transform(dev_df['text'])
X_test = tfidf.transform(test_df['text'])
print("Vectorization complete.")

# Train XGBoost Classifier
print("Training XGBoost...")

model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(labels),
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

eval_set = [(X_train, y_train), (X_dev, y_dev)]

model.fit(
    X_train, y_train,
    eval_set=eval_set,
    # eval_metric='mlogloss',
    # early_stopping_rounds=10,  # Stop if no dev improvement in 10 rounds
    verbose=True
)

# Dev Set Evaluation
print("Evaluating on dev set...")
dev_preds = model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score: {dev_macro_f1:.4f}")

# # Predict on Test Set
# test_preds = model.predict(X_test)
# test_labels = [id2label[pred] for pred in test_preds]

# # Save submission
# output_df = pd.DataFrame({
#     'id': test_df['id'],
#     'label': test_labels
# })
# os.makedirs('outputs', exist_ok=True)
# output_df.to_csv('outputs/track_3_test.csv', index=False)

# print("Saved Track 3 submission to outputs/track_3_test.csv")

"""
'n_estimators': [300, 500],
'max_depth': [4, 6, 8],
'learning_rate': [0.01, 0.05, 0.1],
'subsample': [0.7, 0.8, 1.0],
'colsample_bytree': [0.7, 0.8, 1.0]
"""

# score = 0.3934

Vectorizing data
Vectorization complete.
Training XGBoost...
[0]	validation_0-mlogloss:1.59660	validation_1-mlogloss:1.59694
[1]	validation_0-mlogloss:1.58542	validation_1-mlogloss:1.58588
[2]	validation_0-mlogloss:1.57677	validation_1-mlogloss:1.57749
[3]	validation_0-mlogloss:1.56816	validation_1-mlogloss:1.56938
[4]	validation_0-mlogloss:1.56004	validation_1-mlogloss:1.56188
[5]	validation_0-mlogloss:1.55268	validation_1-mlogloss:1.55508
[6]	validation_0-mlogloss:1.54618	validation_1-mlogloss:1.54884
[7]	validation_0-mlogloss:1.54010	validation_1-mlogloss:1.54338
[8]	validation_0-mlogloss:1.53430	validation_1-mlogloss:1.53803
[9]	validation_0-mlogloss:1.52921	validation_1-mlogloss:1.53344
[10]	validation_0-mlogloss:1.52471	validation_1-mlogloss:1.52969
[11]	validation_0-mlogloss:1.52035	validation_1-mlogloss:1.52629
[12]	validation_0-mlogloss:1.51612	validation_1-mlogloss:1.52260
[13]	validation_0-mlogloss:1.51210	validation_1-mlogloss:1.51908
[14]	validation_0-mlogloss:1.50845	vali

"\n'n_estimators': [300, 500],\n'max_depth': [4, 6, 8],\n'learning_rate': [0.01, 0.05, 0.1],\n'subsample': [0.7, 0.8, 1.0],\n'colsample_bytree': [0.7, 0.8, 1.0]\n"

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import xgboost as xgb
import joblib
import pickle
import os

SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

# Utility: Check if all models exist
def models_exist():
    required_files = [
        "tfidf_vectorizer.pkl",
        "logreg_model.pkl",
        "xgb_model.pkl",
        "meta_model.pkl",
        "label_maps.pkl"
    ]
    return all(os.path.exists(os.path.join(SAVE_DIR, fname)) for fname in required_files)

# Load Data
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')

# Train + Save Models If Not Exist
if not models_exist():
    print("Models not found. Training models...")

    # Label Encoding
    all_labels = sorted(set(train_df['label']) | set(dev_df['label']))
    label2id = {label: i for i, label in enumerate(all_labels)}
    id2label = {i: label for label, i in label2id.items()}
    y_train = train_df['label'].map(label2id).values
    y_dev = dev_df['label'].map(label2id).values

    # TF-IDF Vectorization
    print("Fitting TF-IDF vectorizer...")
    tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 6), max_features=50000)
    tfidf.fit(pd.concat([train_df['text'], dev_df['text']]))

    X_train = tfidf.transform(train_df['text'])
    X_dev = tfidf.transform(dev_df['text'])

    # Train base models
    print("Training Logistic Regression...")
    logreg = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=1000,
        C=1.0,
        l1_ratio=0.3,
        random_state=42,
        n_jobs=-1
    )
    logreg.fit(X_train, y_train)

    print("Training XGBoost...")
    xgb_clf = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(label2id),
        n_estimators=300,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    xgb_clf.fit(X_train, y_train)

    # Generate meta-features (dev set)
    print("Generating meta-features...")
    dev_preds_logreg = logreg.predict_proba(X_dev)
    dev_preds_xgb = xgb_clf.predict_proba(X_dev)
    X_meta_dev = np.hstack([dev_preds_logreg, dev_preds_xgb])

    # Train meta-model
    print("Training meta-model...")
    meta_model = LogisticRegression(max_iter=1000, random_state=42)
    meta_model.fit(X_meta_dev, y_dev)

    # Evaluate
    meta_dev_preds = meta_model.predict(X_meta_dev)
    meta_f1 = f1_score(y_dev, meta_dev_preds, average='macro')
    print(f"Stacked Dev Macro F1: {meta_f1:.4f}")

    # Save models
    print("Saving models...")
    with open(os.path.join(SAVE_DIR, "tfidf_vectorizer.pkl"), "wb") as f:
        pickle.dump(tfidf, f)
    joblib.dump(logreg, os.path.join(SAVE_DIR, "logreg_model.pkl"))
    joblib.dump(xgb_clf, os.path.join(SAVE_DIR, "xgb_model.pkl"))
    joblib.dump(meta_model, os.path.join(SAVE_DIR, "meta_model.pkl"))
    with open(os.path.join(SAVE_DIR, "label_maps.pkl"), "wb") as f:
        pickle.dump({'label2id': label2id, 'id2label': id2label}, f)

    print("Models saved.") # used 38 mins
else:
    print("Saved models already exist. Skipping training.")


Models not found. Training models...
Fitting TF-IDF vectorizer...
Training Logistic Regression...
Training XGBoost...
Generating meta-features...
Training meta-model...
Stacked Dev Macro F1: 0.4316
Saving models...
Models saved.


In [5]:
import pandas as pd
import numpy as np
import pickle
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import xgboost as xgb

# Paths
SAVE_DIR = "saved_models"
TEST_PATH = "../data/test.csv"
SUBMISSION_PATH = "track_3_test.csv"

# Load train + dev data
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv(TEST_PATH)

full_df = pd.concat([train_df, dev_df], ignore_index=True)

# Label Encoding
all_labels = sorted(full_df['label'].unique())
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}
y_full = full_df['label'].map(label2id).values

# TF-IDF Vectorizer
print("Fitting TF-IDF on train+dev...")
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 6), max_features=50000)
X_full = tfidf.fit_transform(full_df['text'])
X_test = tfidf.transform(test_df['text'])

# Logistic Regression
print("Training Logistic Regression on train+dev...")
logreg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    C=1.0,
    l1_ratio=0.3,
    random_state=42,
    n_jobs=-1
)
logreg.fit(X_full, y_full)

# XGBoost
print("Training XGBoost on train+dev...")
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(label2id),
    n_estimators=300,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
xgb_clf.fit(X_full, y_full)

# Generate meta-features on test set
print("Stacking model predictions on test...")
logreg_proba = logreg.predict_proba(X_test)
xgb_proba = xgb_clf.predict_proba(X_test)
X_meta_test = np.hstack([logreg_proba, xgb_proba])

# Meta-model (retrained on train+dev)
print("Training meta-model on full train+dev predictions...")
# For meta model training, simulate meta features by predicting on X_full (optional if desired)
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(np.hstack([logreg.predict_proba(X_full), xgb_clf.predict_proba(X_full)]), y_full)

# Predict test labels
final_preds = meta_model.predict(X_meta_test)
final_labels = [id2label[p] for p in final_preds]

# Save submission
submission_df = pd.DataFrame({
    "id": test_df["id"],
    "label": final_labels
})
submission_df.to_csv(SUBMISSION_PATH, index=False)
print(f"Final submission saved to {SUBMISSION_PATH}")


Fitting TF-IDF on train+dev...
Training Logistic Regression on train+dev...
Training XGBoost on train+dev...
Stacking model predictions on test...
Training meta-model on full train+dev predictions...
✅ Final submission saved to track_3_test.csv


In [6]:
# Save models and artifacts
os.makedirs(SAVE_DIR, exist_ok=True)

# Save TF-IDF vectorizer
with open(os.path.join(SAVE_DIR, "tfidf_vectorizer_final.pkl"), "wb") as f:
    pickle.dump(tfidf, f)

# Save Logistic Regression
joblib.dump(logreg, os.path.join(SAVE_DIR, "logreg_model_final.pkl"))

# Save XGBoost model
joblib.dump(xgb_clf, os.path.join(SAVE_DIR, "xgb_model_final.pkl"))

# Save Meta model
joblib.dump(meta_model, os.path.join(SAVE_DIR, "meta_model_final.pkl"))

# Save label mappings
with open(os.path.join(SAVE_DIR, "label_maps_final.pkl"), "wb") as f:
    pickle.dump({'label2id': label2id, 'id2label': id2label}, f)

print("Final models and vectorizer saved to 'saved_models/'")


Final models and vectorizer saved to 'saved_models/'
