# TFIDF + Logistic Regression

In [None]:

# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# Read the train / dev / test splits from CSV.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/dev.csv', '../data/test.csv')
)

# Preprocessing

# Pool train and dev: the texts are used to fit the vectorizer, the labels
# define the label vocabulary.
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()

# Label <-> integer id lookup tables (ids follow the sorted label order).
labels = sorted(set(full_train_labels))
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer-encoded targets for the two labelled splits.
y_train = list(map(label2id.__getitem__, train_df['label']))
y_dev = list(map(label2id.__getitem__, dev_df['label']))

# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    max_features=50000,   # limit vocabulary size
    ngram_range=(2,6),    # 2-6 character n-grams
    analyzer='char_wb',   # character n-grams inside word boundaries
)

# The vocabulary / IDF weights are learned from the pooled train + dev texts.
tfidf.fit(full_train_texts)

# Project every split into the fitted TF-IDF space.
X_train, X_dev, X_test = (
    tfidf.transform(split['text']) for split in (train_df, dev_df, test_df)
)

# Fit logistic regression on the training split only (fit() returns the estimator).
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Macro-averaged F1 on the held-out dev split.
dev_preds = clf.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score: {dev_macro_f1:.4f}")

Validation Macro F1 Score: 0.4114
Saved Track 1 submission to outputs/track_1_test.csv


In [None]:
# Predict class ids for the test split and decode them back to label values.
test_preds = clf.predict(X_test)
test_labels = list(map(id2label.__getitem__, test_preds))

# Write the submission file (one row per test id).
os.makedirs('outputs', exist_ok=True)
output_df = pd.DataFrame({'id': test_df['id'], 'label': test_labels})
output_df.to_csv('outputs/track_1_test.csv', index=False)

print("Saved Track 1 submission to outputs/track_1_test.csv")

# TFIDF + Elastic Net

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from tqdm import tqdm
import os
import pickle

# Load the three CSV splits.
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv('../data/test.csv')

# Preprocessing: pool train + dev texts (vectorizer fit) and labels (label vocabulary).
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

# Sorted unique labels and the two lookup tables derived from their positions.
labels = sorted(set(full_train_labels))
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

# Integer-encoded targets.
y_train = [label2id[name] for name in train_df['label']]
y_dev = [label2id[name] for name in dev_df['label']]

# TF-IDF over character n-grams (lengths 2-6, within word boundaries),
# vocabulary capped at 50,000 features and fitted on train + dev texts.
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(2,6), analyzer='char_wb')
tfidf.fit(full_train_texts)

# Transform each split with the fitted vectorizer.
X_train, X_dev, X_test = (
    tfidf.transform(frame['text']) for frame in (train_df, dev_df, test_df)
)


# The grid search below is disabled because it is too time-consuming. TODO: re-run it with 5-fold CV before submitting.
# # Hyperparameter Grid
# param_grid = {
#     'C': [0.01, 10],
#     'l1_ratio': [0.3, 0.7]
# }
# grid = list(ParameterGrid(param_grid))

# # Cache file
# CACHE_FILE = './cache/ENET_CV5_cache.pkl'

# # Load previous cache if exists
# if os.path.exists(CACHE_FILE):
#     with open(CACHE_FILE, 'rb') as f:
#         cache = pickle.load(f)
#     print(f"Loaded {len(cache)} cached results.")
# else:
#     cache = {}

# best_score = 0
# best_params = None

# # Define scoring
# scorer = make_scorer(f1_score, average='macro')

# # Search
# for params in tqdm(grid, desc="Grid Search with 5-Fold CV"):
#     param_key = tuple(sorted(params.items()))  # make a hashable key

#     if param_key in cache:
#         macro_f1 = cache[param_key]
#     else:
#         print(f'Start of parameter tuning: {param_key}')
#         model = LogisticRegression(
#             penalty='elasticnet',
#             solver='saga',
#             max_iter=1000,
#             random_state=42,
#             n_jobs=-1,
#             **params
#         )
#         # Perform 5-Fold CV here!
#         scores = cross_val_score(model, X_train, y_train, cv=5, scoring=scorer, n_jobs=-1)
#         macro_f1 = np.mean(scores)

#         cache[param_key] = macro_f1

#         # Save cache every time
#         with open(CACHE_FILE, 'wb') as f:
#             pickle.dump(cache, f)
#         print(f'Cached {param_key}, f1: {macro_f1:.4f}')

#     if macro_f1 > best_score:
#         best_score = macro_f1
#         best_params = params

# print(f"Best parameters: {best_params}")
# print(f"Best Macro F1 Score (5-Fold CV): {best_score:.4f}")

# Train the elastic-net logistic regression with hand-picked hyperparameters
# (the grid search that would supply `best_params` is commented out above).
print("Fitting model")
best_model = LogisticRegression(
    C=1,
    l1_ratio= 0.3,
    # **best_params
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the fitted model on the dev split with macro-averaged F1.
print("Evaluation model")
dev_preds = best_model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score (retrained best model): {dev_macro_f1:.4f}")
# Dev macro-F1 recorded from manual runs:
# C=1, L1_ratio=0.7 -> 0.4172
# C=1, L1_ratio=0.5 -> 0.4222
# C=0.5, L1_ratio=0.5 -> 0.4024
# C=1, L1_ratio=0.3 -> 0.4261


Fitting model
Evaluation model
Validation Macro F1 Score (retrained best model): 0.4261


In [None]:
# Predict on Test Set
# Use the elastic-net model trained in this section. The previous code called
# `clf`, which is the plain logistic regression left over from the first
# section (or undefined on a fresh kernel), so the submission did not reflect
# the model evaluated above.
test_preds = best_model.predict(X_test)
# Decode integer class ids back to the original label values.
test_labels = [id2label[pred] for pred in test_preds]

# Save to CSV
output_df = pd.DataFrame({
    'id': test_df['id'],
    'label': test_labels
})
os.makedirs('outputs', exist_ok=True)
# NOTE: this is the same path the first section writes to, so it overwrites that file.
output_df.to_csv('outputs/track_1_test.csv', index=False)

print("Saved Track 1 submission to outputs/track_1_test.csv")

# Word2Vec + XGBoost

In [None]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
w2v = api.load('word2vec-google-news-300') 

In [8]:
import gensim.downloader as api
# Load the pretrained 'glove-twitter-200' vectors through gensim's downloader API.
glove_twitter = api.load('glove-twitter-200')  



In [13]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' vectors.
# NOTE: this cell has no import of its own; `api` (gensim.downloader) must
# already be bound by one of the preceding cells.
fast_wiki = api.load('fasttext-wiki-news-subwords-300')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb
from tqdm import tqdm
import os

# Read the train / dev / test splits.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/dev.csv', '../data/test.csv')
)

# Preprocessing

# The label vocabulary is taken from train and dev together.
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

# Lookup tables between label values and integer ids (sorted label order).
labels = sorted(set(full_train_labels))
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer-encoded targets for the labelled splits.
y_train = list(map(label2id.__getitem__, train_df['label']))
y_dev = list(map(label2id.__getitem__, dev_df['label']))

# Select the pre-trained embedding model used for vectorization.
# NOTE: despite the "Word2Vec" wording in the messages below, this binds `w2v`
# to `fast_wiki`, i.e. the 'fasttext-wiki-news-subwords-300' vectors loaded by
# an earlier cell (which must have been run first), not the Google News vectors.
print("Loading Word2Vec model...")
w2v = fast_wiki
print("Word2Vec model loaded.")

# Length of the zero vector text_to_vector returns when no word is in the model.
embedding_dim = 300

# Function to vectorize text
def text_to_vector(text, model, dim):
    """Embed a text as the mean of the vectors of its in-vocabulary words.

    Args:
        text: Raw text, split on whitespace into words. Non-string values
            (e.g. the NaN pandas produces for an empty CSV cell, or None) are
            treated as empty text instead of raising AttributeError.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``.
        dim: Length of the zero vector returned when no word is found in ``model``.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the vectors of all
        words present in ``model``, or ``np.zeros(dim)`` when there are none.
    """
    # Guard against missing values so one bad row cannot abort vectorization.
    if not isinstance(text, str):
        return np.zeros(dim)
    word_vectors = [model[word] for word in text.split() if word in model]
    if not word_vectors:
        return np.zeros(dim)
    return np.mean(word_vectors, axis=0)

# Turn every text of each split into one averaged embedding row.
print("Vectorizing texts...")

X_train, X_dev, X_test = (
    np.vstack([text_to_vector(doc, w2v, embedding_dim) for doc in tqdm(frame['text'])])
    for frame in (train_df, dev_df, test_df)
)

print("Vectorization complete.")

# Fit a multi-class XGBoost classifier on the averaged embeddings.
print("Training XGBoost...")

model = xgb.XGBClassifier(
    objective='multi:softmax', num_class=len(labels),
    n_estimators=300, learning_rate=0.1, max_depth=6,
    random_state=42, n_jobs=-1,
)
model.fit(X_train, y_train)

# Macro-averaged F1 on the dev split.
dev_preds = model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score: {dev_macro_f1:.4f}")

# # Predict on Test Set
# test_preds = model.predict(X_test)
# test_labels = [id2label[pred] for pred in test_preds]

# # Save submission
# output_df = pd.DataFrame({
#     'id': test_df['id'],
#     'label': test_labels
# })
# os.makedirs('outputs', exist_ok=True)
# output_df.to_csv('outputs/track_3_test.csv', index=False)

# print("Saved Track 3 submission to outputs/track_3_test.csv")

"""
n_estimators=300,
learning_rate=0.1,
max_depth=6,
-> 0.3628

n_estimators=500,
learning_rate=0.5,
max_depth=8,
-> 0.3645

Twitter
n_estimators=300,
learning_rate=0.1,
max_depth=6,
-> 0.3235

Fast
n_estimators=300,
learning_rate=0.1,
max_depth=6,
-> 0.3570



'n_estimators': [300, 500],
'max_depth': [4, 6, 8],
'learning_rate': [0.01, 0.05, 0.1],
'subsample': [0.7, 0.8, 1.0],
'colsample_bytree': [0.7, 0.8, 1.0]
"""


Loading Word2Vec model...
Word2Vec model loaded.
Vectorizing texts...


100%|██████████| 32000/32000 [00:02<00:00, 11747.66it/s]
100%|██████████| 4000/4000 [00:00<00:00, 9016.40it/s]
100%|██████████| 4000/4000 [00:00<00:00, 10453.97it/s]


Vectorization complete.
Training XGBoost...
Validation Macro F1 Score: 0.3570


"\nn_estimators=300,\nlearning_rate=0.1,\nmax_depth=6,\n-> 0.3628\n\nTwitter\nn_estimators=300,\nlearning_rate=0.1,\nmax_depth=6,\n-> 0.3235\n\n\nn_estimators=500,\nlearning_rate=0.5,\nmax_depth=8,\n-> 0.3645\n\n'n_estimators': [300, 500],\n'max_depth': [4, 6, 8],\n'learning_rate': [0.01, 0.05, 0.1],\n'subsample': [0.7, 0.8, 1.0],\n'colsample_bytree': [0.7, 0.8, 1.0]\n"

In [None]:
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import xgboost as xgb
from tqdm import tqdm
import os
import pickle

# Load data
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv('../data/test.csv')

# Preprocessing

# The label vocabulary is taken from train and dev together.
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

labels = list(sorted(set(full_train_labels)))
label2id = {label: idx for idx, label in enumerate(labels)}
# label2id.items() yields (label, idx) pairs. The previous comprehension
# unpacked them as (idx, label), which produced a copy of label2id and made
# id2label[pred] raise KeyError for integer predictions.
id2label = {idx: label for label, idx in label2id.items()}

# Integer-encoded targets for the labelled splits.
y_train = [label2id[label] for label in train_df['label']]
y_dev = [label2id[label] for label in dev_df['label']]

# Load the pre-trained 'word2vec-google-news-300' embeddings through gensim's
# downloader API and bind them to `w2v`, which get_cached_vectors reads below.
print("Loading Word2Vec model...")
w2v = api.load('word2vec-google-news-300')  # Pretrained 300d embeddings
print("Word2Vec model loaded.")

# Length of the zero vector text_to_vector returns when no word is in the model.
embedding_dim = 300

# Function to vectorize text
def text_to_vector(text, model, dim):
    """Embed a text as the mean of the vectors of its in-vocabulary words.

    Args:
        text: Raw text, split on whitespace into words. Non-string values
            (e.g. the NaN pandas produces for an empty CSV cell, or None) are
            treated as empty text instead of raising AttributeError.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``.
        dim: Length of the zero vector returned when no word is found in ``model``.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the vectors of all
        words present in ``model``, or ``np.zeros(dim)`` when there are none.
    """
    # Guard against missing values so one bad row cannot abort vectorization.
    if not isinstance(text, str):
        return np.zeros(dim)
    word_vectors = [model[word] for word in text.split() if word in model]
    if not word_vectors:
        return np.zeros(dim)
    return np.mean(word_vectors, axis=0)

# Vectorize train, dev, and test sets, keeping each matrix on disk as a cache.
CACHE_FOLDER = 'cache'
os.makedirs(CACHE_FOLDER, exist_ok=True)

def get_cached_vectors(filename, texts):
    """Return the embedding matrix for ``texts``, reusing an on-disk copy if present.

    The cache is keyed by ``filename`` alone (inside ``CACHE_FOLDER``): an
    existing file is loaded as-is, without checking which texts or embedding
    model produced it. On a miss, every text is embedded with
    ``text_to_vector(text, w2v, embedding_dim)``, the rows are stacked, saved
    with ``np.save`` and returned.
    """
    filepath = os.path.join(CACHE_FOLDER, filename)
    if not os.path.exists(filepath):
        print(f"Vectorizing and caching {filename}...")
        rows = [text_to_vector(doc, w2v, embedding_dim) for doc in tqdm(texts)]
        vectors = np.vstack(rows)
        np.save(filepath, vectors)
        return vectors
    print(f"Loading cached vectors from {filepath}...")
    return np.load(filepath)

X_train, X_dev, X_test = (
    get_cached_vectors(cache_name, frame['text'])
    for cache_name, frame in (
        ('train_vectors.npy', train_df),
        ('dev_vectors.npy', dev_df),
        ('test_vectors.npy', test_df),
    )
)

print("Vectorization complete.")

# Cache or train best XGBoost model
# A pickled model in the cache folder is reused without retraining; delete the
# file to force a new grid search (e.g. after changing the grid or the features).
BEST_MODEL_CACHE = os.path.join(CACHE_FOLDER, 'xgb_best_model.pkl')

if os.path.exists(BEST_MODEL_CACHE):
    print(f"Loading best model from cache {BEST_MODEL_CACHE}...")
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(BEST_MODEL_CACHE, 'rb') as f:
        best_model = pickle.load(f)
else:
    print("Starting hyperparameter tuning with GridSearchCV...")

    xgb_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(labels),
        random_state=42,
        n_jobs=-1,
        verbosity=0
    )

    # Keys must be valid XGBClassifier parameter names. The second key was a
    # blank string ('   '), which GridSearchCV rejects as an invalid parameter;
    # the values [0.05, 0.1] are learning rates.
    param_grid = {
        'n_estimators': [100, 300],
        'learning_rate': [0.05, 0.1],
        'max_depth': [4, 6, 8]
    }

    # Exhaustive search over the grid, 5-fold CV on the training split,
    # selecting by macro-averaged F1.
    grid = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        scoring='f1_macro',
        verbose=2,
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    print(f"Best Params: {grid.best_params_}")
    print(f"Best CV Macro F1: {grid.best_score_:.4f}")

    best_model = grid.best_estimator_

    # Save best model to cache
    with open(BEST_MODEL_CACHE, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"Saved best model to {BEST_MODEL_CACHE}")

# Macro-averaged F1 of the selected model on the dev split.
dev_preds = best_model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score (best model): {dev_macro_f1:.4f}")

# Predict the test split and look each predicted id up in id2label.
test_preds = best_model.predict(X_test)
test_labels = list(map(id2label.__getitem__, test_preds))

# Write the submission file.
os.makedirs('outputs', exist_ok=True)
output_df = pd.DataFrame({'id': test_df['id'], 'label': test_labels})
output_df.to_csv('outputs/track_3_test.csv', index=False)

print("Saved Track 3 submission to outputs/track_3_test.csv")


# TFIDF + XGBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb
from tqdm import tqdm
import os

# Read the train / dev / test splits.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/dev.csv', '../data/test.csv')
)

# Preprocessing: pool train + dev labels (label vocabulary) and texts (vectorizer fit).
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()

# Sorted unique labels and the lookup tables built from their positions.
labels = sorted(set(full_train_labels))
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer-encoded targets.
y_train = [label2id[name] for name in train_df['label']]
y_dev = [label2id[name] for name in dev_df['label']]

# TF-IDF Vectorization
# This cell's import list does not include TfidfVectorizer, so it previously
# ran only if an earlier cell had already imported the name. Import it here to
# keep the cell self-contained like the other sections.
from sklearn.feature_extraction.text import TfidfVectorizer

# Character n-grams of length 2-6 inside word boundaries, vocabulary capped
# at 50,000 features.
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2,6),
    max_features=50000
)

print("Vectorizing data")
# The vocabulary / IDF weights are learned from the pooled train + dev texts.
tfidf.fit(full_train_texts)

X_train = tfidf.transform(train_df['text'])
X_dev = tfidf.transform(dev_df['text'])
X_test = tfidf.transform(test_df['text'])
print("Vectorization complete.")

# Fit a multi-class XGBoost classifier on the TF-IDF features.
print("Training XGBoost...")

# Splits whose metric is logged every boosting round
# (reported as validation_0 = train, validation_1 = dev).
eval_set = [(X_train, y_train), (X_dev, y_dev)]

model = xgb.XGBClassifier(
    objective='multi:softmax', num_class=len(labels),
    n_estimators=100, learning_rate=0.1, max_depth=3,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbosity=1,
)

model.fit(
    X_train,
    y_train,
    verbose=True,
    eval_set=eval_set,
    # Disabled options, kept for reference:
    # eval_metric='mlogloss',
    # early_stopping_rounds=10,  # stop if no dev improvement in 10 rounds
)

# Dev Set Evaluation
print("Evaluating on dev set...")
dev_preds = model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score: {dev_macro_f1:.4f}")

# # Predict on Test Set
# test_preds = model.predict(X_test)
# test_labels = [id2label[pred] for pred in test_preds]

# # Save submission
# output_df = pd.DataFrame({
#     'id': test_df['id'],
#     'label': test_labels
# })
# os.makedirs('outputs', exist_ok=True)
# output_df.to_csv('outputs/track_3_test.csv', index=False)

# print("Saved Track 3 submission to outputs/track_3_test.csv")

"""
'n_estimators': [300, 500],
'max_depth': [4, 6, 8],
'learning_rate': [0.01, 0.05, 0.1],
'subsample': [0.7, 0.8, 1.0],
'colsample_bytree': [0.7, 0.8, 1.0]
"""

# score = 0.3934

Vectorizing data
Vectorization complete.
Training XGBoost...
[0]	validation_0-mlogloss:1.59660	validation_1-mlogloss:1.59694
[1]	validation_0-mlogloss:1.58542	validation_1-mlogloss:1.58588
[2]	validation_0-mlogloss:1.57677	validation_1-mlogloss:1.57749
[3]	validation_0-mlogloss:1.56816	validation_1-mlogloss:1.56938
[4]	validation_0-mlogloss:1.56004	validation_1-mlogloss:1.56188
[5]	validation_0-mlogloss:1.55268	validation_1-mlogloss:1.55508
[6]	validation_0-mlogloss:1.54618	validation_1-mlogloss:1.54884
[7]	validation_0-mlogloss:1.54010	validation_1-mlogloss:1.54338
[8]	validation_0-mlogloss:1.53430	validation_1-mlogloss:1.53803
[9]	validation_0-mlogloss:1.52921	validation_1-mlogloss:1.53344
[10]	validation_0-mlogloss:1.52471	validation_1-mlogloss:1.52969
[11]	validation_0-mlogloss:1.52035	validation_1-mlogloss:1.52629
[12]	validation_0-mlogloss:1.51612	validation_1-mlogloss:1.52260
[13]	validation_0-mlogloss:1.51210	validation_1-mlogloss:1.51908
[14]	validation_0-mlogloss:1.50845	vali

"\n'n_estimators': [300, 500],\n'max_depth': [4, 6, 8],\n'learning_rate': [0.01, 0.05, 0.1],\n'subsample': [0.7, 0.8, 1.0],\n'colsample_bytree': [0.7, 0.8, 1.0]\n"

# Old

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from tqdm import tqdm

# 1. Load data
# ----------------
# Read the three CSVs into DataFrames; only train and dev are used for labels.
train, dev, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "dev.csv", "test.csv")
)

# 2. Vectorize text
# --------------------
# TF-IDF features:
# - ngram_range=(2,6): n-grams of length 2 to 6. No `analyzer` is passed, so
#   scikit-learn's default word analyzer applies and these are WORD n-grams,
#   not character n-grams.
# - min_df=5: ignore infrequent n-grams to reduce noise.
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(2,6))
y_train, y_dev = train['label'], dev['label']
# The vocabulary is learned from TRAIN only; dev and test are only transformed.
X_train = vectorizer.fit_transform(train['text'])
X_dev = vectorizer.transform(dev['text'])
X_test = vectorizer.transform(test['text'])

# 3. Hyperparameter tuning with progress bar
# --------------------------------------------
# Random Forest hyperparameters being searched:
# - n_estimators: number of trees in the forest
# - max_depth: maximum depth of each tree
# - max_features: number of features to consider at each split

# Wider grid, kept for reference:
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20],
#     'max_features': ['sqrt', 'log2', 0.3]
# }

# Grid actually searched (a single combination).
param_grid = {'n_estimators': [200], 'max_depth': [None], 'max_features': ['sqrt']}

# Every (n_estimators, max_depth, max_features) combination of the grid.
import itertools
param_list = list(
    itertools.product(
        param_grid['n_estimators'], param_grid['max_depth'], param_grid['max_features']
    )
)

best_score, best_params = 0, None
print("Tuning Random Forest hyperparameters:")
for n_est, depth, feat in tqdm(param_list, desc="Grid Search", unit="combo"):
    # Candidate forest for this combination.
    rf = RandomForestClassifier(
        n_estimators=n_est, max_depth=depth, max_features=feat,
        oob_score=False, random_state=42, n_jobs=-1,
    )
    # Mean macro-F1 over 3-fold cross-validation on TRAIN.
    scores = cross_val_score(rf, X_train, y_train, cv=3, scoring='f1_macro', n_jobs=-1)
    mean_score = scores.mean()
    # Keep the current best unless this combination strictly improves on it.
    if not (mean_score > best_score):
        continue
    best_score = mean_score
    best_params = dict(n_estimators=n_est, max_depth=depth, max_features=feat)

print(f"\nBest params: {best_params} with CV macro-F1 = {best_score:.4f}")

# 4. Train best Random Forest on full TRAIN set
# ------------------------------------------------
# Refit with the selected hyperparameters (fit() returns the estimator).
best_rf = RandomForestClassifier(
    n_jobs=-1, random_state=42, **best_params
).fit(X_train, y_train)

# 5. Evaluate on DEV set
# -----------------------
dev_preds = best_rf.predict(X_dev)
print("DEV Macro F1:", f1_score(y_dev, dev_preds, average='macro'))

# 6. (Optional) Retrain on TRAIN + DEV and predict TEST
# ------------------------------------------------------
# all_texts = pd.concat([train['text'], dev['text']])
# all_labels = pd.concat([train['label'], dev['label']])
# X_all = vectorizer.fit_transform(all_texts)
# best_rf.fit(X_all, all_labels)
# test_preds = best_rf.predict(X_test)
# submission = pd.DataFrame({'id': test['id'], 'label': test_preds})
# submission.to_csv('track_1_test.csv', index=False)


Tuning Random Forest hyperparameters:


Grid Search: 100%|██████████| 1/1 [15:44<00:00, 944.87s/combo]



Best params: {'n_estimators': 200, 'max_depth': None, 'max_features': 'sqrt'} with CV macro-F1 = 0.2862
DEV Macro F1: 0.2918208850519727


# Logistic regression

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from tqdm import tqdm

# 1. Load data
# ----------------
# train / dev carry labels; test is unlabeled.
train, dev, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "dev.csv", "test.csv")
)

# 2. Vectorize text
# --------------------
# TF-IDF over n-grams of length 2-6 that occur in at least 5 documents. No
# `analyzer` is passed, so scikit-learn's default word analyzer is used
# (word n-grams). The vocabulary is learned from TRAIN only.
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(2,6))
y_train, y_dev = train['label'], dev['label']
X_train = vectorizer.fit_transform(train['text'])
X_dev = vectorizer.transform(dev['text'])
X_test = vectorizer.transform(test['text'])

# 3. Manual hyperparameter tuning with progress bar
# ---------------------------------------------------
# Candidate regularization strengths; the C with the highest mean CV macro-F1 wins.
param_C = [0.01, 0.1, 1, 10]
best_score, best_C = 0, None
print("Tuning regularization strength C:")
for C in tqdm(param_C, desc="Grid Search - C values", unit="value"):
    # L2-penalized logistic regression for this candidate C.
    lr = LogisticRegression(
        C=C, penalty='l2', solver='saga', max_iter=1000, random_state=42, n_jobs=-1
    )
    # Mean macro-F1 over 3-fold cross-validation on the training set.
    scores = cross_val_score(lr, X_train, y_train, cv=3, scoring='f1_macro', n_jobs=-1)
    mean_score = scores.mean()
    # Keep the current best unless this C strictly improves on it.
    if not (mean_score > best_score):
        continue
    best_score, best_C, best_scores = mean_score, C, scores

print(f"\nBest C found: {best_C} with CV macro-F1 = {best_score:.4f}")

# Refit on the full TRAIN set with the selected C (fit() returns the estimator).
best_lr = LogisticRegression(
    C=best_C, penalty='l2', solver='saga', max_iter=1000, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# 4. Evaluate on DEV set
# -----------------------
dev_preds = best_lr.predict(X_dev)
print("DEV Macro F1:", f1_score(y_dev, dev_preds, average='macro'))

# 5. (Optional) Retrain on TRAIN + DEV and predict TEST
# ------------------------------------------------------
# all_texts = pd.concat([train['text'], dev['text']])
# all_labels = pd.concat([train['label'], dev['label']])
# X_all = vectorizer.fit_transform(all_texts)
# best_lr.fit(X_all, all_labels)
# test_preds = best_lr.predict(X_test)

# # Save submission
# submission = pd.DataFrame({'id': test['id'], 'label': test_preds})
# submission.to_csv('track_1_test.csv', index=False)


Tuning regularization strength C:


Grid Search - C values: 100%|██████████| 4/4 [00:09<00:00,  2.49s/value]



Best C found: 1 with CV macro-F1 = 0.3164
DEV Macro F1: 0.3149562265024328
