In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from tqdm import tqdm

# 1. Load data
# ----------------
# Read CSVs into pandas DataFrames. 'test' has no labels.
train = pd.read_csv("train.csv")  # Training set with labels
dev = pd.read_csv("dev.csv")      # Development set with labels
test = pd.read_csv("test.csv")    # Test set without labels

# 2. Vectorize text
# --------------------
# Convert raw text into TF-IDF feature vectors.
# - analyzer='char': build n-grams from characters. TfidfVectorizer defaults to
#   analyzer='word', under which ngram_range=(2,6) would mean word 2- to
#   6-grams with no unigrams at all, which is not what is described here.
# - ngram_range=(2,6): consider character n-grams of length 2 to 6.
# - min_df=5: ignore infrequent n-grams to reduce noise.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,6), min_df=5)
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']
# DEV and TEST are only transformed, so they share the vocabulary learned on TRAIN.
X_dev = vectorizer.transform(dev['text'])
y_dev = dev['label']
X_test = vectorizer.transform(test['text'])

# 3. Hyperparameter tuning with progress bar
# --------------------------------------------
# Define grid of Random Forest hyperparameters:
# - n_estimators: number of trees in the forest
# - max_depth: maximum depth of each tree
# - max_features: number of features to consider at each split
#
# Every value MUST be a list: itertools.product iterates over each entry, so a
# bare int raises TypeError and a bare string such as 'sqrt' would be expanded
# into its individual characters.

# Full grid (swap in for a real search):
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20],
#     'max_features': ['sqrt', 'log2', 0.3]
# }

# Single-combination grid for a quick run.
param_grid = {
    'n_estimators': [200],
    'max_depth': [None],
    'max_features': ['sqrt']
}

# Create a list of all parameter combinations
import itertools
param_list = list(itertools.product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['max_features']
))

# Start below any attainable score so that a combination scoring exactly 0
# is still recorded instead of leaving best_params unset.
best_score = float('-inf')
best_params = None
print("Tuning Random Forest hyperparameters:")
for n_est, depth, feat in tqdm(param_list, desc="Grid Search", unit="combo"):
    # Initialize a Random Forest with current hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_est,
        max_depth=depth,
        max_features=feat,
        oob_score=False,
        random_state=42,
        n_jobs=-1
    )
    # 3-fold cross-validation for macro-F1 on TRAIN
    scores = cross_val_score(
        rf, X_train, y_train,
        scoring='f1_macro',
        cv=3,
        n_jobs=-1
    )
    mean_score = scores.mean()
    # Update best parameters if current is better
    if mean_score > best_score:
        best_score = mean_score
        best_params = {'n_estimators': n_est, 'max_depth': depth, 'max_features': feat}

# Fail loudly here rather than with an opaque "argument after ** must be a
# mapping" error below (happens for an empty grid or when every score is NaN).
if best_params is None:
    raise RuntimeError("Grid search produced no valid score; check param_grid and the CV folds.")

print(f"\nBest params: {best_params} with CV macro-F1 = {best_score:.4f}")

# 4. Train best Random Forest on full TRAIN set
# ------------------------------------------------
best_rf = RandomForestClassifier(
    **best_params,
    random_state=42,
    n_jobs=-1
)
best_rf.fit(X_train, y_train)

# 5. Evaluate on DEV set
# -----------------------
dev_preds = best_rf.predict(X_dev)
print("DEV Macro F1:", f1_score(y_dev, dev_preds, average='macro'))

# 6. (Optional) Retrain on TRAIN + DEV and predict TEST
# ------------------------------------------------------
# Refitting the vectorizer changes the vocabulary, so TEST must be transformed
# again; the X_test built above belongs to the TRAIN-only vocabulary.
# all_texts = pd.concat([train['text'], dev['text']])
# all_labels = pd.concat([train['label'], dev['label']])
# X_all = vectorizer.fit_transform(all_texts)
# X_test_all = vectorizer.transform(test['text'])
# best_rf.fit(X_all, all_labels)
# test_preds = best_rf.predict(X_test_all)
# submission = pd.DataFrame({'id': test['id'], 'label': test_preds})
# submission.to_csv('track_1_test.csv', index=False)


: 

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

# 1. Load data: labelled TRAIN and DEV splits plus the TEST split.
train, dev, test = map(pd.read_csv, ("train.csv", "dev.csv", "test.csv"))

# 2. Vectorize text
# The TF-IDF vocabulary is learned on TRAIN only; DEV is projected onto it.
# NOTE(review): no analyzer is passed, so these are word n-grams of length
# 2 to 6 (no unigrams) -- confirm that this is the intended feature set.
y_train = train['label']
y_dev = dev['label']
vectorizer = TfidfVectorizer(ngram_range=(2,6), min_df=5)
X_train = vectorizer.fit_transform(train['text'])
X_dev = vectorizer.transform(dev['text'])

# 3. One fixed Random Forest configuration:
#    200 trees, unlimited depth, sqrt(n_features) candidates per split.
rf_params = dict(n_estimators=200, max_depth=None, max_features='sqrt')
print(f"Training Random Forest with params: {rf_params}")

# 4. Estimate performance with 3-fold cross-validation on TRAIN (macro-F1).
rf = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_params)
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
)
print(f"CV Macro F1 scores: {cv_scores}")
print(f"Mean CV Macro F1: {cv_scores.mean():.4f}")

# 5. Fit on all of TRAIN, then score the held-out DEV split.
#    fit() returns the estimator itself, so the prediction call can be chained.
dev_preds = rf.fit(X_train, y_train).predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print("DEV Macro F1:", dev_macro_f1)

# 6. (Optional) Retrain on TRAIN+DEV and predict TEST
# The vectorizer is refit on the combined text, and TEST is transformed with
# that refit vectorizer before predicting.
# all_texts = pd.concat([train['text'], dev['text']])
# all_labels = pd.concat([train['label'], dev['label']])
# X_all = vectorizer.fit_transform(all_texts)
# rf.fit(X_all, all_labels)
# test_preds = rf.predict(vectorizer.transform(test['text']))
# submission = pd.DataFrame({'id': test['id'], 'label': test_preds})
# submission.to_csv('track_1_test.csv', index=False)


: 

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from tqdm import tqdm

# 1. Load data
# ----------------
train = pd.read_csv("train.csv")  # Training set
dev = pd.read_csv("dev.csv")      # Development/validation set
test = pd.read_csv("test.csv")    # Test set (unlabeled)

# 2. Vectorize text
# --------------------
# The TF-IDF vocabulary is learned on TRAIN only; DEV and TEST are projected
# onto it.
# NOTE(review): no analyzer is passed, so these are word n-grams of length
# 2 to 6 (no unigrams) -- confirm that this is the intended feature set.
vectorizer = TfidfVectorizer(ngram_range=(2,6), min_df=5)
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']
X_dev = vectorizer.transform(dev['text'])
y_dev = dev['label']
X_test = vectorizer.transform(test['text'])

# 3. Manual hyperparameter tuning with progress bar
# ---------------------------------------------------
# Settings shared by every candidate and by the final model, kept in one place
# so the model that is searched and the model that is refit cannot drift apart.
lr_kwargs = dict(
    penalty='l2',
    solver='saga',
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

# Define grid of regularization strengths
param_C = [0.01, 0.1, 1, 10]
# Start below any attainable score so that a candidate scoring exactly 0 is
# still recorded instead of leaving best_C unset.
best_score = float('-inf')
best_C = None
best_scores = None  # per-fold scores of the winning C, kept for inspection
print("Tuning regularization strength C:")
for C in tqdm(param_C, desc="Grid Search - C values", unit="value"):
    # Initialize model with current C
    lr = LogisticRegression(C=C, **lr_kwargs)
    # Perform 3-fold CV on training set, scoring macro-F1
    scores = cross_val_score(
        lr, X_train, y_train,
        scoring='f1_macro',
        cv=3,
        n_jobs=-1
    )
    mean_score = scores.mean()
    if mean_score > best_score:
        best_score = mean_score
        best_C = C
        best_scores = scores

# Fail loudly here instead of passing C=None to LogisticRegression below
# (happens for an empty grid or when every CV score is NaN).
if best_C is None:
    raise RuntimeError("C search produced no valid score; check param_C and the CV folds.")

print(f"\nBest C found: {best_C} with CV macro-F1 = {best_score:.4f}")

# Train best model on full TRAIN
best_lr = LogisticRegression(C=best_C, **lr_kwargs)
best_lr.fit(X_train, y_train)

# 4. Evaluate on DEV set
# -----------------------
dev_preds = best_lr.predict(X_dev)
print("DEV Macro F1:", f1_score(y_dev, dev_preds, average='macro'))

# 5. (Optional) Retrain on TRAIN + DEV and predict TEST
# ------------------------------------------------------
# Refitting the vectorizer changes the vocabulary, so TEST must be transformed
# again; the X_test built above belongs to the TRAIN-only vocabulary.
# all_texts = pd.concat([train['text'], dev['text']])
# all_labels = pd.concat([train['label'], dev['label']])
# X_all = vectorizer.fit_transform(all_texts)
# X_test_all = vectorizer.transform(test['text'])
# best_lr.fit(X_all, all_labels)
# test_preds = best_lr.predict(X_test_all)

# # Save submission
# submission = pd.DataFrame({'id': test['id'], 'label': test_preds})
# submission.to_csv('track_1_test.csv', index=False)


Tuning regularization strength C:


Grid Search - C values: 100%|██████████| 4/4 [00:09<00:00,  2.49s/value]



Best C found: 1 with CV macro-F1 = 0.3164
DEV Macro F1: 0.3149562265024328
