# Baseline approach: TF-IDF
As a first approach, we will use TF-IDF features with a One-vs-Rest logistic regression classifier. Final results obtained against the validation set:

- toxic: ROC AUC = 0.8261
- severe_toxic: ROC AUC = 0.6264
- obscene: ROC AUC = 0.8336
- threat: ROC AUC = 0.6214
- insult: ROC AUC = 0.7721
- identity_hate: ROC AUC = 0.6237

# 1. Dependencies

In [None]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, roc_auc_score

from config import RAW_DATA_DIR, MODEL_BASE_PATH, LABELS, RANDOM_STATE

# 2. Configuration

In [None]:
# Hyperparameters and identifiers used by the cells below
TEST_SIZE = 0.2              # Fraction of the data held out for validation
MAX_FEATURES = 10000         # Upper bound on the TF-IDF vocabulary size
C = 4                        # Inverse of regularization strength for Logistic Regression (smaller = stronger regularization)
SOLVER = 'liblinear'         # Solver passed to Logistic Regression
MODEL_ID = 'baseline_tfidf'  # Sub-directory name under MODEL_BASE_PATH where artifacts are saved

# 3. Read data

In [None]:
# Read the raw training CSV into a DataFrame
train_csv_path = RAW_DATA_DIR / 'train.csv'
df = pd.read_csv(train_csv_path)

# Quick sanity check: dimensions and column names
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# 4. Preprocess data

In [None]:
# Coerce columns to their expected dtypes: every label column becomes int,
# then the identifier/text columns become str. Columns that are absent from
# the DataFrame are skipped.
str_cols = ['id', 'comment_text']
dtype_plan = [(name, int) for name in LABELS] + [(name, str) for name in str_cols]
for name, target_dtype in dtype_plan:
    if name not in df.columns:
        continue
    df[name] = df[name].astype(target_dtype)

In [None]:
# Model input is the raw comment text; targets are the label columns
X, y = df['comment_text'], df[LABELS]

# Hold out a TEST_SIZE fraction of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# 5. Build pipeline

In [None]:
# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at MAX_FEATURES terms
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=MAX_FEATURES,
)

# Fit the vectorizer on the training text only, then reuse it for validation
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# One-vs-Rest wrapper around a Logistic Regression base estimator
base_estimator = LogisticRegression(C=C, solver=SOLVER)
clf = OneVsRestClassifier(base_estimator)
clf.fit(X_train_vec, y_train)

# 6. Evaluate model

In [None]:
# Hard 0/1 predictions: used for the precision/recall/F1 report
y_pred = clf.predict(X_val_vec)

# Per-label probabilities: used for ROC AUC. ROC AUC measures how well the
# scores rank positives above negatives, so it needs continuous scores;
# thresholded 0/1 predictions collapse the curve to a single operating point
# and give a misleading value.
y_score = clf.predict_proba(X_val_vec)

# Classification report per label
for i, label in enumerate(LABELS):
    print(f"\n=== {label.upper()} ===")
    print(classification_report(y_val[label], y_pred[:, i]))

# ROC AUC per label (column i of y_score corresponds to LABELS[i])
for i, label in enumerate(LABELS):
    score = roc_auc_score(y_val[label], y_score[:, i])
    print(f"{label}: ROC AUC = {score:.4f}")

# 7. Save model and vectorizer

In [None]:
# Save the fitted classifier and vectorizer under MODEL_BASE_PATH / MODEL_ID
model_dir = MODEL_BASE_PATH / MODEL_ID

# parents=True creates MODEL_BASE_PATH when it is missing; exist_ok=True makes
# re-running the cell safe without a separate (race-prone) exists() check
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, model_dir / 'model.pkl')
joblib.dump(vectorizer, model_dir / 'vectorizer.pkl')