In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

In [4]:
# Load data
df = pd.read_csv('data/train.csv')
X = df['comment_text'].fillna(' ')
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [5]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
X_vec = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

In [6]:
# Define models
models = {
    'LogisticRegression': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    'NaiveBayes': OneVsRestClassifier(MultinomialNB()),
    'SVM': OneVsRestClassifier(LinearSVC())
}


In [9]:
# Train + evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    # Save
    joblib.dump(model, f'models/{name.lower()}_model.pkl')


Training LogisticRegression...
Results for LogisticRegression:
              precision    recall  f1-score   support

           0       0.91      0.59      0.72      3056
           1       0.58      0.26      0.36       321
           2       0.92      0.61      0.73      1715
           3       0.47      0.11      0.18        74
           4       0.83      0.48      0.61      1614
           5       0.69      0.14      0.23       294

   micro avg       0.88      0.53      0.66      7074
   macro avg       0.73      0.36      0.47      7074
weighted avg       0.87      0.53      0.65      7074
 samples avg       0.05      0.05      0.05      7074


Training NaiveBayes...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Results for NaiveBayes:
              precision    recall  f1-score   support

           0       0.96      0.36      0.53      3056
           1       0.20      0.00      0.01       321
           2       0.95      0.29      0.45      1715
           3       0.50      0.01      0.03        74
           4       0.87      0.20      0.32      1614
           5       0.00      0.00      0.00       294

   micro avg       0.93      0.27      0.42      7074
   macro avg       0.58      0.15      0.22      7074
weighted avg       0.86      0.27      0.41      7074
 samples avg       0.03      0.02      0.02      7074


Training SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Results for SVM:
              precision    recall  f1-score   support

           0       0.86      0.69      0.77      3056
           1       0.54      0.29      0.38       321
           2       0.89      0.70      0.78      1715
           3       0.57      0.27      0.37        74
           4       0.79      0.57      0.66      1614
           5       0.71      0.24      0.36       294

   micro avg       0.84      0.62      0.72      7074
   macro avg       0.73      0.46      0.55      7074
weighted avg       0.83      0.62      0.71      7074
 samples avg       0.06      0.06      0.06      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
# Save vectorizer
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

['models/tfidf_vectorizer.pkl']