In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# For scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Optional: set a random seed
RANDOM_SEED = 42


In [21]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Sample submission shape:", sample_submission.shape)

train_df.head()


Train shape: (159571, 8)
Test shape: (153164, 2)
Sample submission shape: (153164, 7)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [22]:
# Inspect columns & missing data
train_df.info()
train_df.isnull().sum()

label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# How many comments have each label?
print(train_df[label_cols].sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [23]:
import re

def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Example: limit repeated punctuation like "!!!!" to "!!!"
    text = re.sub(r'([!?])\1{2,}', r'\1\1\1', text)
    
    # Normalize elongated words: "cooooool" -> "coool"
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    
    # Keep letters, digits, some punctuation: @, #, !, ?, *, etc.
    text = re.sub(r'[^a-z0-9@#!?*]+', ' ', text)
    
    return text

X_train_cleaned = X_train.apply(clean_text)
X_val_cleaned   = X_val.apply(clean_text)

In [24]:
X = train_df["comment_text"]
y = train_df[label_cols]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED
)

X_train.shape, X_val.shape


((127656,), (31915,))

In [25]:
X_train_cleaned = X_train.apply(clean_text)
X_val_cleaned   = X_val.apply(clean_text)


In [26]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# 1. Define separate vectorizers
word_vect = TfidfVectorizer(
    stop_words='english',
    max_features=20000,       
    ngram_range=(1,2),        # unigrams + bigrams
    lowercase=True
)

char_vect = TfidfVectorizer(
    analyzer='char_wb',       # word-boundary char n-grams
    ngram_range=(3,5),
    max_features=30000       
)

# 2. Combine them
combined_features = FeatureUnion([
    ('word_v', word_vect),
    ('char_v', char_vect)
])

# 3. Build a pipeline
pipeline = Pipeline([
    ('features', combined_features),
    ('clf', OneVsRestClassifier(
        LogisticRegression(
            class_weight='balanced',
            C=10,            
            max_iter=200
        )
    ))
])

# 4. Fit on training set
pipeline.fit(X_train_cleaned, y_train)

# 5. Predict on validation set
val_preds = pipeline.predict(X_val_cleaned)

from sklearn.metrics import classification_report
print(classification_report(y_val, val_preds, zero_division=0, target_names=label_cols))



               precision    recall  f1-score   support

        toxic       0.70      0.83      0.76      3056
 severe_toxic       0.35      0.71      0.47       321
      obscene       0.76      0.87      0.81      1715
       threat       0.31      0.78      0.45        74
       insult       0.63      0.83      0.71      1614
identity_hate       0.34      0.64      0.45       294

    micro avg       0.64      0.82      0.72      7074
    macro avg       0.52      0.78      0.61      7074
 weighted avg       0.66      0.82      0.73      7074
  samples avg       0.07      0.08      0.07      7074



In [27]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

param_distributions = {
    'features__word_v__max_features': [10000, 20000],
    'features__char_v__max_features': [15000, 30000],
    # Sample C from a log distribution between 0.01 and 100
    'clf__estimator__C': st.loguniform(1e-2, 1e2)
}

rand_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,               # number of random combos to try
    scoring='f1_micro',      # or 'roc_auc' if Kaggle uses AUC
    cv=3,
    verbose=2,
    random_state=42
)

rand_search.fit(X_train_cleaned, y_train)

print("Best Params:", rand_search.best_params_)
print("Best Score:", rand_search.best_score_)

best_model = rand_search.best_estimator_

val_preds = best_model.predict(X_val_cleaned)
print(classification_report(y_val, val_preds, zero_division=0, target_names=label_cols))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END clf__estimator__C=0.31489116479568624, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  36.3s
[CV] END clf__estimator__C=0.31489116479568624, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  36.3s
[CV] END clf__estimator__C=0.31489116479568624, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  37.0s
[CV] END clf__estimator__C=8.471801418819979, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  41.6s
[CV] END clf__estimator__C=8.471801418819979, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  39.4s
[CV] END clf__estimator__C=8.471801418819979, features__char_v__max_features=15000, features__word_v__max_features=10000; total time=  40.7s
[CV] END clf__estimator__C=0.04207988669606638, features__char_v__max_features=15000, f

In [28]:
X_full_cleaned = train_df["comment_text"].apply(clean_text)
y_full = train_df[label_cols]

final_model = best_model  # if from RandomizedSearch
final_model.fit(X_full_cleaned, y_full)


In [29]:
test_cleaned = test_df["comment_text"].apply(clean_text)
test_preds = final_model.predict_proba(test_cleaned)  


In [31]:
submission_df = sample_submission.copy()
submission_df[label_cols] = test_preds  # If probabilities are required

submission_df.to_csv("my_submission1.csv", index=False)
print("Submission file saved: my_submission.csv")


Submission file saved: my_submission.csv
