In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import joblib # for saving the model

# Load the train/test CSVs (paths are relative to the notebook's working directory).
print("Loading dataset...")
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# The six target columns, named once so every selection below stays in sync.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Row-wise maximum across the six label columns, i.e. a single "any label set"
# flag (assuming the labels are 0/1 -- TODO confirm). This column is not read
# by any of the cells visible in this notebook.
df_train['label'] = df_train[LABEL_COLUMNS].max(axis=1)
df_test['label'] = df_test[LABEL_COLUMNS].max(axis=1)

# TF-IDF over unigrams and bigrams, capped at 20,000 features.
# The vocabulary is fitted on the training comments only; the test comments
# are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(df_train['comment_text'])
Y_train = df_train[LABEL_COLUMNS]
X_test = vectorizer.transform(df_test['comment_text'])
Y_test = df_test[LABEL_COLUMNS]

# Persist the fitted vectorizer so the prediction cell can reload it.
# The object dumped must be `vectorizer` (the one fitted above): the previous
# code dumped an undefined name `tokenizer`, which only worked when a stale
# variable of that name happened to exist in the kernel.
with open('models/vectorizer.pkl', 'wb') as f:
    joblib.dump(vectorizer, f)
print("Dataset loaded and converted to vector")

Loading dataset...
Dataset loaded and converted to vector


In [2]:
def _report_metrics(split, y_true, y_pred):
    """Print and return the evaluation metrics for one dataset split.

    Args:
        split: Name shown in parentheses in each printed line (e.g. "train").
        y_true: Ground-truth multi-label targets.
        y_pred: Predicted labels, same shape as ``y_true``.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``. F1, precision and recall
        are micro-averaged over all labels.
    """
    # NOTE: for multi-label targets scikit-learn's accuracy_score is *subset*
    # accuracy (a row counts only if all of its labels match), so it is not
    # directly comparable to the micro-averaged scores below.
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='micro')
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    print(f"Accuracy({split}): {accuracy}")
    print(f"F1 Score({split}): {f1}")
    print(f"Precision({split}): {precision}")
    print(f"Recall({split}): {recall}")
    return accuracy, f1, precision, recall


print("Training model...")
# Train a Gradient Boosting model for multi-label classification:
# OneVsRestClassifier wraps the base estimator to handle the multi-column target.
base_model = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=5, random_state=42)
model = OneVsRestClassifier(base_model)

model.fit(X_train, Y_train)

joblib.dump(model, 'models/toxic_classifier.pkl') # save the model
print("Model training complete.")

# Evaluate the model on the data it was fitted on ...
print("Evaluating model on the train dataset...")
Y_pred = model.predict(X_train)
accuracy, f1, precision, recall = _report_metrics("train", Y_train, Y_pred)

print("")

# ... and on the test split. Y_pred and the metric variables are overwritten,
# so after this cell they hold the test-set values.
print("Evaluating model on the test dataset...")
Y_pred = model.predict(X_test)
accuracy, f1, precision, recall = _report_metrics("test", Y_test, Y_pred)

Training model...
Model training complete.
Evaluating model on the train dataset...
Accuracy(train): 0.9102092485476684
F1 Score(train): 0.47319865032247044
Precision(train): 0.944662346521146
Recall(train): 0.3156590119095105

Evaluating model on the test dataset...
Accuracy(test): 0.9058394923088976
F1 Score(test): 0.20469187484709273
Precision(test): 0.16891740320337387
Recall(test): 0.25969099186094635


In [1]:
# Making predictions using the model
import joblib 

# Restore the persisted TF-IDF vectorizer and classifier from disk.
# joblib.load unpickles the files, so only load artifacts you trust.
vectorizer = joblib.load('models/vectorizer.pkl')
model = joblib.load('models/toxic_classifier.pkl')
def predict(comment):
    """Classify a single comment with the loaded model.

    The comment is wrapped in a one-element list, transformed by the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        comment: The comment text to classify.

    Returns:
        The first (and only) row of ``model.predict``'s output.
    """
    features = vectorizer.transform([comment])
    return model.predict(features)[0]

# Sample inputs for a quick manual check of the classifier.
comments = [
    "I really love this!",
    "This is the worst thing ever.",
    "You are an idiot and should be banned!",
    "The government needs to address this issue.",
    "You fucking idiot, I am gonna make you suffer.",
    "that's fucking awesome."
]

print("Classifying example comments...")
# map() is lazy, so each comment is still classified right before it is printed.
for comment, labels in zip(comments, map(predict, comments)):
    print(f"Comment: {comment}\nLabels: {labels}\n")

Classifying example comments...
Comment: I really love this!
Labels: [0 0 0 0 0 0]

Comment: This is the worst thing ever.
Labels: [0 0 0 0 0 0]

Comment: You are an idiot and should be banned!
Labels: [0 0 0 0 0 0]

Comment: The government needs to address this issue.
Labels: [0 0 0 0 0 0]

Comment: You fucking idiot, I am gonna make you suffer.
Labels: [1 0 1 0 1 0]

Comment: that's fucking awesome.
Labels: [1 0 1 0 0 0]

