 ✅ Detect offensive, informal, and camouflaged content
✅ Handle multilingual and mixed-language text, including emojis
✅ Generalize well to new, unseen offensive variations

# Feature Engineering (Extracting Useful Information for ML)

In [2]:
# Plan for extracting features from the comments. The bare string below only
# holds notes; as the last expression of the cell it is echoed as the cell output.
'''
 - TF-IDF Vectorization (to convert words into numbers) --> captures important words
 - Extract Stylometric Features (uppercase %, special characters, word elongation) --> detects hidden offensive tricks
 - Detect Camouflaged Patterns (spacing tricks, leetspeak, excessive punctuation)
'''

'\n - TF-IDF Vectorization (to convert words into numbers) --> captures important words\n - Extract Stylometric Features (uppercase %, special characters, word elongation) --> detects hidden offensive tricks\n - Detect Camouflaged Patterns (spacing tricks, leetspeak, excessive punctuation)\n'

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

In [4]:
# Read the comments dataset (path is relative to the working directory).
# Later cells use its 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer="Comments_Dataset.csv")

In [5]:
# 1. TF-IDF vectorization: turn each comment into a numeric vector whose
#    vocabulary is capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split made further down, and .toarray() materialises the result
# as a dense array.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_sparse = vectorizer.fit_transform(df["text"])
X_tfidf = tfidf_sparse.toarray()

In [6]:
# 2. Extracting Stylometric Features
def extract_text_features(text):
    """Compute stylometric features for a single comment.

    Parameters
    ----------
    text : str
        Raw comment text. Missing values (``None`` or a float NaN, which is
        what ``pd.read_csv`` produces for an empty cell) are treated as an
        empty comment; any other non-string value is converted with ``str()``.

    Returns
    -------
    dict
        ``uppercase_ratio`` -- uppercase characters divided by total
        characters (``0`` for empty text).
        ``special_char_count`` -- characters that are neither alphanumeric
        nor a plain space (tabs and newlines are therefore counted).
        ``elongated_word_count`` -- whitespace-separated words that have at
        least three more characters than distinct characters.
        ``contains_repeated_letters`` -- ``True`` if any word contains the
        same character three or more times in a row.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    total_chars = len(text)
    words = text.split()
    uppercase_chars = sum(1 for c in text if c.isupper())

    return {
        "uppercase_ratio": uppercase_chars / total_chars if total_chars > 0 else 0,
        "special_char_count": sum(1 for c in text if not c.isalnum() and c != " "),
        # NOTE(review): this compares distinct vs. total characters, so it also
        # counts words whose duplicated characters are not consecutive
        # (e.g. "banana"); kept as-is so existing feature values do not change.
        "elongated_word_count": sum(1 for w in words if len(set(w)) < len(w) - 2),
        "contains_repeated_letters": any(re.search(r"(.)\1{2,}", w) for w in words),
    }

In [7]:
# Compute one feature dict per comment and expand the dicts into columns
# (one row per comment, one column per feature key).
df_features = pd.DataFrame(
    [extract_text_features(comment) for comment in df["text"]]
)

In [8]:
# 3. Combine Features --> helps making the model more robust
import numpy as np

# df_features mixes float, int and bool columns, so `.values` would be an
# object-dtype array and np.hstack would then promote the whole TF-IDF matrix
# to Python objects as well. Casting to float64 first keeps X_combined a plain
# numeric array (True/False become 1.0/0.0).
X_combined = np.hstack((X_tfidf, df_features.to_numpy(dtype=np.float64)))
# Target vector: the dataset's 'label' column.
y = df['label']

# Training Model

In [9]:
# train a random forest method
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [12]:
# Evaluate Model: predict on the held-out split, then print the accuracy
# followed by the per-class precision/recall/F1 report.
y_pred = model.predict(X_test)
print(
    f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy: 79.72%
              precision    recall  f1-score   support

           0       0.75      0.75      0.75      7186
           1       0.83      0.83      0.83     10681

    accuracy                           0.80     17867
   macro avg       0.79      0.79      0.79     17867
weighted avg       0.80      0.80      0.80     17867

