# Hate Speech Detection using Traditional Machine Learning Approaches

This notebook implements classical ML models: Logistic Regression, Random Forest, and LightGBM.

In [ ]:
# Configuration and Constants
# Seed for the dataset shuffle and for both train/validation/test splits.
RANDOM_SEED = 1234
# Fraction of the full dataset held out as the test set.
TEST_SIZE = 0.2
# Fraction of the remaining (non-test) data held out for validation;
# 0.25 of the remaining 80% equals 20% of the full dataset.
VAL_SIZE = 0.25
# Location of the labelled CSV under the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/hatespeech/hatexplain_detailed.csv'
# Iteration cap passed to LogisticRegression as max_iter.
MAX_ITER = 1000
# NOTE(review): not referenced anywhere in the visible code; the bootstrap
# helpers hard-code n_iterations=1000 as their own default.
N_BOOTSTRAP_ITERATIONS = 1000


In [ ]:
# Standard library imports
import re
import string
import random
import time
import warnings
warnings.filterwarnings("ignore")

# Third-party imports - Data manipulation
import numpy as np
import pandas as pd

# Third-party imports - Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Third-party imports - NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer

# Third-party imports - Machine Learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, accuracy_score, classification_report, 
    confusion_matrix, roc_curve, auc
)
from sklearn.utils import shuffle
from sklearn.utils import resample

# Third-party imports - Other ML libraries
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from scipy.sparse import hstack
from wordcloud import WordCloud

# Download NLTK data
# 'stopwords' and 'wordnet' are the corpora behind the stop-word set and the
# WordNetLemmatizer created in TextPreprocessor.__init__.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Google Colab setup
# Mount Google Drive so that DATA_PATH (under /content/drive/) is readable.
# NOTE(review): the google.colab import presumably only resolves inside a
# Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive/', force_remount=False)


In [ ]:
# Data Loading and Initial Processing
def load_and_prepare_data(file_path, random_seed=RANDOM_SEED):
    """Load the labelled CSV and return a shuffled, binary-labelled dataset.

    Args:
        file_path: Path of the CSV file; its first column is read as the index.
        random_seed: Seed for the row shuffle, so the row order is reproducible.

    Returns:
        A DataFrame with the columns ``text`` and ``text_type``. ``text_type``
        is 0 for rows labelled ``'normal'`` and 1 for every other label. Rows
        are shuffled and the index is reset to 0..n-1.
    """
    raw_data = pd.read_csv(file_path, index_col=0)

    # reset_index() turns the index column back into a regular column before
    # the two needed columns are selected. The explicit copy makes the
    # assignment below operate on an independent frame rather than a slice.
    processed_data = raw_data.reset_index()[['text', 'text_type']].copy()

    # Binarize the labels: 'normal' -> 0, anything else -> 1.
    processed_data['text_type'] = (
        processed_data['text_type'].ne('normal').astype(int)
    )

    # frac=1 samples every row once, i.e. a full shuffle.
    return processed_data.sample(
        frac=1, random_state=random_seed
    ).reset_index(drop=True)

# Execute data loading
# dataframe_main holds the shuffled (text, text_type) frame with binary labels.
dataframe_main = load_and_prepare_data(DATA_PATH)

# Display basic info
print(f"Dataset shape: {dataframe_main.shape}")
print(f"\nDataset info:")
# describe() output is transposed so each described column becomes one row.
print(dataframe_main.describe().T)
print(f"\nFirst few rows:")
print(dataframe_main.head())


In [ ]:
# Text Preprocessing Pipeline
class TextPreprocessor:
    """Text normalisation pipeline: cleaning, stop-word removal, lemmatization."""

    def __init__(self):
        # English stop words are kept in a set for fast membership tests.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Lower-case *text* and normalise URLs, mentions, hashtags and punctuation.

        URLs, @mentions and #hashtags are replaced by the placeholders
        ``<URL>``, ``<USER>`` and ``<HASHTAG>``. The punctuation pass that
        follows also strips the angle brackets, so the returned text contains
        the bare upper-case tokens ``URL``, ``USER`` and ``HASHTAG``.
        Whitespace runs are collapsed to one space and the ends are trimmed.
        """
        normalised = str(text).lower()
        # Order matters: placeholders are inserted before punctuation removal.
        substitutions = (
            (r"http\S+", "<URL>"),
            (r"@\w+", "<USER>"),
            (r"#\w+", "<HASHTAG>"),
            (r"[^\w\s]", ""),
            (r"\s+", " "),
        )
        for pattern, replacement in substitutions:
            normalised = re.sub(pattern, replacement, normalised)
        return normalised.strip()

    def remove_stopwords(self, text):
        """Return *text* with every whitespace-separated stop word dropped."""
        kept_tokens = [
            token for token in text.split() if token not in self.stop_words
        ]
        return ' '.join(kept_tokens)

    def lemmatize(self, text):
        """Return *text* with each whitespace-separated token lemmatized."""
        return ' '.join(
            self.lemmatizer.lemmatize(token) for token in text.split()
        )

    def preprocess(self, texts):
        """Run clean -> stop-word removal -> lemmatize on every item of *texts*.

        Returns a list with one processed string per input item.
        """
        return [
            self.lemmatize(self.remove_stopwords(self.clean_text(raw_text)))
            for raw_text in texts
        ]

# Initialize preprocessor and process data
preprocessor = TextPreprocessor()
# Work on a copy so dataframe_main keeps the raw, unprocessed text.
dataframe_classical_ml = dataframe_main.copy()
# Replace the text column with its cleaned, stop-word-free, lemmatized form.
dataframe_classical_ml['text'] = preprocessor.preprocess(dataframe_classical_ml['text'])


In [ ]:
# Data Splitting Function
def split_data(features, labels, test_size=TEST_SIZE, val_size=VAL_SIZE, random_seed=RANDOM_SEED):
    """Partition the data into train, validation and test subsets.

    The test set is carved off first; the validation set is then taken from
    what remains, so ``val_size`` is a fraction of the non-test data rather
    than of the whole dataset. Both splits use the same ``random_seed`` and
    neither passes a ``stratify`` argument.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Stage 1: hold out the test portion.
    remaining_X, X_test, remaining_y, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_seed,
    )

    # Stage 2: divide the remainder into training and validation portions.
    X_train, X_val, y_train, y_val = train_test_split(
        remaining_X,
        remaining_y,
        test_size=val_size,
        random_state=random_seed,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the data
# Plain Python lists of preprocessed texts and their 0/1 labels.
features = dataframe_classical_ml['text'].tolist()
labels = dataframe_classical_ml['text_type'].tolist()

# split_data returns (X_train, X_val, X_test, y_train, y_val, y_test).
train_features_classical, val_features_classical, test_features_classical, \
train_labels_classical, val_labels_classical, test_labels_classical = split_data(
    features, labels
)

print(f"Training samples: {len(train_features_classical)}")
print(f"Validation samples: {len(val_features_classical)}")
print(f"Test samples: {len(test_features_classical)}")


In [ ]:
# TF-IDF Vectorization
import joblib

# Load pre-trained vectorizer
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load vectorizer files that come from a trusted source.
try:
    tfidf_vectorizer = joblib.load("tfidf_vectorizer_2_binary.pkl")
    print("Loaded pre-trained TF-IDF vectorizer")
except FileNotFoundError:
    # Without a vectorizer the splits would stay as raw strings and every
    # later fit/evaluation cell would fail, so fit a replacement instead.
    # It is fitted on the training split only, so no validation or test
    # vocabulary leaks into the features.
    # NOTE(review): default TfidfVectorizer settings may differ from those of
    # the pickled vectorizer - confirm before comparing results.
    print("Vectorizer file not found; fitting a new TF-IDF vectorizer on the training split")
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(train_features_classical)

# Transform all splits with the same fitted vocabulary and IDF weights.
train_features_classical = tfidf_vectorizer.transform(train_features_classical)
val_features_classical = tfidf_vectorizer.transform(val_features_classical)
test_features_classical = tfidf_vectorizer.transform(test_features_classical)
print("TF-IDF transformation complete")


In [ ]:
# Class Distribution Visualization
# Number of samples per binary label, most frequent label first.
status_counts = dataframe_classical_ml['text_type'].value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
status_counts.plot(kind='bar', ax=ax)

# Write each count just above its bar; bars sit at x = 0, 1, ...
for idx, v in enumerate(status_counts.to_numpy()):
    ax.text(idx, v, str(v), ha='center', va='bottom')

ax.set(
    title='Label Distribution by text_type',
    xlabel='text_type',
    ylabel='Count',
)
plt.tight_layout()
plt.show()

print("\nClass distribution shows imbalance - will use class weights during training")


# Part 3: Model Development and Performance Analysis

## 3.1 Traditional Machine Learning Algorithms

In [ ]:
# Model Training Utilities
class ModelTrainer:
    """Helper class for training and evaluating models."""

    @staticmethod
    def train_and_evaluate(model, X_train, y_train, X_val, y_val):
        """Fit *model* on the training data and score it on the validation data.

        Returns:
            ``(f1, model)`` where ``f1`` is the weighted F1 score of the
            validation predictions and ``model`` is the fitted estimator.
        """
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        f1 = f1_score(y_val, predictions, average='weighted')
        return f1, model

    @staticmethod
    def grid_search(param_grid, model_class, X_train, y_train, X_val, y_val, **model_kwargs):
        """Exhaustively evaluate every parameter combination in *param_grid*.

        Args:
            param_grid: Mapping of parameter name to an iterable of candidate
                values. Combinations are visited in key order, with the last
                key varying fastest.
            model_class: Callable that builds an estimator from keyword
                arguments.
            X_train, y_train: Data each candidate is fitted on.
            X_val, y_val: Data each candidate is scored on (weighted F1).
            **model_kwargs: Fixed keyword arguments added to every candidate.

        Returns:
            ``(best_model, best_params, best_score)``. When no combination
            can be built and trained, this is ``(None, None, 0)``.
        """
        # Function-scope import keeps this notebook cell self-contained.
        from itertools import product

        best_score = 0
        best_model = None
        best_params = None

        param_names = list(param_grid)
        candidate_values = (param_grid[name] for name in param_names)

        for combination in product(*candidate_values):
            current_params = dict(zip(param_names, combination))
            param_str = ', '.join(f"{k}={v}" for k, v in current_params.items())

            # Best-effort search: an invalid combination (for example an
            # unsupported solver/penalty pairing) is reported and skipped
            # rather than aborting the whole grid.
            try:
                model = model_class(**current_params, **model_kwargs)
                score, trained_model = ModelTrainer.train_and_evaluate(
                    model, X_train, y_train, X_val, y_val
                )
            except Exception as e:
                print(f"Skipping {param_str} due to error: {e}")
                continue

            print(f"Validation F1-Score for {param_str}: {score:.4f}")

            # The first successful candidate is always recorded, so a model
            # scoring exactly 0 is still returned instead of None.
            if best_model is None or score > best_score:
                best_score = score
                best_model = trained_model
                best_params = current_params

        return best_model, best_params, best_score

trainer = ModelTrainer()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    roc_curve,
    auc
)
from sklearn.utils import resample


def calculate_bootstrap_f1_confidence_interval(y_true, y_pred, n_iterations=1000, average='weighted'):
    """Estimate the F1 score and its 95% percentile-bootstrap interval.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        n_iterations: Number of bootstrap resamples to draw.
        average: Averaging mode forwarded to ``f1_score``.

    Returns:
        ``(f1_mean, ci_lower, ci_upper)``: the mean of the bootstrap F1 scores
        and their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If no resample contained at least two classes, so no
            score could be computed.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    sample_positions = np.arange(len(y_true))
    f1_scores = []

    # A bounded loop guarantees termination: a resample that is skipped still
    # uses up one of the n_iterations draws.
    for _ in range(n_iterations):
        indices = resample(sample_positions)
        # Skip resamples that contain a single class only.
        if len(np.unique(y_true[indices])) < 2:
            continue
        f1_scores.append(
            f1_score(y_true[indices], y_pred[indices], average=average)
        )

    if not f1_scores:
        raise ValueError(
            "No bootstrap resample contained at least two classes; "
            "cannot compute an F1 confidence interval."
        )

    f1_mean = np.mean(f1_scores)
    ci_lower = np.percentile(f1_scores, 2.5)
    ci_upper = np.percentile(f1_scores, 97.5)
    return f1_mean, ci_lower, ci_upper


def calculate_bootstrap_auc_confidence_interval(y_true, y_scores, n_iterations=1000):
    """Estimate the ROC AUC and its 95% percentile-bootstrap interval.

    Args:
        y_true: True binary labels.
        y_scores: Scores aligned with ``y_true`` (probabilities or decision
            values), forwarded to ``roc_curve``.
        n_iterations: Number of bootstrap resamples to draw.

    Returns:
        ``(auc_mean, ci_lower, ci_upper)``: the mean of the bootstrap AUC
        values and their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If no resample contained at least two classes, so no
            AUC could be computed.
    """
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    sample_positions = np.arange(len(y_true))
    auc_scores = []

    # A bounded loop guarantees termination: a resample that is skipped still
    # uses up one of the n_iterations draws.
    for _ in range(n_iterations):
        indices = resample(sample_positions)
        # Skip resamples that contain a single class only.
        if len(np.unique(y_true[indices])) < 2:
            continue
        fpr, tpr, _ = roc_curve(y_true[indices], y_scores[indices])
        auc_scores.append(auc(fpr, tpr))

    if not auc_scores:
        raise ValueError(
            "No bootstrap resample contained at least two classes; "
            "cannot compute an AUC confidence interval."
        )

    ci_lower = np.percentile(auc_scores, 2.5)
    ci_upper = np.percentile(auc_scores, 97.5)
    return np.mean(auc_scores), ci_lower, ci_upper


def visualize_confusion_matrix(y_true, y_pred, labels=None):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Tick labels for both axes, in matrix order. Defaults to
            ``['No Issue', 'Issue']``.
    """
    # A None sentinel replaces the former mutable list default, so the default
    # labels can never be shared or altered between calls.
    if labels is None:
        labels = ['No Issue', 'Issue']

    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 6))
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


def visualize_roc_curve(y_true, y_scores, label_prefix="Model"):
    """Plot the ROC curve for ``y_scores`` and return the area under it.

    Args:
        y_true: True binary labels.
        y_scores: Scores aligned with ``y_true``.
        label_prefix: Model name used in the legend entry and the title.

    Returns:
        The AUC computed from the plotted curve.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    curve_label = f'{label_prefix} AUC = {roc_auc:.2f}'

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Dashed diagonal drawn as a visual reference line.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {label_prefix}')
    plt.legend(loc='lower right')
    plt.show()

    return roc_auc


def evaluate_classical_ml_model(model, X_test, y_test, model_name="Model", use_proba=False):
    """Print and plot a full evaluation report for a fitted binary classifier.

    The report contains the classification report, the bootstrap F1 mean with
    its 95% interval, a confusion-matrix heatmap, the bootstrap AUC mean with
    its 95% interval, and the ROC curve.

    Args:
        model: Fitted estimator with ``predict`` and either ``predict_proba``
            or ``decision_function``.
        X_test: Test features.
        y_test: Test labels.
        model_name: Name shown in the report header and on the ROC plot.
        use_proba: When true, scores are column 1 of ``predict_proba``;
            otherwise they come from ``decision_function``.
    """
    print(f"\n--- Evaluation Report: {model_name} ---\n")

    predicted_labels = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))

    # The value printed as the F1 score is the mean returned by the bootstrap helper.
    f1_stats = calculate_bootstrap_f1_confidence_interval(y_test, predicted_labels)
    f1_mean, f1_ci_low, f1_ci_high = f1_stats
    print(f"Weighted F1 Score: {f1_mean:.4f}")
    print(f"95% CI for F1 Score: [{f1_ci_low:.4f}, {f1_ci_high:.4f}]")

    visualize_confusion_matrix(y_test, predicted_labels)

    # ROC + AUC: choose the score source the caller asked for.
    y_scores = (
        model.predict_proba(X_test)[:, 1]
        if use_proba
        else model.decision_function(X_test)
    )

    auc_stats = calculate_bootstrap_auc_confidence_interval(y_test, y_scores)
    auc_score, auc_ci_low, auc_ci_high = auc_stats
    print(f"AUC Score: {auc_score:.4f}")
    print(f"95% CI for AUC: [{auc_ci_low:.4f}, {auc_ci_high:.4f}]")

    visualize_roc_curve(y_test, y_scores, label_prefix=model_name)


# --- Usage Example ---
# For a model that exposes decision_function (e.g. an SVM):
# evaluate_classical_ml_model(SVM, X_test_1, y_test_1, model_name="SVM", use_proba=False)

# For models like RandomForest or LogisticRegression with predict_proba:
# evaluate_classical_ml_model(rf_model, X_test, y_test, model_name="Random Forest", use_proba=True)


#### 3.1.1 Linear Classification: Logistic Regression Model

In [ ]:
# Logistic Regression Hyperparameter Tuning
# NOTE(review): this seeds only Python's built-in `random` module; the
# estimators below receive their own fixed random_state=42.
random.seed(RANDOM_SEED)

# 5 x 2 x 1 x 2 = 20 parameter combinations.
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2'],
    'class_weight': ['balanced', None]
}

# Time the whole search. Each combination is fitted on the training split and
# scored by weighted F1 on the validation split (see ModelTrainer.grid_search).
start_time = time.time()
best_model_lr, best_params_lr, best_f1_score = trainer.grid_search(
    lr_param_grid,
    LogisticRegression,
    train_features_classical,
    train_labels_classical,
    val_features_classical,
    val_labels_classical,
    random_state=42,
    max_iter=MAX_ITER
)
end_time = time.time()

print(f"\nBest Hyperparameters: {best_params_lr}")
print(f"Best Validation F1-Score: {best_f1_score:.4f}")
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


In [ ]:
# Train final Logistic Regression model
# NOTE(review): the hyperparameters are hard-coded rather than read from
# best_params_lr - confirm they match the output of the tuning cell.
logistic_regression_model = LogisticRegression(
    C=10, solver='lbfgs', penalty='l2', 
    class_weight=None, random_state=42, max_iter=MAX_ITER
)
logistic_regression_model.fit(train_features_classical, train_labels_classical)

# Evaluate
# Reported on the held-out test split; use_proba=True takes the scores from
# predict_proba.
evaluate_classical_ml_model(
    logistic_regression_model, 
    test_features_classical, 
    test_labels_classical, 
    model_name="Logistic Regression", 
    use_proba=True
)


In [ ]:
# Logistic Regression Feature Importance
# Pair every TF-IDF vocabulary term with the first row of the fitted model's
# coefficient matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()
coefficients = logistic_regression_model.coef_[0]

importance_dataframe = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Rank by coefficient magnitude, then rescale so the largest magnitude is 100.
importance_dataframe['Importance (Abs)'] = importance_dataframe['Coefficient'].abs()
importance_dataframe['Scaled Importance'] = (
    importance_dataframe['Importance (Abs)'] / importance_dataframe['Importance (Abs)'].max()
) * 100

top_20_features = importance_dataframe.nlargest(20, 'Importance (Abs)')

plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
# The rows are sorted by scaled importance before being handed to the bar plot.
sns.barplot(
    x='Scaled Importance',
    y='Feature',
    data=top_20_features.sort_values('Scaled Importance'),
    palette="crest"
)
plt.title("Top 20 Important Features (Logistic Regression)", fontsize=14, fontweight='bold')
plt.xlabel("Relative Importance (%)", fontsize=12)
plt.ylabel("TF-IDF Feature", fontsize=12)
plt.tight_layout()
# The figure is written to disk before being shown.
plt.savefig("top_features_lr.png", dpi=300)
plt.show()

# The printed table keeps the signed coefficient next to the scaled magnitude.
print(top_20_features[['Feature', 'Coefficient', 'Scaled Importance']])


#### 3.1.2 Ensemble Method: Random Forest Classifier

In [ ]:
# Random Forest Hyperparameter Tuning
# NOTE(review): this seeds only Python's built-in `random` module; the
# estimators below receive their own fixed random_state=42.
random.seed(RANDOM_SEED)

# 3 x 3 x 3 x 3 x 2 = 162 parameter combinations.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

# Time the whole search. Each combination is fitted on the training split and
# scored by weighted F1 on the validation split (see ModelTrainer.grid_search).
start_time = time.time()
best_model_rf, best_params_rf, best_f1_score_rf = trainer.grid_search(
    rf_param_grid,
    RandomForestClassifier,
    train_features_classical,
    train_labels_classical,
    val_features_classical,
    val_labels_classical,
    random_state=42,
    n_jobs=-1
)
end_time = time.time()

print(f"\nBest Hyperparameters: {best_params_rf}")
print(f"Best Validation F1-Score: {best_f1_score_rf:.4f}")
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


#### 3.1.2 Ensemble Method: Random Forest Classifier

In [ ]:
# Random Forest Hyperparameter Tuning
# NOTE(review): this cell repeats the preceding Random Forest tuning cell
# verbatim. Running both repeats the full 162-combination search and
# overwrites the same best_model_rf / best_params_rf / best_f1_score_rf
# variables - consider removing one copy.
random.seed(RANDOM_SEED)

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

start_time = time.time()
best_model_rf, best_params_rf, best_f1_score_rf = trainer.grid_search(
    rf_param_grid,
    RandomForestClassifier,
    train_features_classical,
    train_labels_classical,
    val_features_classical,
    val_labels_classical,
    random_state=42,
    n_jobs=-1
)
end_time = time.time()

print(f"\nBest Hyperparameters: {best_params_rf}")
print(f"Best Validation F1-Score: {best_f1_score_rf:.4f}")
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


In [ ]:
# Train final Random Forest model
# NOTE(review): the hyperparameters are hard-coded rather than read from
# best_params_rf - confirm they match the output of the tuning cell.
random_forest_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
random_forest_model.fit(train_features_classical, train_labels_classical)

# Evaluate
# Reported on the held-out test split; use_proba=True takes the scores from
# predict_proba.
evaluate_classical_ml_model(
    random_forest_model,
    test_features_classical,
    test_labels_classical,
    model_name="Random Forest",
    use_proba=True
)


In [ ]:
# Random Forest Feature Importance
# NOTE(review): the final-model cell always assigns a fitted estimator, so
# this guard only matters if that variable is set to None elsewhere.
if random_forest_model is not None:
    feature_names = tfidf_vectorizer.get_feature_names_out()
    feature_importance = random_forest_model.feature_importances_
    # Rescale so the most important feature is 100.
    feature_importance = (feature_importance / feature_importance.max()) * 100
    
    # (importance, name) pairs, highest importance first.
    important_features = sorted(
        zip(feature_importance, feature_names),
        key=lambda x: x[0],
        reverse=True
    )
    
    top_n = 20
    top_features = important_features[:top_n]
    importances, names = zip(*top_features)
    
    # Reverse the top-20 so values are passed to the plot in ascending order.
    importances = importances[::-1]
    names = names[::-1]
    
    plt.figure(figsize=(8, 6))
    sns.set_style("whitegrid")
    sns.barplot(x=list(importances), y=list(names), palette="viridis")
    
    plt.title("Top 20 Important Features (Random Forest)", fontsize=14, fontweight='bold')
    plt.xlabel("Relative Importance (%)", fontsize=12)
    plt.ylabel("TF-IDF Feature", fontsize=12)
    plt.tight_layout()
    # The figure is written to disk before being shown.
    plt.savefig("top_features_rf.png", dpi=300)
    plt.show()


#### 3.1.3 Gradient Boosting: LightGBM Classifier

In [ ]:
# LightGBM Hyperparameter Tuning
# NOTE(review): this seeds only Python's built-in `random` module; the
# estimators below receive their own fixed random_state=42.
random.seed(RANDOM_SEED)

# 2 values for each of 6 parameters = 64 combinations.
lgbm_param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [-1, 10],
    'num_leaves': [31, 50],
    'min_child_samples': [10, 20],
    'class_weight': [None, 'balanced']
}

# Time the whole search. Each combination is fitted on the training split and
# scored by weighted F1 on the validation split (see ModelTrainer.grid_search).
start_time = time.time()
best_model_lgbm, best_params_lgbm, best_f1_score_lgbm = trainer.grid_search(
    lgbm_param_grid,
    LGBMClassifier,
    train_features_classical,
    train_labels_classical,
    val_features_classical,
    val_labels_classical,
    random_state=42,
    n_jobs=-1,
    importance_type="gain"
)
end_time = time.time()

print(f"\nBest Hyperparameters: {best_params_lgbm}")
print(f"Best Validation F1-Score: {best_f1_score_lgbm:.4f}")
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


In [ ]:
# Train final LightGBM model
# NOTE(review): the hyperparameters are hard-coded rather than read from
# best_params_lgbm - confirm they match the output of the tuning cell.
lightgbm_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=50,
    min_child_samples=10,
    class_weight=None,
    random_state=42,
    n_jobs=-1,
    importance_type="gain"
)
lightgbm_model.fit(train_features_classical, train_labels_classical)

# Evaluate
# Reported on the held-out test split; use_proba=True takes the scores from
# predict_proba.
evaluate_classical_ml_model(
    lightgbm_model,
    test_features_classical,
    test_labels_classical,
    model_name="LightGBM",
    use_proba=True
)


In [ ]:
# LightGBM Feature Importance
# NOTE(review): the final-model cell always assigns a fitted estimator, so
# this guard only matters if that variable is set to None elsewhere.
if lightgbm_model is not None:
    feature_names = tfidf_vectorizer.get_feature_names_out()
    # The model was created with importance_type="gain".
    feature_importance = lightgbm_model.feature_importances_
    # Rescale so the most important feature is 100.
    feature_importance = (feature_importance / feature_importance.max()) * 100
    
    # (importance, name) pairs, highest importance first.
    important_features = sorted(
        zip(feature_importance, feature_names),
        key=lambda x: x[0],
        reverse=True
    )
    
    top_n = 20
    top_features = important_features[:top_n]
    importances, names = zip(*top_features)
    
    # Reverse the top-20 so values are passed to the plot in ascending order.
    importances = importances[::-1]
    names = names[::-1]
    
    plt.figure(figsize=(8, 6))
    sns.set_style("whitegrid")
    sns.barplot(x=list(importances), y=list(names), palette="viridis")
    
    plt.title("Top 20 Important Features (LightGBM)", fontsize=14, fontweight='bold')
    plt.xlabel("Relative Importance (%)", fontsize=12)
    plt.ylabel("TF-IDF Feature", fontsize=12)
    plt.tight_layout()
    # The figure is written to disk before being shown.
    plt.savefig("top_features_lgbm.png", dpi=300)
    plt.show()


In [ ]:
# Model Performance Summary
# Every lookup yields None when the matching tuning cell has not been run, so
# this cell works with any subset of the models tuned.
models_performance = {
    'Logistic Regression': {
        'best_params': globals().get('best_params_lr'),
        'best_f1': globals().get('best_f1_score'),
    },
    'Random Forest': {
        'best_params': globals().get('best_params_rf'),
        'best_f1': globals().get('best_f1_score_rf'),
    },
    'LightGBM': {
        'best_params': globals().get('best_params_lgbm'),
        'best_f1': globals().get('best_f1_score_lgbm'),
    },
}

# Build one comparison row per model that has a recorded validation score.
comparison_data = []
for model_name, perf in models_performance.items():
    if perf['best_f1'] is None:
        continue
    comparison_data.append({
        'Model': model_name,
        'Best F1 Score': perf['best_f1'],
        'Best Parameters': str(perf['best_params']),
    })

if comparison_data:
    # Highest validation F1 first.
    comparison_df = pd.DataFrame(comparison_data).sort_values(
        'Best F1 Score', ascending=False
    )
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Horizontal bar chart of the same table.
    fig, ax = plt.subplots(figsize=(10, 6))
    comparison_df.plot(x='Model', y='Best F1 Score', kind='barh', ax=ax, legend=False)
    ax.set_xlabel('Best F1 Score')
    ax.set_title('Model Performance Comparison (Validation F1 Score)')
    plt.tight_layout()
    plt.show()
