In [None]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [None]:
# Read the labelled corpus; later cells use its 'Content' and 'Label' columns.
dataset_path = "../data/hate-speech-dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
def preprocess_text(text):
    """Normalize a raw text value before vectorization.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced with a space, runs of whitespace are collapsed
    into a single space, and the result is lower-cased. Leading and trailing
    spaces are not stripped.

    Missing values (``None`` or a float NaN, which is what pandas yields for
    an empty CSV cell) produce an empty string, and any other non-string
    value is converted with ``str()`` first, instead of raising ``TypeError``
    inside ``re.sub``.

    Args:
        text: The raw value to clean; normally a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.lower()

# Clean every entry of the text column element-wise, overwriting it in place.
df['Content'] = df['Content'].map(preprocess_text)

In [None]:
# Features are the cleaned text; targets are the label column.
X, y = df['Content'], df['Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Upper bound on the size of the TF-IDF vocabulary.
MAX_TFIDF_FEATURES = 5000

vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# features; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train_vec, y_train)
model

In [None]:
# Score the held-out split with the fitted classifier.
y_pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
# No space after the newline: both values render as multi-line, pre-aligned
# text, and a leading space would shift only their first row out of line.
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

In [None]:
# Persist the classifier together with the vectorizer it was trained with;
# both are needed to score new text.
model_path = '../backend/models/hate_speech_model.pkl'
vectorizer_path = '../backend/models/tfidf_vectorizer.pkl'

joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)