In [1]:
import re

import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Fetch the NLTK resources this notebook relies on. The result of the last
# download stays the cell's final expression so it is still echoed.
nltk_resources = ('stopwords', 'wordnet', 'omw-1.4')
download_results = [nltk.download(resource) for resource in nltk_resources]
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# Read the labelled tweets, then pull out the text column and the target column.
df = pd.read_csv('labeled_data.csv')
tweets, labels = df['tweet'], df['class']

In [4]:
# Lazily filled cache so the stop-word corpus is read and the lemmatizer is
# constructed once per kernel session instead of once per tweet.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. drop tokens that start with ``http``, ``https`` or ``www``;
      2. drop ``@mentions``;
      3. drop every character that is not an ASCII letter or whitespace;
      4. lowercase;
      5. split on whitespace and discard English stop words;
      6. lemmatize each remaining token with ``WordNetLemmatizer``.

    Note that step 3 runs before step 5, so tokens are compared against the
    stop-word list in their punctuation-free form.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    stop_words, lemmatizer = _preprocess_resources()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [5]:
# Run the text normaliser over every tweet and keep the result alongside the raw data.
df['cleaned_tweet'] = tweets.map(preprocess_text)

In [6]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_tweet'])
y = labels

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Oversample with SMOTE on the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

In [15]:
# One-vs-rest logistic regression. The `multi_class='ovr'` argument of
# LogisticRegression is deprecated in recent scikit-learn releases; wrapping
# the binary estimator in OneVsRestClassifier keeps the same scheme and still
# exposes predict / predict_proba / classes_.
model = OneVsRestClassifier(
    LogisticRegression(solver='liblinear', random_state=42)
)
model.fit(X_train_smote, y_train_smote)



In [16]:
# Hard class predictions for the held-out split.
y_pred = model.predict(X=X_test)

In [17]:
# Per-class probabilities are needed for the ROC AUC score.
y_pred_proba = model.predict_proba(X_test)

# Headline metrics on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba,
    multi_class='ovo',
    average='weighted',
)

In [18]:
# Report the three metrics, one per line, to four decimal places.
metric_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"F1-score (weighted): {f1:.4f}",
    f"ROC AUC (weighted, ovo): {roc_auc:.4f}",
]
print("\n".join(metric_lines))

Accuracy: 0.8433
F1-score (weighted): 0.8596
ROC AUC (weighted, ovo): 0.9176


In [13]:
def test_input(text):
    """Classify a single piece of text and print the prediction.

    The text is cleaned with ``preprocess_text``, vectorised with the fitted
    ``tfidf`` vectorizer and scored with the fitted ``model``. The predicted
    class and the probability of every class are printed.

    Parameters
    ----------
    text : str
        Raw input text to classify.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    class_labels = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}

    cleaned_text = preprocess_text(text)
    vectorized_text = tfidf.transform([cleaned_text])
    predicted_class = model.predict(vectorized_text)[0]
    predicted_probabilities = model.predict_proba(vectorized_text)[0]

    print(f"Input: {text}")
    # Fall back to the raw class id if it has no human-readable name.
    print(f"Predicted Class: {class_labels.get(predicted_class, predicted_class)}")
    print("Class Probabilities:")
    # predict_proba columns follow model.classes_, so pair each probability
    # with the class id the model reports rather than with dict order.
    for cls, prob in zip(model.classes_, predicted_probabilities):
        print(f"  {class_labels.get(cls, cls)}: {prob:.4f}")

In [32]:
# Try the classifier on a hand-written sentence.
example_text = "Stop, you are not ugly but so pretty"
test_input(text=example_text)

Input: Stop, you are not ugly but so pretty
Predicted Class: Hate Speech
Class Probabilities:
  Hate Speech: 0.5013
  Offensive Language: 0.0740
  Neither: 0.4248
