In [1]:
pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Namita Sathish\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# Load dataset
df = pd.read_csv('datasets/clean_master_dataset.csv')

In [4]:
# Handle missing values in 'comment_text'
df['comment_text'] = df['comment_text'].fillna('')  # Replace NaN with empty string

# Features and labels
X = df['comment_text']
y = df['intent']


In [5]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(X)


In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)


In [9]:
import os
import joblib

os.makedirs('models', exist_ok=True)
os.makedirs('vectorizers', exist_ok=True)

joblib.dump(model, 'models/cyberbullying_model.pkl')
joblib.dump(vectorizer, 'vectorizers/cyberbullying_vectorizer.pkl')


['vectorizers/cyberbullying_vectorizer.pkl']

In [10]:
# Evaluate model
y_pred = model.predict(X_test)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

=== Classification Report ===
                     precision    recall  f1-score   support

                age       0.92      0.96      0.94      1598
          ethnicity       0.73      0.93      0.82      1592
             gender       0.58      0.81      0.68      1595
      identity_hate       0.00      0.00      0.00        14
             insult       0.01      0.04      0.01        79
  not_cyberbullying       0.97      0.84      0.90     42638
            obscene       0.02      0.16      0.04       118
other_cyberbullying       0.25      0.62      0.36      1565
           religion       0.79      0.94      0.86      1600
       severe_toxic       0.16      0.53      0.25       678
             threat       0.00      0.00      0.00         6
              toxic       0.67      0.61      0.64      7722

           accuracy                           0.80     59205
          macro avg       0.42      0.54      0.46     59205
       weighted avg       0.87      0.80      0.83   

In [None]:
# --- PREDICTION INTERFACE ---
def predict_cyberbullying(comment_text):
    comment_vector = vectorizer.transform([comment_text])
    prediction = model.predict(comment_vector)[0]
    return prediction

# Interactive prediction
while True:
    user_input = input("\nEnter a comment to classify (or type 'exit' to quit):\n> ")
    if user_input.lower() == 'exit':
        break
    label = predict_cyberbullying(user_input)
    print(f"Predicted Label: {label}")

Predicted Label: gender


: 