In [None]:
# Install required libraries
!pip install pandas numpy tensorflow scikit-learn

# Import libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [None]:
# Colab-only step: call files.upload() and keep its result in `uploaded`.
# NOTE(review): a later cell opens "archive.zip" from the working directory,
# so the uploaded file presumably has to carry that exact name — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
import zipfile

# Extract every member of the uploaded archive into the working directory.
archive_name = "archive.zip"
with zipfile.ZipFile(archive_name, mode="r") as archive:
    archive.extractall(path=".")

print("‚úÖ Unzipped successfully!")


In [None]:
import os
# Show the entries of the working directory (no path argument is passed),
# e.g. to check which files the extraction step produced.
os.listdir()


In [None]:
import pandas as pd

# Read the tweets CSV into `df`, the frame all later cells operate on,
# then preview its first rows as the cell output.
dataset_file = "cyberbullying_tweets.csv"
df = pd.read_csv(dataset_file)
print("‚úÖ File loaded successfully!")
df.head()


In [None]:
import re

# --- Pick the text column: the first candidate name present in df ---
possible_cols = ['tweet_text', 'text', 'content', 'comment', 'Tweet']
text_col = next((name for name in possible_cols if name in df.columns), None)

# Fail loudly when none of the candidate names exists in the frame.
if text_col is None:
    raise ValueError("‚ùå No text column found! Check df.columns and update the code with your column name.")

print(f"‚úÖ Using text column: {text_col}")

# --- Define a simple cleaning function ---
# Patterns are compiled once instead of on every call.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize one raw comment into lowercase, letters-only text.

    Steps, in order: lowercase, remove ``http...`` links, remove
    ``@mentions``, drop every character that is not ``a-z`` or whitespace,
    collapse whitespace runs into single spaces and strip the ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing and yield ``''``.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Without this guard a missing value is stringified and survives cleaning
    # as the literal word "none" / "nan". NaN is the only float that is not
    # equal to itself, which avoids needing an extra import for the check.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                      # lowercase
    text = _URL_RE.sub('', text)                  # remove links
    text = _MENTION_RE.sub('', text)              # remove mentions
    text = _NON_LETTER_RE.sub('', text)           # keep only letters
    return _WHITESPACE_RE.sub(' ', text).strip()  # remove extra spaces

# --- Build the cleaned column by running clean_text on every row ---
df['clean_text'] = df[text_col].map(clean_text)

# --- Preview the first cleaned rows ---
print("\n‚úÖ Cleaned text samples:")
display(df[['clean_text']].head())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Step 1: Pick the label column: the first candidate name present in df ---
possible_label_cols = ['cyberbullying_type', 'label', 'category', 'target']
label_col = next((name for name in possible_label_cols if name in df.columns), None)

# Fail loudly when none of the candidate names exists in the frame.
if label_col is None:
    raise ValueError("‚ùå No label column found! Check df.columns and update the code with your label column name.")

print(f"‚úÖ Using label column: {label_col}")

# --- Step 2: Encode text labels into numbers ---
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df[label_col])

# Show label mapping.
# The encoded id of a class is its position in le.classes_, so pairing the
# classes with range(...) yields the same mapping as le.transform(le.classes_)
# without a second pass. .tolist() converts the NumPy scalars to native Python
# values so the printed dict is not cluttered with np.str_/np.int64 wrappers.
label_map = dict(zip(le.classes_.tolist(), range(len(le.classes_))))
print("\nüéØ Label mapping:")
print(label_map)

# --- Step 3: Hold out 20% of the rows as a test set, stratified on the label ---
X, y = df['clean_text'], df['label_encoded']

# Fixed random_state keeps the split the same across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\n‚úÖ Data split complete!")
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# --- Step 1: Turn the text into fixed-length integer sequences ---
max_words = 10000   # vocabulary size handed to the Tokenizer
max_len = 100       # target length handed to pad_sequences

# The vocabulary is fitted on the training split only; the test split is
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

print("‚úÖ Text tokenization and padding complete!")

# --- Step 2: One-hot encode the integer labels for the softmax output ---
num_classes = len(le.classes_)
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_test)
)

print("‚úÖ Labels one-hot encoded!")

# --- Step 3: Build the LSTM model ---
# Architecture: token embedding -> single LSTM -> dense hidden layer with
# dropout -> softmax over the label classes.
#
# mask_zero=True: the sequences fed to this model are post-padded with zeros.
# Without a mask the LSTM keeps stepping through that padding, so its final
# state is computed after a long run of padding steps rather than right after
# the last real token. Masking makes the LSTM skip those steps.
# NOTE(review): relies on id 0 being used only for padding, which is how the
# Keras Tokenizer / pad_sequences defaults behave — confirm if either changes.
#
# The `input_length` argument was dropped: it is deprecated in Keras 3 and is
# not needed, because the input shape is inferred from the data at fit time.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, mask_zero=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# categorical_crossentropy matches the one-hot encoded label arrays.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("‚úÖ Model built successfully!")

# --- Step 4: Fit the model, holding out 20% of the training data for validation ---
fit_options = {
    'validation_split': 0.2,
    'epochs': 3,        # raising this to 5-10 may give better results
    'batch_size': 128,
    'verbose': 1,
}
history = model.fit(X_train_pad, y_train_cat, **fit_options)

# --- Step 5: Score the held-out test set ---
test_metrics = model.evaluate(X_test_pad, y_test_cat, verbose=0)
loss, acc = test_metrics
print(f"\nüéØ Test Accuracy: {acc:.2f}")


In [None]:
# --- Step 1: Define a function to predict a comment ---
def predict_comment(comment, maxlen=max_len):
    """Classify one comment with the trained model and print the result.

    The comment goes through the same ``clean_text`` normalization as the
    training data before tokenization. Feeding raw text instead would encode
    links, mentions, digits and apostrophes differently from anything the
    tokenizer was fitted on.

    Args:
        comment: Raw comment text.
        maxlen: Padded sequence length. Defaults to the ``max_len`` used when
            the training sequences were padded.

    Returns:
        None. The original comment, the predicted label and the confidence
        (as a percentage) are printed.
    """
    cleaned = clean_text(comment)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=maxlen, padding='post')
    pred = model.predict(pad)
    # Highest-probability class of the single row in the batch.
    class_idx = pred.argmax(axis=1)[0]
    label = le.inverse_transform([class_idx])[0]
    confidence = pred[0][class_idx] * 100
    print(f"üí¨ Comment: {comment}")
    print(f"‚ö° Predicted Label: {label} ({confidence:.2f}% confidence)\n")

# --- Step 2: Run the predictor over a handful of sample comments ---
sample_comments = [
    "You are such a loser!",
    "I love this song so much!",
    "Stop spreading hate online.",
    "You‚Äôre amazing and kind!",
]
for sample in sample_comments:
    predict_comment(sample)


In [None]:
import matplotlib.pyplot as plt

# --- One figure per tracked metric: training curve against validation curve ---
for metric_key, metric_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(6,4))
    plt.plot(history.history[metric_key], label=f'Train {metric_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    plt.title(f'Model {metric_name} Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# --- Step 1: Persist the trained network ---
model_path = "cyberbullying_lstm_model.h5"
model.save(model_path)
print("‚úÖ Model saved successfully as cyberbullying_lstm_model.h5")

# --- Step 2: Pickle the preprocessing objects needed at prediction time ---
import pickle

for pkl_name, artifact in (("tokenizer.pkl", tokenizer), ("label_encoder.pkl", le)):
    with open(pkl_name, "wb") as f:
        pickle.dump(artifact, f)

print("‚úÖ Tokenizer and label encoder saved successfully!")


In [None]:
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Restore the saved model and the pickled preprocessing objects ---
model = load_model("cyberbullying_lstm_model.h5")


def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # pickle.load can run arbitrary code from the file: only point this at
    # files you wrote yourself.
    with open(path, "rb") as f:
        return pickle.load(f)


tokenizer = _read_pickle("tokenizer.pkl")
le = _read_pickle("label_encoder.pkl")

print("‚úÖ Model, tokenizer, and label encoder loaded!")

# --- Predict again ---
def predict_comment(comment, maxlen=100):
    """Classify one comment with the reloaded model and print the result.

    The comment is normalized with the same steps as ``clean_text`` (the
    function used to prepare the training text) before it is tokenized.
    Feeding raw text instead would encode links, mentions, digits and
    apostrophes differently from anything the tokenizer was fitted on. The
    steps are inlined so this reload cell needs only the model, tokenizer
    and label encoder loaded alongside it.

    Args:
        comment: Raw comment text.
        maxlen: Padded sequence length. Must equal the length the model was
            trained with (100 in the training cell).

    Returns:
        None. The original comment, the predicted label and the confidence
        (as a percentage) are printed.
    """
    import re  # local import keeps this cell runnable on its own

    # Keep these steps in sync with clean_text.
    text = str(comment).lower()                 # lowercase
    text = re.sub(r'http\S+', '', text)         # remove links
    text = re.sub(r'@\w+', '', text)            # remove mentions
    text = re.sub(r'[^a-z\s]', '', text)        # keep only letters
    text = re.sub(r'\s+', ' ', text).strip()    # remove extra spaces

    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=maxlen, padding='post')
    pred = model.predict(pad)
    # Highest-probability class of the single row in the batch.
    class_idx = pred.argmax(axis=1)[0]
    label = le.inverse_transform([class_idx])[0]
    confidence = pred[0][class_idx] * 100
    print(f"üí¨ Comment: {comment}")
    print(f"‚ö° Predicted Label: {label} ({confidence:.2f}% confidence)\n")

predict_comment("You are so stupid!")  # test example
