<a href="https://colab.research.google.com/github/plaethos01/notebooks_codes/blob/main/cyberBullying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping
import re
import nltk
from nltk.corpus import stopwords

In [3]:
# Download stopwords (if not already downloaded)
nltk.download('stopwords')

# Load Hinglish stop words
stopwords_hinglish = set("stopwords.txt")  # Replace "your_stopwords" with your actual list of Hinglish stopwords

# Load the dataset
df = pd.read_csv("final_dataset_hinglish.csv")

# Load Hinglish stop words from file
with open("stopwords.txt", "r", encoding="utf-8") as file:
    stopwords_hinglish = set(file.read().splitlines())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Preprocess text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove Hinglish stop words
    text = ' '.join(word for word in text.split() if word not in stopwords_hinglish)

    return text

df['preprocessed_text'] = df['headline'].apply(preprocess_text)

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(
    df['preprocessed_text'], df['label'], test_size=0.2, random_state=42
)


In [5]:
# Tokenize text and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=50)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=50)

# Build CNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=50))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the CNN model
model.fit(X_train_seq, y_train, epochs=10, validation_data=(X_val_seq, y_val), callbacks=[early_stopping])

# Save the trained model
model.save('cyber_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Build VGG19-like model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=50))
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu', padding='same'))
model.add(Conv1D(128, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(256, 3, activation='relu', padding='same'))
model.add(Conv1D(256, 3, activation='relu', padding='same'))
model.add(Conv1D(256, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(512, 3, activation='relu', padding='same'))
model.add(Conv1D(512, 3, activation='relu', padding='same'))
model.add(Conv1D(512, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the CNN model
model.fit(X_train_seq, y_train, epochs=10, validation_data=(X_val_seq, y_val), callbacks=[early_stopping])

# Save the trained model
model.save('cyber_model_vgg19.h5')


Epoch 1/10
Epoch 2/10
Epoch 3/10


In [7]:

# Extract features from the CNN model
cnn_features_train = model.predict(X_train_seq)
cnn_features_val = model.predict(X_val_seq)

# Drop rows with missing labels in y_val
valid_indices = ~pd.isnull(y_val)
y_val_cleaned = y_val[valid_indices]
cnn_features_val_cleaned = cnn_features_val[valid_indices]




In [11]:
import numpy as np

# Check and handle NaN values in cnn_features_train
nan_indices_train = np.isnan(cnn_features_train)
if np.any(nan_indices_train):
    cnn_features_train[nan_indices_train] = 0

# Check and handle NaN values in cnn_features_val_cleaned
nan_indices_val = np.isnan(cnn_features_val_cleaned)
if np.any(nan_indices_val):
    cnn_features_val_cleaned[nan_indices_val] = 0

# Train an SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(cnn_features_train, y_train)

# Save the SVM model
joblib.dump(svm_model, 'svm_model.pkl')

# Evaluate the model
y_pred = svm_model.predict(cnn_features_val_cleaned)
print("Accuracy:", accuracy_score(y_val_cleaned, y_pred))
print("Classification Report:\n", classification_report(y_val_cleaned, y_pred))


Accuracy: 0.6325068870523416
Classification Report:
               precision    recall  f1-score   support

          -1       0.63      1.00      0.77      2296
           0       0.00      0.00      0.00      1334

    accuracy                           0.63      3630
   macro avg       0.32      0.50      0.39      3630
weighted avg       0.40      0.63      0.49      3630



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
