In [1]:
import torch
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
lstm_model = torch.load('/content/drive/My Drive/model_lstm.pth', map_location=device)
cnn_model = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=device)
bert_model = torch.load('/content/drive/My Drive/model.pth', map_location=device)

  lstm_model = torch.load('/content/drive/My Drive/model_lstm.pth', map_location=device)
  cnn_model = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=device)
  bert_model = torch.load('/content/drive/My Drive/model.pth', map_location=device)


In [5]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_lstm_cnn = tf.keras.preprocessing.text.Tokenizer()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
data = pd.read_csv('cyberbullying_tweets.csv')
X = data['tweet_text']
y = data['cyberbullying_type']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [7]:
tokenizer_lstm_cnn.fit_on_texts(X)
max_length = max([len(seq) for seq in tokenizer_lstm_cnn.texts_to_sequences(X)])

In [8]:
def preprocess_text_lstm_cnn(text, max_length):
    sequence = tokenizer_lstm_cnn.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    return torch.tensor(padded_sequence, dtype=torch.int64).to(device)

In [9]:
def preprocess_text_bert(text, max_length=128):
    encoding = bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return encoding['input_ids'].to(device), encoding['attention_mask'].to(device)

In [50]:
def ensemble_predict(text, max_length=128):
    try:
        #print("Starting prediction process...")

        # Get LSTM prediction (TensorFlow model)
        #print("Processing LSTM...")
        lstm_input = preprocess_text_lstm_cnn(text, max_length)
        lstm_input = lstm_input.cpu().numpy()  # Convert to numpy for TensorFlow
        lstm_output = lstm_model.predict(lstm_input, verbose=0)
        lstm_probs = tf.nn.softmax(lstm_output).numpy()
        #print("LSTM probabilities computed")

        # Get CNN prediction (TensorFlow model)
        #print("Processing CNN...")
        cnn_input = preprocess_text_lstm_cnn(text, max_length)
        cnn_input = cnn_input.cpu().numpy()  # Convert to numpy for TensorFlow
        cnn_output = cnn_model.predict(cnn_input, verbose=0)
        cnn_probs = tf.nn.softmax(cnn_output).numpy()
        #print("CNN probabilities computed")

        # Get BERT prediction (PyTorch model)
        #print("Processing BERT...")
        bert_input_ids, bert_attention_mask = preprocess_text_bert(text)
        with torch.no_grad():
            outputs = bert_model(bert_input_ids, attention_mask=bert_attention_mask)
            logits = outputs.logits.to('cpu')
            bert_probs = torch.nn.functional.softmax(logits, dim=1).numpy()
        #print("BERT probabilities computed")

        # Ensure all arrays are the right shape
        #print(f"Probability shapes - LSTM: {lstm_probs.shape}, CNN: {cnn_probs.shape}, BERT: {bert_probs.shape}")

        # Ensemble Voting - Probability Averaging
        weights = [1/3, 1/3, 1/3]
        final_probs = (weights[0] * lstm_probs + weights[1] * cnn_probs + weights[2] * bert_probs)
        #print("Computed final probabilities")

        # Get final predicted class
        final_prediction = np.argmax(final_probs, axis=1)[0]
        result = label_encoder.inverse_transform([final_prediction])[0]
        #print(f"Final prediction: {result}")

        return result

    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Error location: {e.__traceback__.tb_lineno}")
        print(f"Exception type: {type(e).__name__}")
        print(f"Exception details: {str(e)}")
        return "not cyber bullying"  # Default prediction in case of error

In [81]:
sample_text = "Thanks to stupid like you"
predicted_label = ensemble_predict(sample_text)
print(f"Final Predicted Label: {predicted_label}")

Final Predicted Label: other_cyberbullying


In [52]:
def evaluate_ensemble(X_test, y_test):
    y_true = label_encoder.inverse_transform(y_test)
    y_pred = [ensemble_predict(text) for text in X_test]

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Run evaluation
evaluate_ensemble(X_test, y_test)

Accuracy: 0.9446
Precision: 0.9460
Recall: 0.9446
F1-Score: 0.9444

Classification Report:
                     precision    recall  f1-score   support

                age       1.00      1.00      1.00      1598
          ethnicity       1.00      0.99      1.00      1592
             gender       0.96      0.97      0.96      1595
  not_cyberbullying       0.91      0.80      0.85      1589
other_cyberbullying       0.82      0.91      0.86      1565
           religion       0.99      0.99      0.99      1600

           accuracy                           0.94      9539
          macro avg       0.95      0.94      0.94      9539
       weighted avg       0.95      0.94      0.94      9539

