<a href="https://colab.research.google.com/github/naisyh/CD-FYP/blob/main/TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
# Colab setup: install compress-fasttext, which provides the `compress_fasttext`
# module imported in the next cell. No version is pinned here.
!pip install compress-fasttext




In [78]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras import backend as K
import compress_fasttext

In [79]:
# --- Hyperparameters shared by the cells below ---
# Every text is padded/truncated to this many positions, both by the DistilBERT
# tokenizer call and when building the FastText embedding matrix.
MAX_LEN = 128
# Length of one FastText word vector (assumed to match the loaded FastText model).
EMBEDDING_DIM = 300
# Mini-batch size and number of training epochs passed to model.fit.
BATCH_SIZE, EPOCHS = 16, 3

In [80]:
    # from google.colab import files
    # uploaded = files.upload()

In [81]:
# Load the labelled corpus. The CSV must contain a 'messages' column (the text)
# and a 'hate' column (the class label).
#
# Rows with a missing message or label are dropped *before* the string cast
# below: astype(str) would otherwise turn a missing message into the literal
# text 'nan' (a bogus training sample), and a missing label would reach
# LabelEncoder as an extra, meaningless value.
df = (
    pd.read_csv('HateMalay Dataset.csv')
    .dropna(subset=['messages', 'hate'])
    .reset_index(drop=True)
)

# Extract tweets and labels as plain Python lists (same row order as `df`).
tweets = df['messages'].astype(str).tolist()
labels = df['hate'].tolist()

# Encode the labels as integers; `le` is kept so predictions can later be
# mapped back to the original label values with le.inverse_transform.
le = LabelEncoder()
labels_enc = le.fit_transform(labels)

In [82]:
# Two-stage stratified split with a fixed seed (random_state=42):
#   1) hold out 10% of all samples as the test set;
#   2) hold out 10% of what remains as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels_enc,
    test_size=0.1,
    stratify=labels_enc,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)


In [83]:
# ============================
# 🔠 Tokenize with DistilBERT
# ============================
# Tokenizer of the pretrained multilingual (cased) DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')


def encode_tweets(tweets):
    """Tokenize a batch of texts with the notebook-level DistilBERT ``tokenizer``.

    Every sequence is padded or truncated to exactly ``MAX_LEN`` positions and
    the result is returned as TensorFlow tensors (``return_tensors='tf'``).

    Args:
        tweets: The texts to encode; passed straight to the tokenizer.

    Returns:
        The tokenizer output. Later cells read its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    tokenizer_options = {
        'max_length': MAX_LEN,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(tweets, **tokenizer_options)


# Encode each split once, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    encode_tweets(split) for split in (X_train, X_val, X_test)
)

In [84]:
# ============================
# 🔤 Load FastText
# ============================
# Load the compressed FastText word vectors from the path "fasttext-ms-mini"
# (resolved relative to the working directory). fasttext_embedding() below
# looks individual tokens up in this object.
fasttext_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load("fasttext-ms-mini")  # Adjust path

# Tokenize raw text into words
def tokenize(text):
    """Split ``text`` into word tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    words = text.split(sep=None)  # sep=None: split on any whitespace run
    return words

# Convert text to FastText embeddings
def fasttext_embedding(tweet_tokens, model=None, max_len=None, dim=None):
    """Turn a token list into a fixed-size ``(max_len, dim)`` float32 matrix.

    Row ``i`` holds the vector of the ``i``-th token. Rows for tokens the
    vector store does not contain, and every row past the end of the text,
    are all-zero. Tokens beyond ``max_len`` are ignored.

    Args:
        tweet_tokens: Iterable of word tokens for one text.
        model: Vector store supporting ``token in model`` and ``model[token]``.
            Defaults to the notebook-level ``fasttext_model``.
        max_len: Number of rows in the result. Defaults to ``MAX_LEN``.
        dim: Length of each word vector. Defaults to ``EMBEDDING_DIM``.

    Returns:
        ``np.ndarray`` of shape ``(max_len, dim)`` and dtype ``float32``.

    Raises:
        ValueError: If a looked-up vector does not have length ``dim``.
    """
    # Resolve the notebook-level defaults lazily so callers (and tests) can
    # supply their own vector store and sizes.
    model = fasttext_model if model is None else model
    max_len = MAX_LEN if max_len is None else max_len
    dim = EMBEDDING_DIM if dim is None else dim

    # Allocate the padded result once. float32 matches the dtype of the
    # model's `fasttext_input` layer and is half the size of NumPy's default
    # float64, which matters once thousands of these matrices are stacked.
    matrix = np.zeros((max_len, dim), dtype=np.float32)
    for row, token in enumerate(tweet_tokens):
        if row >= max_len:
            break  # truncate: no need to look up tokens that would be dropped
        if token in model:
            matrix[row] = model[token]
        # else: leave the row as zeros (unknown token)
    return matrix

# Prepare FastText inputs
# One (MAX_LEN, EMBEDDING_DIM) matrix per text, stacked along a new leading
# axis; built for the train, validation and test splits in that order.
X_train_fasttext, X_val_fasttext, X_test_fasttext = (
    np.array([fasttext_embedding(tokenize(text)) for text in split])
    for split in (X_train, X_val, X_test)
)




In [85]:
# ============================
# 🧠 Capsule Layer Definition
# ============================
def squash(vectors, axis=-1):
    """Capsule "squash" non-linearity applied along ``axis``.

    Each vector ``v`` is rescaled to
    ``(|v|^2 / (1 + |v|^2)) * v / sqrt(|v|^2 + eps)`` with ``eps = K.epsilon()``
    guarding the division for zero vectors. The direction of ``v`` is kept
    while its length is compressed to below 1.

    Args:
        vectors: Tensor whose ``axis`` dimension holds the vector components.
        axis: Axis along which the squared norm is computed.

    Returns:
        A tensor of the same shape as ``vectors``.
    """
    sq_norm = tf.reduce_sum(tf.square(vectors), axis, keepdims=True)
    shrink = sq_norm / (1 + sq_norm)            # |v|^2 / (1 + |v|^2)
    norm = tf.sqrt(sq_norm + K.epsilon())       # ~|v|, never exactly zero
    return (shrink / norm) * vectors

class CapsuleLayer(layers.Layer):
    """Capsule layer with iterative routing over the input positions.

    The input is a sequence of feature vectors, ``(batch, steps, features)``.
    Every position is projected by one shared weight matrix into a prediction
    vector for each of ``num_capsules`` output capsules; the predictions are
    then combined by ``routings`` rounds of routing-by-agreement.

    Args:
        num_capsules: Number of output capsules.
        dim_capsule: Length of each output capsule vector.
        routings: Number of routing iterations (must be >= 1).
        **kwargs: Forwarded to ``layers.Layer``.

    Raises:
        ValueError: If ``routings`` is smaller than 1.
    """

    def __init__(self, num_capsules, dim_capsule, routings=3, **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        # With zero iterations the routing loop in call() would never produce
        # an output, so reject that configuration up front.
        if routings < 1:
            raise ValueError(f"routings must be >= 1, got {routings}")
        self.num_capsules = num_capsules
        self.dim_capsule = dim_capsule
        self.routings = routings

    def build(self, input_shape):
        # One projection shared by all input positions:
        # (features) -> (num_capsules * dim_capsule).
        self.W = self.add_weight(shape=[input_shape[-1], self.num_capsules * self.dim_capsule],
                                 initializer='glorot_uniform', trainable=True)

    def call(self, inputs):
        # Prediction vectors: (batch, steps, num_capsules * dim_capsule).
        u_hat = tf.tensordot(inputs, self.W, axes=1)  # Linear transformation
        # -> (batch, steps, num_capsules, dim_capsule); uses the static step count.
        u_hat = tf.reshape(u_hat, (-1, inputs.shape[1], self.num_capsules, self.dim_capsule))
        # -> (batch, num_capsules, steps, dim_capsule)
        u_hat = tf.transpose(u_hat, perm=[0, 2, 1, 3])
        # Routing logits, one per (capsule, step) pair, starting at zero.
        b = tf.zeros_like(u_hat[..., 0])  # Routing logits
        for i in range(self.routings):
            # Coupling coefficients: each input position distributes its vote
            # across the output capsules (softmax over the capsule axis).
            c = tf.nn.softmax(b, axis=1)
            # Weighted sum over positions -> (batch, num_capsules, dim_capsule).
            s = tf.reduce_sum(c[..., tf.newaxis] * u_hat, axis=2)
            v = squash(s)
            if i < self.routings - 1:
                # Agreement: dot product between each prediction and the
                # current capsule output raises the matching logit.
                b += tf.reduce_sum(u_hat * v[:, :, tf.newaxis, :], axis=-1)
        # Flatten the capsules into one vector per sample.
        return tf.reshape(v, (-1, self.num_capsules * self.dim_capsule))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a saved model containing it is loaded."""
        config = super().get_config()
        config.update({
            'num_capsules': self.num_capsules,
            'dim_capsule': self.dim_capsule,
            'routings': self.routings,
        })
        return config


In [86]:

# # ============================
# # 🔧 Build Model Architecture
# # ============================
# input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
# attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
# fasttext_input = layers.Input(shape=(MAX_LEN, EMBEDDING_DIM), dtype=tf.float32, name='fasttext_input')

# # --- DistilBERT branch ---
# distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
# distilbert_outputs = distilbert_model(input_ids, attention_mask=attention_mask)[0]
# conv = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(distilbert_outputs)
# capsule = CapsuleLayer(num_capsules=10, dim_capsule=16)(conv)  # Output shape: (None, 160)

# # --- FastText branch ---
# bi_gru = layers.Bidirectional(layers.GRU(128, return_sequences=True))(fasttext_input)
# attention = layers.Attention()([bi_gru, bi_gru])
# attention_output = tf.reduce_sum(attention * bi_gru, axis=1)  # Output shape: (None, 256)

# # --- Merge both channels ---
# concat = layers.Concatenate()([capsule, attention_output])  # Shape: (None, 416)
# fc1 = layers.Dense(200, activation='relu')(concat)
# fc2 = layers.Dense(100, activation='relu')(fc1)
# output = layers.Dense(2, activation='softmax')(fc2)  # Binary classification output

# # Create Model
# model = Model(inputs=[input_ids, attention_mask, fasttext_input], outputs=output)
# model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])
# model.summary()


In [87]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from transformers import TFDistilBertModel

# Sequence length and FastText vector width (same values as the
# configuration cell near the top of the notebook).
MAX_LEN, EMBEDDING_DIM = 128, 300

# Symbolic model inputs: two integer per-token tensors for DistilBERT and one
# float tensor holding a FastText vector per token position.
input_ids = layers.Input(name='input_ids', shape=(MAX_LEN,), dtype=tf.int32)
attention_mask = layers.Input(name='attention_mask', shape=(MAX_LEN,), dtype=tf.int32)
fasttext_input = layers.Input(name='fasttext_input', shape=(MAX_LEN, EMBEDDING_DIM), dtype=tf.float32)

# --- DistilBERT branch ---
# Pretrained multilingual (cased) DistilBERT encoder.
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

# Custom layer to wrap the DistilBERT model
class DistilBertLayer(layers.Layer):
    """Keras layer that runs a wrapped DistilBERT model on token inputs.

    Args:
        model: Either an already loaded DistilBERT model, or the checkpoint
            name/path string that ``get_config`` stores, in which case the
            model is loaded with ``TFDistilBertModel.from_pretrained``.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, model, **kwargs):
        super(DistilBertLayer, self).__init__(**kwargs)
        # get_config() serializes the wrapped model as its name/path string,
        # so accept that string here; otherwise a layer rebuilt from its own
        # config would hold a plain str instead of a callable model.
        # NOTE(review): loading by name restores the pretrained checkpoint;
        # confirm whether fine-tuned DistilBERT weights are saved/restored
        # separately when a trained model is reloaded.
        if isinstance(model, str):
            model = TFDistilBertModel.from_pretrained(model)
        self.model = model

    def call(self, inputs):
        """Run DistilBERT on ``[input_ids, attention_mask]`` and return the
        first element of its output (the last hidden state)."""
        input_ids, attention_mask = inputs
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs[0]  # Return the last hidden state

    def get_config(self):
        """Return the layer config with the wrapped model stored by name."""
        config = super().get_config()
        config.update({
            "model": self.model.config._name_or_path  # Store the model name instead
        })
        return config

# Wrap the pretrained DistilBERT model so it can be called on the Keras inputs.
distilbert_layer = DistilBertLayer(distilbert_model)

# Feed both token-level inputs through the wrapper.
distilbert_outputs = distilbert_layer([input_ids, attention_mask])

# Convolve over the DistilBERT outputs, then summarize with a capsule layer.
conv = layers.Conv1D(64, 3, activation='relu')(distilbert_outputs)
capsule = CapsuleLayer(10, 16)(conv)  # Output shape: (None, 160) = 10 capsules x 16 dims

# --- FastText branch ---
# Bidirectional GRU over the FastText vectors, followed by attention with the
# GRU states used as both inputs to the attention layer.
bi_gru = layers.Bidirectional(layers.GRU(units=128, return_sequences=True))(fasttext_input)
attention = layers.Attention()([bi_gru, bi_gru])

# Custom layer to handle the attention output
class AttentionOutputLayer(layers.Layer):
    """Pool the FastText branch into one vector per sample.

    ``call`` receives ``[bi_gru, attention]``, multiplies the two tensors
    element-wise and sums the product over axis 1.
    """

    def call(self, inputs):
        gru_states, attended = inputs
        weighted = attended * gru_states
        return tf.reduce_sum(weighted, axis=1)  # Output shape: (None, 256)

# Pool the FastText branch: combine GRU states with the attention output.
attention_output_layer = AttentionOutputLayer()

# Apply it to the two FastText-branch tensors.
attention_output = attention_output_layer([bi_gru, attention])

# --- Merge both channels ---
# Capsule features (160) + pooled FastText features (256), then two dense
# layers and a 2-way softmax.
concat = layers.Concatenate()([capsule, attention_output])  # Shape: (None, 416)
fc1 = layers.Dense(units=200, activation='relu')(concat)
fc2 = layers.Dense(units=100, activation='relu')(fc1)
output = layers.Dense(units=2, activation='softmax')(fc2)  # Binary classification output

# Create Model
model = Model(
    inputs=[input_ids, attention_mask, fasttext_input],
    outputs=output,
)
# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [88]:
from tensorflow.keras.callbacks import ModelCheckpoint

# ============================
# 🚀 Train the Model
# ============================

# Write the model to 'model_checkpoint.keras' each time the validation loss
# reaches a new minimum, and log every save.
checkpoint = ModelCheckpoint(
    filepath='model_checkpoint.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Inputs are passed by name; the keys match the names of the model's Input layers.
train_inputs = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'fasttext_input': X_train_fasttext,
}
val_inputs = {
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'fasttext_input': X_val_fasttext,
}

history = model.fit(
    x=train_inputs,
    y=np.array(y_train),
    validation_data=(val_inputs, np.array(y_val)),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[checkpoint],
)


Epoch 1/3
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.6220 - loss: 0.6603
Epoch 1: val_loss improved from inf to 0.62031, saving model to model_checkpoint.keras
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1142s[0m 4s/step - accuracy: 0.6221 - loss: 0.6602 - val_accuracy: 0.6621 - val_loss: 0.6203
Epoch 2/3
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.6660 - loss: 0.6089
Epoch 2: val_loss improved from 0.62031 to 0.60016, saving model to model_checkpoint.keras
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1074s[0m 4s/step - accuracy: 0.6660 - loss: 0.6089 - val_accuracy: 0.6735 - val_loss: 0.6002
Epoch 3/3
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.6939 - loss: 0.5842
Epoch 3: val_loss improved from 0.60016 to 0.59954, saving model to model_checkpoint.keras
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1106s[0m 4s

In [89]:
# Save the model as it stands after the last training epoch. This is separate
# from 'model_checkpoint.keras', which the ModelCheckpoint callback wrote for
# the epoch with the lowest validation loss.
model.save('my_model.keras')

In [90]:
# Print the layer-by-layer summary again (same call as at the end of the
# model-building cell).
model.summary()


In [91]:
# Colab only: mount the user's Google Drive at /content/drive.
# NOTE(review): no visible cell in this notebook reads from or writes to the
# mount point — confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [92]:

# ============================
# 📊 Evaluate the Model
# ============================
# Class probabilities for the held-out test split (inputs passed by name).
test_preds = model.predict(dict(
    input_ids=test_encodings['input_ids'],
    attention_mask=test_encodings['attention_mask'],
    fasttext_input=X_test_fasttext,
))
# Most probable class index per sample.
test_pred_labels = np.argmax(test_preds, axis=1)

# Per-class precision/recall/F1, labelled with the original class names.
print(
    classification_report(
        y_test,
        test_pred_labels,
        target_names=le.classes_,
    )
)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 7s/step
              precision    recall  f1-score   support

          no       0.68      0.98      0.80       301
         yes       0.88      0.26      0.40       189

    accuracy                           0.70       490
   macro avg       0.78      0.62      0.60       490
weighted avg       0.75      0.70      0.65       490



In [96]:
def predict_hate_speech(user_input):
    """Classify a single message with the trained model.

    The message is encoded for both branches (DistilBERT token ids/attention
    mask and the FastText matrix), run through ``model.predict`` as a batch of
    one, and the most probable class is mapped back through the label encoder.

    Args:
        user_input: The message text.

    Returns:
        The predicted label in its original form, as produced by
        ``le.inverse_transform``.
    """
    # DistilBERT branch: a one-element batch.
    bert_inputs = encode_tweets([user_input])

    # FastText branch: wrap the single matrix in a leading batch axis.
    ft_matrix = fasttext_embedding(tokenize(user_input))
    ft_batch = np.array([ft_matrix])

    class_probs = model.predict({
        'input_ids': bert_inputs['input_ids'],
        'attention_mask': bert_inputs['attention_mask'],
        'fasttext_input': ft_batch
    })

    # Index of the most probable class for the only sample in the batch.
    best_class = np.argmax(class_probs, axis=1)[0]
    original_labels = le.inverse_transform([best_class])
    return original_labels[0]
# Example usage: prompt for one message on standard input, classify it with
# predict_hate_speech(), and print the predicted label. This cell blocks until
# a message is entered.
user_input = input("Enter a message to classify: ")
predicted_output = predict_hate_speech(user_input)
print(f"Predicted Output: {predicted_output}")

Enter a message to classify: macam bangkai
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 969ms/step
Predicted Output: no


In [94]:
# Install Streamlit for the app cell below (no version pinned).
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so the
# install may not have completed in that session.
! pip install streamlit

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _ = create_package_set_from_installed()
              

KeyboardInterrupt: 

In [None]:
import streamlit as st


# Load the trained model
# model = tf.keras.models.load_model('my_model.keras', custom_objects={'DistilBertLayer': DistilBertLayer})
# NOTE: with the load above commented out, predict_hate_speech() below uses
# whatever `model` is already defined in the running kernel (the one built and
# trained in the earlier cells); this cell does not work on its own.

# Load FastText model
# (same path as the earlier FastText cell; rebinds `fasttext_model`)
fasttext_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load("fasttext-ms-mini")  # Adjust path

# Load the tokenizer
# (same checkpoint as the earlier tokenizer cell; rebinds `tokenizer`)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Define constants
# Same values as the configuration cell; presumably repeated so this cell can
# be copied into a standalone Streamlit script — TODO confirm.
MAX_LEN = 128
EMBEDDING_DIM = 300

# Function to encode tweets
def encode_tweets(tweets):
    """Tokenize ``tweets`` with the notebook-level DistilBERT ``tokenizer``.

    Sequences are padded or truncated to ``MAX_LEN`` positions and returned as
    TensorFlow tensors.
    """
    encoded = tokenizer(
        tweets,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
    )
    return encoded

# Function to create FastText embeddings
def fasttext_embedding(tweet_tokens, model=None, max_len=None, dim=None):
    """Turn a token list into a fixed-size ``(max_len, dim)`` float32 matrix.

    Row ``i`` holds the vector of the ``i``-th token. Rows for tokens the
    vector store does not contain, and every row past the end of the text,
    are all-zero. Tokens beyond ``max_len`` are ignored.

    Args:
        tweet_tokens: Iterable of word tokens for one text.
        model: Vector store supporting ``token in model`` and ``model[token]``.
            Defaults to the notebook-level ``fasttext_model``.
        max_len: Number of rows in the result. Defaults to ``MAX_LEN``.
        dim: Length of each word vector. Defaults to ``EMBEDDING_DIM``.

    Returns:
        ``np.ndarray`` of shape ``(max_len, dim)`` and dtype ``float32``.

    Raises:
        ValueError: If a looked-up vector does not have length ``dim``.
    """
    # Resolve the notebook-level defaults lazily so callers (and tests) can
    # supply their own vector store and sizes.
    model = fasttext_model if model is None else model
    max_len = MAX_LEN if max_len is None else max_len
    dim = EMBEDDING_DIM if dim is None else dim

    # Allocate the padded result once, in float32 rather than NumPy's default
    # float64, so the matrix is half the size and needs no later down-cast.
    matrix = np.zeros((max_len, dim), dtype=np.float32)
    for row, token in enumerate(tweet_tokens):
        if row >= max_len:
            break  # truncate: no need to look up tokens that would be dropped
        if token in model:
            matrix[row] = model[token]
        # else: leave the row as zeros (unknown token)
    return matrix

# Function to predict hate speech
def predict_hate_speech(user_input):
    """Classify one message and return the predicted class index.

    The message is encoded for both model branches, run through
    ``model.predict`` as a batch of one, and the index of the most probable
    class is returned (not mapped back to the original label).
    """
    bert_inputs = encode_tweets([user_input])

    # Whitespace-split tokens -> FastText matrix, wrapped in a batch axis.
    ft_matrix = fasttext_embedding(user_input.split())
    ft_batch = np.array([ft_matrix])

    model_inputs = {
        'input_ids': bert_inputs['input_ids'],
        'attention_mask': bert_inputs['attention_mask'],
        'fasttext_input': ft_batch
    }
    class_probs = model.predict(model_inputs)

    # Only one sample in the batch, so take element 0 of the argmax.
    return np.argmax(class_probs, axis=1)[0]

# Streamlit app layout
st.title("Cyberbullying Detection")
st.write("Enter a message to classify whether it is cyberbullying or not.")

# User input
user_input = st.text_area("Message:")

if st.button("Predict"):
    # Treat whitespace-only text like an empty box: it contains no word tokens,
    # so there is nothing meaningful to classify.
    if user_input and user_input.strip():
        prediction = predict_hate_speech(user_input)
        # NOTE(review): assumes encoded class 1 is the positive (cyberbullying)
        # label — confirm against the label encoder's class order.
        if prediction == 1:  # Assuming 1 indicates cyberbullying
            st.write("Prediction: **Cyberbullying**")
        else:
            st.write("Prediction: **Not Cyberbullying**")
    else:
        st.write("Please enter a message.")
