In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# --- Data ---------------------------------------------------------------
# Three CSV files are read from the working directory; the code below uses
# their 'text' and 'labels' columns.
olid_train = pd.read_csv('olid-train-small.csv')
olid_test = pd.read_csv('olid-test.csv')
hasoc_train = pd.read_csv('hasoc-train.csv')

# --- Vocabulary ---------------------------------------------------------
# The tokenizer is fitted on the OLID training text only; the OLID test and
# HASOC texts are encoded with that same fitted vocabulary.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(olid_train['text'])

# --- Sequence shape -----------------------------------------------------
max_length = 50
padding_type = 'post'
trunc_type = 'post'


def _encode_and_pad(texts):
    """Encode `texts` with the fitted tokenizer and pad them to `max_length`.

    Returns a `(sequences, padded)` pair: the raw integer sequences from
    `tokenizer.texts_to_sequences` and the result of `pad_sequences` applied
    to them with the notebook's padding/truncation settings.
    """
    encoded = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return encoded, padded


# OLID train / test
olid_train_sequences, olid_train_padded = _encode_and_pad(olid_train['text'])
olid_test_sequences, olid_test_padded = _encode_and_pad(olid_test['text'])

# HASOC train
hasoc_train_sequences, hasoc_train_padded = _encode_and_pad(hasoc_train['text'])

# --- Targets ------------------------------------------------------------
olid_train_labels = olid_train['labels']
olid_test_labels = olid_test['labels']
hasoc_train_labels = hasoc_train['labels']

In [22]:
# Define CNN model for text classification
#
# Seed Python's `random`, NumPy and TensorFlow before any layer is created so
# that weight initialisation, dropout and batch shuffling repeat across
# Restart & Run All. NOTE: some GPU kernels may still be non-deterministic,
# so scores can differ slightly between runs on an accelerator.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

embedding_dim = 128

model = tf.keras.Sequential([
    # The embedding table size is read from the fitted tokenizer instead of
    # repeating the literal, so the two cannot drift apart if `num_words`
    # is changed in the preprocessing cell.
    tf.keras.layers.Embedding(tokenizer.num_words, embedding_dim, input_length=max_length),
    # 128 filters over windows of 5 consecutive token embeddings.
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    # Keep the strongest activation of each filter across the sequence.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: the output is thresholded into a binary label at
    # evaluation time.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 128)           1280000   
                                                                 
 conv1d_4 (Conv1D)           (None, 46, 128)           82048     
                                                                 
 global_max_pooling1d_4 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_8 (Dense)             (None, 128)               16512     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                      

In [23]:
# Fit the CNN on the padded OLID training sequences, with a fraction of the
# training data reserved for validation via `validation_split`.
history_olid = model.fit(
    x=olid_train_padded,
    y=olid_train_labels,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

# Alternative run on the HASOC dataset (currently disabled):
# history_hasoc = model.fit(hasoc_train_padded, hasoc_train_labels, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10


2024-10-04 22:34:09.765805: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10


2024-10-04 22:34:12.503643: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# In-domain evaluation: score the OLID test set with the model trained above.
# Sigmoid outputs above 0.5 are mapped to class 1, everything else to class 0.
olid_scores = model.predict(olid_test_padded)
olid_pred = (olid_scores > 0.5).astype(int)

in_domain_report = classification_report(olid_test_labels, olid_pred)
in_domain_matrix = confusion_matrix(olid_test_labels, olid_pred)

print(in_domain_report)
print("Confusion Matrix (In-Domain):")
print(in_domain_matrix)

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       620
           1       0.50      0.62      0.56       240

    accuracy                           0.72       860
   macro avg       0.67      0.69      0.68       860
weighted avg       0.75      0.72      0.73       860

Confusion Matrix (In-Domain):
[[473 147]
 [ 91 149]]


2024-10-04 22:34:24.011851: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [25]:
# Cross-domain evaluation: score the OLID test set with a model trained on the HASOC dataset (disabled; the HASOC training call is also commented out, so enable both together)
# olid_pred_cross = (model.predict(olid_test_padded) > 0.5).astype(int)
# print(classification_report(olid_test_labels, olid_pred_cross))
# print("Confusion Matrix (Cross-Domain):")
# print(confusion_matrix(olid_test_labels, olid_pred_cross))