# CNN baseline for MetaHate

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Reading the data

In [None]:
# Load the tab-separated dataset into a two-column frame (label, text).
# Because `names=` is given without `header=`, pandas reads the very first
# line of the file as data.
# NOTE(review): confirm the file really has no header row.
data = pd.read_csv(
    '/data/metahate.csv',
    sep='\t',
    names=['label', 'text'],
)

# Pull both columns out as plain Python lists for the later cells.
labels = data['label'].tolist()
texts = data['text'].tolist()

## Splitting the data into training and testing sets

In [None]:
# Hold out 20% of the (text, label) pairs for testing. The fixed seed keeps
# the split reproducible across runs. No `stratify` argument is passed, so
# the class ratio of each part is left to the random shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42,
)

## Tokenizing the text data and converting it to sequences

In [None]:
# Build the word -> integer index from the training texts only, so the
# vocabulary is not influenced by the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits with that same index: every text becomes a list of the
# integer ids of its words.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
)

## Padding sequences to ensure uniform length

In [None]:
# Setting the maximum length for sequences
# Fixed length (in tokens) every sequence is brought to
maxlen = 512


def _pad_to_maxlen(sequences):
    """Return `sequences` as a 2D array with exactly `maxlen` columns.

    Uses the `pad_sequences` defaults: shorter sequences are padded with
    zeros at the beginning, longer ones are truncated to `maxlen`.
    """
    return pad_sequences(sequences, maxlen=maxlen)


X_train_padded = _pad_to_maxlen(X_train_sequences)
X_test_padded = _pad_to_maxlen(X_test_sequences)

## Creating a simple neural network

In [None]:
# Creating a Sequential model
# Feed-forward text classifier: Embedding -> Flatten -> Dense(ReLU) -> Dense(sigmoid).
# NOTE(review): the notebook title says "CNN", but no convolutional layer is
# added here -- confirm whether that is intended.
model = Sequential()

# Embedding layer: maps each integer token id to a 64-dimensional vector.
model.add(Embedding(
    input_dim=len(tokenizer.word_index) + 1,  # Vocabulary size: number of indexed words + 1 (id 0 is the padding value)
    output_dim=64,                            # Dimension of the dense embedding
    input_length=maxlen,                      # Length of input sequences (padded/truncated to 'maxlen')
))

# Flattening the (maxlen, 64) embedding output of each sample into a single
# vector of maxlen * 64 values
model.add(Flatten())

# Hidden Dense layer with 64 units and ReLU activation.
# No `input_dim` is given: the layer infers its input size (maxlen * 64) from
# the Flatten output. The previous `input_dim=maxlen` did not match that size
# and only has an effect on the first layer of a Sequential model.
model.add(Dense(64, activation='relu'))

# Output layer: a single sigmoid unit producing a probability for binary classification
model.add(Dense(1, activation='sigmoid'))


## Compiling and training the model

In [None]:
model.compile(
    optimizer='adam',            # Adam optimizer
    loss='binary_crossentropy',  # Binary cross-entropy, as this is a binary classification task
    metrics=['accuracy'],        # Accuracy is reported during training
)

# `y_train` is a plain Python list (it comes from `labels.tolist()` through
# train_test_split). Keras expects array-like targets, and `validation_split`
# in particular only works on NumPy arrays / tensors, so convert it first.
y_train_array = np.asarray(y_train)

model.fit(
    X_train_padded,        # The padded sequences of input features
    y_train_array,         # The corresponding target labels, as a NumPy array
    epochs=1,              # Number of passes over the training data
    batch_size=32,         # Number of samples per weight update
    validation_split=0.2,  # Fraction of the training data held back for validation during training
)

## Evaluating the model on the test set

In [None]:
# Predicted probabilities for the test set, as returned by the sigmoid output
probabilities = model.predict(X_test_padded)

# Threshold at 0.5, turn the booleans into 0/1 integers and flatten to 1D
predictions = (probabilities > 0.5).astype(int).flatten()

## Calculating the metrics

In [None]:
# Overall accuracy and the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

# F1 score under the three averaging schemes, computed in the order
# weighted, micro, macro
weighted_f1, micro_f1, macro_f1 = (
    f1_score(y_test, predictions, average=averaging)
    for averaging in ('weighted', 'micro', 'macro')
)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print(f"Weighted F1 Score: {weighted_f1}")
print(f"Micro F1 Score: {micro_f1}")
print(f"Macro F1 Score: {macro_f1}")