# <center>INFO557: Final Project
### <center>Namig Abbasov 

## Import Libraries and Load Data 

In [1]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input,Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import os
import tensorflow as tf

  from pandas.core import (


In [2]:
train_df = pd.read_csv("train.csv")
dev_df = pd.read_csv("dev.csv")

## Data Preprocessing 

In [3]:
### reproducibility

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [4]:
### prepare labels and train and dev sets 

label_cols = ["admiration", "amusement", "gratitude", "love", "pride", "relief", "remorse"]
X_train_text = train_df["text"].astype(str)
y_train = train_df[label_cols].values
X_dev_text = dev_df["text"].astype(str)
y_dev = dev_df[label_cols].values

In [5]:
### Tokenization

vocab_size = 20000
max_len = 120

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)
word_index = tokenizer.word_index

X_train = pad_sequences(tokenizer.texts_to_sequences(X_train_text), maxlen=max_len)
X_dev = pad_sequences(tokenizer.texts_to_sequences(X_dev_text), maxlen=max_len)

In [6]:
### Load GloVe and build embedding matrix to initialize weights in neural network

embedding_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## Model Development, Training, and Prediction

In [7]:
### Build Model 
def create_model():
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_len, trainable=True),  
        Bidirectional(LSTM(64, return_sequences=True)),
        GlobalMaxPooling1D(),  
        BatchNormalization(),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(len(label_cols), activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
model = create_model()

2025-05-05 00:14:51.350561: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Max
2025-05-05 00:14:51.350652: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2025-05-05 00:14:51.350669: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2025-05-05 00:14:51.350702: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-05 00:14:51.350729: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
### Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
### train model
model.fit(X_train, y_train, epochs=15, batch_size=32,
          validation_data=(X_dev, y_dev), callbacks=[early_stop], verbose=1)

Epoch 1/15


2025-05-05 00:14:53.873667: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 180ms/step - accuracy: 0.4715 - loss: 0.2509 - val_accuracy: 0.4290 - val_loss: 0.0860
Epoch 2/15
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 180ms/step - accuracy: 0.6392 - loss: 0.0963 - val_accuracy: 0.6269 - val_loss: 0.0698
Epoch 3/15
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 192ms/step - accuracy: 0.6349 - loss: 0.0829 - val_accuracy: 0.6034 - val_loss: 0.0681
Epoch 4/15
[1m788/788[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 193ms/step - accuracy: 0.6138 - loss: 0.0733 - val_accuracy: 0.5481 - val_loss: 0.0688
Epoch 5/15
[1m547/788[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m45s[0m 187ms/step - accuracy: 0.6015 - loss: 0.0658

In [None]:
### predict on dev
dev_probs = model.predict(X_dev)

## Threshold Tuning as Hyperparameter Tuning

In [None]:
best_f1 = 0
best_thresh = 0.5

for t in np.arange(0.3, 0.6, 0.02):
    dev_preds = (dev_probs > t).astype(int)
    f1 = f1_score(y_dev, dev_preds, average='micro')
    print(f"Threshold {t:.2f} → Micro F1: {f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nBest threshold: {best_thresh:.2f} with Micro F1: {best_f1:.4f}")

## Save predictions

In [None]:
final_preds = (dev_probs > best_thresh).astype(int)
dev_submission = dev_df[["text"]].copy()
dev_submission[label_cols] = final_preds
dev_submission.to_csv("submission_dev.csv", index=False)

with zipfile.ZipFile("submission_dev.zip", 'w') as zipf:
    zipf.write("submission_dev.csv")
print("Dev submission saved as submission_dev.zip")