In [None]:
from google.colab import drive
# Mount Google Drive at the path the dataset locations below are rooted in.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, hamming_loss, roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Load datasets
train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/train_dataset.csv'
validation_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/validation_dataset.csv'
test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/test_dataset.csv'

df_train = pd.read_csv(train_dataset_path)
validation_df = pd.read_csv(validation_dataset_path)
test_df = pd.read_csv(test_dataset_path)

# Drop the 'emotion' column (errors='ignore' makes this a no-op when the column is absent)
df_train = df_train.drop(columns=['emotion'], errors='ignore')
validation_df = validation_df.drop(columns=['emotion'], errors='ignore')
test_df = test_df.drop(columns=['emotion'], errors='ignore')

# Removing samples with only 0 in their labels.
# Every column after the first is summed per row and the row is kept when the
# total is positive. DataFrame.sum(axis=1) does this in one vectorised pass
# instead of calling a Python lambda once per row.
df_train = df_train.loc[df_train.iloc[:, 1:].sum(axis=1) > 0]
validation_df = validation_df.loc[validation_df.iloc[:, 1:].sum(axis=1) > 0]
test_df = test_df.loc[test_df.iloc[:, 1:].sum(axis=1) > 0]



In [None]:
# Show the training frame as the cell output (pandas truncates long frames when rendering).
df_train

Unnamed: 0,cleaned_text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,i was born in 98 so i feel like your 98 loss i...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"wow, you all are heroes!",0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,if its not obvious everyone is having issues w...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,this architecture will be treasured even more ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,"that sucks, i hope you find the finances to be...",0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147842,in what way is banning someone from using your...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
147843,you would be floored to hear how many times i ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147844,what does obamacare have to do with that? name...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147845,that is not the typical sexual dynamic in west...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:

# Prepare datasets: the text column is the model input, every other column is a label.
X_train, y_train = df_train["cleaned_text"].values, df_train.drop(columns=['cleaned_text']).values
X_val, y_val = validation_df["cleaned_text"].values, validation_df.drop(columns=['cleaned_text']).values
X_test, y_test = test_df["cleaned_text"].values, test_df.drop(columns=['cleaned_text']).values

# Tokenization and Padding
max_words = 3000  # vocabulary size handed to the tokenizer (and later to the embedding)
max_len = 100     # every sequence is padded / truncated to this many tokens

# The tokenizer is fitted on the training texts only, then reused for all splits.
tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences, one list per split.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Id sequences -> fixed-width matrices (pad_sequences defaults are used for padding side).
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(split_seqs, maxlen=max_len)
    for split_seqs in (X_train_seq, X_val_seq, X_test_seq)
)

# Build LSTM Model: embedding -> two stacked bidirectional LSTMs (with dropout) -> sigmoid head
# with one output unit per label column.
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train Model
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    X_train_pad,
    y_train,
    epochs=20,
    batch_size=128,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
)

# Evaluate Model
def _threshold_predictions(padded_inputs):
    """Run the model and turn its outputs into 0/1 labels (1 where the output exceeds 0.5)."""
    return (model.predict(padded_inputs) > 0.5).astype(int)


y_train_pred = _threshold_predictions(X_train_pad)
y_val_pred = _threshold_predictions(X_val_pad)
y_test_pred = _threshold_predictions(X_test_pad)

# One classification report per split, printed in train / validation / test order.
for _split_name, _truth, _predicted in (
    ("Training", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{_split_name} Classification Report:\n", classification_report(_truth, _predicted))

# Confusion Matrix and Hamming Loss
# The targets are multi-label indicator matrices. Taking argmax would keep only
# the first active label of each row and would count an all-zero prediction as
# class 0, so one 2x2 matrix ([[TN, FP], [FN, TP]]) is computed per label instead.
from sklearn.metrics import multilabel_confusion_matrix

conf_matrix = multilabel_confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", conf_matrix)
print("Hamming Loss:", hamming_loss(y_test, y_test_pred))

# ROC Curve
def plot_roc_curve(y_true, y_pred):
    """Plot one ROC curve per column of the inputs on a single figure.

    Args:
        y_true: 2-D array indexed as ``[:, class]`` with the true labels.
        y_pred: 2-D array indexed the same way; each column is passed to
            ``roc_curve`` as the score for that class.
    """
    n_classes = y_true.shape[1]

    # Compute every curve (and its area) before any figure is created.
    curves = []
    for cls in range(n_classes):
        false_pos, true_pos, _thresholds = roc_curve(y_true[:, cls], y_pred[:, cls])
        curves.append((false_pos, true_pos, auc(false_pos, true_pos)))

    plt.figure()
    for cls, (false_pos, true_pos, area) in enumerate(curves):
        plt.plot(false_pos, true_pos, label=f'ROC curve of class {cls} (area = {area:.2f})')
    # Dashed diagonal = reference line for a chance-level classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()

# roc_curve sweeps a threshold over continuous scores. Feeding it the already
# thresholded 0/1 predictions leaves a single operating point per class, so the
# raw sigmoid outputs are passed instead.
y_test_scores = model.predict(X_test_pad)
plot_roc_curve(y_test, y_test_scores)

# Hyperparameter Tuning (Optional)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 169/1137 [===>..........................] - ETA: 21:21 - loss: 0.1290 - accuracy: 0.3934

In [None]:

# Prediction Function
def predict_samples(text_samples, model, tokenizer, max_len):
    """Predict label names for raw text samples.

    Args:
        text_samples: list of texts to classify.
        model: object whose ``predict`` returns one score per label for each row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_len: length the token sequences are padded / truncated to.

    Returns:
        DataFrame with a "Text" column and an "Emotions" column holding, per
        sample, the names of all labels whose score exceeded 0.5.

    Note:
        Label names are looked up in the module-level ``emotions`` sequence,
        which must be defined before this function is called.
    """
    padded = pad_sequences(tokenizer.texts_to_sequences(text_samples), maxlen=max_len)
    binary_labels = (model.predict(padded) > 0.5).astype(int)
    decoded = [
        [emotions[col] for col in range(len(row)) if row[col] == 1]
        for row in binary_labels
    ]
    return pd.DataFrame({"Text": text_samples, "Emotions": decoded})

# Predict new samples
# predict_samples resolves label names through the module-level `emotions`
# sequence, which is not defined anywhere in the visible notebook. Derive it
# from the label columns, in the same order used to build y_train.
emotions = df_train.drop(columns=['cleaned_text']).columns.tolist()

test_text = df_train.sample(5)["cleaned_text"].values.tolist()
print(predict_samples(test_text, model, tokenizer, max_len))


# **Improved Version With Attention**

In [None]:
!pip install tensorflow
!pip install sklearn
!pip install matplotlib


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, hamming_loss, roc_curve, auc
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import matplotlib.pyplot as plt

# Load datasets
train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/train_dataset.csv'
validation_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/validation_dataset.csv'
test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Thesis_Work/Thesis-II/DataFiles/full_dataset/test_dataset.csv'

df_train = pd.read_csv(train_dataset_path)
validation_df = pd.read_csv(validation_dataset_path)
test_df = pd.read_csv(test_dataset_path)

# Drop the 'emotion' column (errors='ignore' makes this a no-op when the column is absent)
df_train = df_train.drop(columns=['emotion'], errors='ignore')
validation_df = validation_df.drop(columns=['emotion'], errors='ignore')
test_df = test_df.drop(columns=['emotion'], errors='ignore')

# Removing samples with only 0 in their labels.
# Every column after the first is summed per row and the row is kept when the
# total is positive. DataFrame.sum(axis=1) does this in one vectorised pass
# instead of calling a Python lambda once per row.
df_train = df_train.loc[df_train.iloc[:, 1:].sum(axis=1) > 0]
validation_df = validation_df.loc[validation_df.iloc[:, 1:].sum(axis=1) > 0]
test_df = test_df.loc[test_df.iloc[:, 1:].sum(axis=1) > 0]


# Prepare datasets: the text column is the model input, every other column is a label.
X_train, y_train = df_train["cleaned_text"].values, df_train.drop(columns=['cleaned_text']).values
X_val, y_val = validation_df["cleaned_text"].values, validation_df.drop(columns=['cleaned_text']).values
X_test, y_test = test_df["cleaned_text"].values, test_df.drop(columns=['cleaned_text']).values

# Tokenization and Padding
max_words = 3000  # vocabulary size shared by the tokenizer and the TF-IDF vectorizer
max_len = 100     # every sequence is padded / truncated to this many tokens

# The tokenizer is fitted on the training texts only, then reused for all splits.
tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences, one list per split.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Id sequences -> fixed-width matrices (pad_sequences defaults are used for padding side).
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(split_seqs, maxlen=max_len)
    for split_seqs in (X_train_seq, X_val_seq, X_test_seq)
)

# TF-IDF Vectorization: fitted on the training texts, applied unchanged to the other splits.
vectorizer = TfidfVectorizer(max_features=max_words, max_df=0.85)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split_texts) for split_texts in (X_val, X_test)
)

# Function to get top 10 important words
def get_top_tfidf_words(tfidf_vectorizer, response, top_n=10):
    """Return the ``top_n`` highest-weighted terms stored in a sparse TF-IDF row.

    Args:
        tfidf_vectorizer: object exposing ``get_feature_names_out()``.
        response: sparse matrix exposing ``data`` (stored weights) and
            ``indices`` (the vocabulary column of each stored weight).
        top_n: maximum number of terms to return.

    Returns:
        Array of feature names ordered from highest to lowest weight; shorter
        than ``top_n`` when fewer weights are stored.
    """
    # argsort ranks positions *within* response.data, not vocabulary columns.
    best_positions = np.argsort(response.data)[::-1][:top_n]
    # Map those positions to the vocabulary columns they belong to before
    # looking up names; indexing the names with the raw positions would pick
    # unrelated words.
    best_columns = response.indices[best_positions]
    feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())
    return feature_names[best_columns]

# Get top 10 important words for each document
def _top_words_per_document(tfidf_matrix):
    """Apply get_top_tfidf_words to every row of a TF-IDF matrix, in row order."""
    n_docs = tfidf_matrix.shape[0]
    return [get_top_tfidf_words(vectorizer, tfidf_matrix[doc]) for doc in range(n_docs)]


top_words_train = _top_words_per_document(X_train_tfidf)
top_words_val = _top_words_per_document(X_val_tfidf)
top_words_test = _top_words_per_document(X_test_tfidf)

# Create attention mask based on top words
def create_attention_mask(tokenizer, sequences, top_words, padding='pre', truncating='pre'):
    """Build a 0/1 mask marking the tokens that are among each document's top words.

    Args:
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> token id).
        sequences: iterable of token-id sequences, one per document.
        top_words: iterable of word collections, aligned with ``sequences``.
        padding: side on which short masks are padded.
        truncating: side from which long masks are cut.

    Returns:
        Integer array with ``max_len`` columns (module-level constant); an
        entry is 1 where the token id belongs to one of the document's top words.

    The padding and truncating defaults are ``'pre'`` because the token
    sequences are padded with ``pad_sequences`` defaults, which are ``'pre'``.
    Padding the masks on the other side would shift every mask entry away
    from the token it describes.
    """
    attention_masks = []
    for seq, words in zip(sequences, top_words):
        # Words the tokenizer never saw have no id and are skipped.
        top_ids = [tokenizer.word_index[word] for word in words if word in tokenizer.word_index]
        attention_masks.append(np.isin(seq, top_ids).astype('int32'))
    return pad_sequences(attention_masks, maxlen=max_len, padding=padding, truncating=truncating, value=0)

# One mask matrix per split, built from that split's token sequences and top words.
train_attention_masks, val_attention_masks, test_attention_masks = (
    create_attention_mask(tokenizer, split_seqs, split_top_words)
    for split_seqs, split_top_words in (
        (X_train_seq, top_words_train),
        (X_val_seq, top_words_val),
        (X_test_seq, top_words_test),
    )
)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, Layer, Multiply
from tensorflow.keras.callbacks import EarlyStopping

# Attention Layer using TF-IDF scores
class AttentionLayer(Layer):
    """Attention pooling over the time axis, optionally restricted by a 0/1 mask.

    The layer can be called either as ``layer(features, mask=mask)`` or as
    ``layer([features, mask])``. ``features`` is indexed ``(batch, time, dim)``;
    ``mask`` is ``(batch, time)`` or ``(batch, time, 1)`` and is non-zero at
    the time steps that may receive attention.

    Returns a ``(batch, dim)`` tensor: the attention-weighted sum of
    ``features`` over the time axis.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # When the layer is called on [features, mask], Keras passes one shape
        # per input; the weights must be sized from the features' shape only.
        features_shape = input_shape
        if isinstance(input_shape, (list, tuple)) and len(input_shape) > 0:
            if isinstance(input_shape[0], (list, tuple, tf.TensorShape)):
                features_shape = input_shape[0]
        feature_dim = int(features_shape[-1])

        self.W = self.add_weight(name="att_weight", shape=(feature_dim, feature_dim), initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(feature_dim,), initializer="zeros", trainable=True)
        self.u = self.add_weight(name="att_u", shape=(feature_dim, 1), initializer="glorot_uniform", trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # Accept the [features, mask] list form used by the functional model.
        if isinstance(x, (list, tuple)):
            if len(x) > 1:
                mask = x[1]
            x = x[0]

        # Compute attention scores: one scalar per time step -> (batch, time, 1)
        u_t = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        a_t = tf.tensordot(u_t, self.u, axes=1)
        a_t = tf.nn.softmax(a_t, axis=1)

        if mask is not None:
            # Apply the mask. A (batch, time) mask needs a trailing axis so it
            # lines up with the (batch, time, 1) weights instead of
            # broadcasting across the wrong dimensions.
            mask = tf.cast(mask, a_t.dtype)
            if len(mask.shape) == 2:
                mask = tf.expand_dims(mask, axis=-1)
            masked = a_t * mask
            total = tf.reduce_sum(masked, axis=1, keepdims=True)
            # Renormalise over the kept steps. A row whose mask is all zeros
            # would be 0/0 = NaN, so such rows keep the unmasked softmax.
            a_t = tf.where(total > 0, tf.math.divide_no_nan(masked, total), a_t)

        # Compute context vector
        return tf.reduce_sum(x * a_t, axis=1)

# Build LSTM Model with Attention
# Two inputs: the padded token ids and the per-token attention mask.
input_layer = Input(shape=(max_len,))
mask_input = Input(shape=(max_len,))

embedding_layer = Embedding(
    input_dim=max_words,
    output_dim=128,
    input_length=max_len,
)(input_layer)
lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
dropout_layer = Dropout(0.5)(lstm_layer)

# The attention layer receives the sequence features together with the mask.
attention_layer = AttentionLayer()([dropout_layer, mask_input])

dense_layer = Dense(64, activation='relu')(attention_layer)
dropout_layer_2 = Dropout(0.5)(dense_layer)
# One sigmoid unit per label column.
output_layer = Dense(y_train.shape[1], activation='sigmoid')(dropout_layer_2)

model = Model(inputs=[input_layer, mask_input], outputs=output_layer)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train Model
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    [X_train_pad, train_attention_masks],
    y_train,
    epochs=20,
    batch_size=128,
    validation_data=([X_val_pad, val_attention_masks], y_val),
    callbacks=[early_stopping],
)

# Evaluate Model
def _threshold_predictions(padded_inputs, attention_masks):
    """Run the two-input model and turn its outputs into 0/1 labels (1 where the output exceeds 0.5)."""
    return (model.predict([padded_inputs, attention_masks]) > 0.5).astype(int)


y_train_pred = _threshold_predictions(X_train_pad, train_attention_masks)
y_val_pred = _threshold_predictions(X_val_pad, val_attention_masks)
y_test_pred = _threshold_predictions(X_test_pad, test_attention_masks)

# One classification report per split, printed in train / validation / test order.
for _split_name, _truth, _predicted in (
    ("Training", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{_split_name} Classification Report:\n", classification_report(_truth, _predicted))

# Confusion Matrix and Hamming Loss
# The targets are multi-label indicator matrices. Taking argmax would keep only
# the first active label of each row and would count an all-zero prediction as
# class 0, so one 2x2 matrix ([[TN, FP], [FN, TP]]) is computed per label instead.
from sklearn.metrics import multilabel_confusion_matrix

conf_matrix = multilabel_confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", conf_matrix)
print("Hamming Loss:", hamming_loss(y_test, y_test_pred))

# ROC Curve
def plot_roc_curve(y_true, y_pred):
    """Plot one ROC curve per column of the inputs on a single figure.

    Args:
        y_true: 2-D array indexed as ``[:, class]`` with the true labels.
        y_pred: 2-D array indexed the same way; each column is passed to
            ``roc_curve`` as the score for that class.
    """
    class_ids = range(y_true.shape[1])

    # (fpr, tpr) pair and area under the curve for every class, computed before plotting.
    rates = {c: roc_curve(y_true[:, c], y_pred[:, c])[:2] for c in class_ids}
    areas = {c: auc(rates[c][0], rates[c][1]) for c in class_ids}

    plt.figure()
    for c in class_ids:
        class_fpr, class_tpr = rates[c]
        plt.plot(class_fpr, class_tpr, label=f'ROC curve of class {c} (area = {areas[c]:.2f})')
    # Dashed diagonal = reference line for a chance-level classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()

# roc_curve sweeps a threshold over continuous scores. Feeding it the already
# thresholded 0/1 predictions leaves a single operating point per class, so the
# raw sigmoid outputs are passed instead.
y_test_scores = model.predict([X_test_pad, test_attention_masks])
plot_roc_curve(y_test, y_test_scores)

# Prediction Function
def predict_samples(text_samples, model, tokenizer, max_len):
    """Predict label names for raw text samples with the two-input attention model.

    Args:
        text_samples: list of texts to classify.
        model: object whose ``predict`` takes ``[padded_ids, attention_masks]``
            and returns one score per label for each row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_len: length the token sequences are padded / truncated to.

    Returns:
        DataFrame with a "Text" column and an "Emotions" column holding, per
        sample, the names of all labels whose score exceeded 0.5.

    Note:
        Relies on the module-level ``vectorizer``, ``get_top_tfidf_words``,
        ``create_attention_mask`` and ``emotions`` names being defined.
    """
    token_ids = tokenizer.texts_to_sequences(text_samples)
    padded_ids = pad_sequences(token_ids, maxlen=max_len)

    # Rebuild the attention masks the same way as for the training data.
    tfidf_rows = vectorizer.transform(text_samples)
    top_words = [
        get_top_tfidf_words(vectorizer, tfidf_rows[row])
        for row in range(tfidf_rows.shape[0])
    ]
    masks = create_attention_mask(tokenizer, token_ids, top_words)

    binary_labels = (model.predict([padded_ids, masks]) > 0.5).astype(int)
    decoded = [
        [emotions[col] for col in range(len(row)) if row[col] == 1]
        for row in binary_labels
    ]
    return pd.DataFrame({"Text": text_samples, "Emotions": decoded})

# Predict new samples
# predict_samples resolves label names through the module-level `emotions`
# sequence, which is not defined anywhere in the visible notebook. Derive it
# from the label columns, in the same order used to build y_train.
emotions = df_train.drop(columns=['cleaned_text']).columns.tolist()

test_text = df_train.sample(5)["cleaned_text"].values.tolist()
print(predict_samples(test_text, model, tokenizer, max_len))
