In [2]:
import json

def extract_emotion_cause_pairs(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    emotion_cause_pairs = []
    for conv in data:
        for pair in conv['emotion-cause_pairs']:
            emotion, cause = pair[0].split('_'), pair[1]
            emotion = emotion[1]  # Extract the emotion from the emotion-cause pair
            emotion_cause_pairs.append({'emotion': emotion, 'cause': cause})
    
    with open(output_file, 'w') as f:
        json.dump(emotion_cause_pairs, f, indent=4)

input_file = '/kaggle/input/train12345678/Subtask_1_train.json'
output_file = 'emotion_cause_pairs.json'
extract_emotion_cause_pairs(input_file, output_file)


# Emotion Identification

In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Layer, Input, Dot, Concatenate
from tensorflow.keras.initializers import Constant
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import json
from collections import Counter 

# Load GloVe word embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Define custom attention layer
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        et = tf.keras.backend.squeeze(tf.keras.backend.tanh(tf.keras.backend.dot(x, self.W) + self.b), axis=-1)
        at = tf.keras.backend.softmax(et, axis=-1)
        at = tf.keras.backend.expand_dims(at, axis=-1)
        output = x * at
        return tf.keras.backend.sum(output, axis=1)

# Load emotion-cause pairs from JSON file
def load_emotion_cause_pairs(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    emotions = [pair['emotion'] for pair in data]
    causes = [pair['cause'] for pair in data]
    return emotions, causes

# Load emotion-cause pairs from JSON file
emotion_cause_file_path = "emotion_cause_pairs.json"
emotions, causes = load_emotion_cause_pairs(emotion_cause_file_path)

emotion_counts = Counter(emotions)

# Print number of samples for each emotion before SMOTE
print("Before SMOTE:")
for emotion, count in emotion_counts.items():
    print(f"{emotion}: {count} samples")

# Encode emotions
label_to_int = {'joy': 0,'sadness': 1,'anger': 2,'neutral': 3,'surprise': 4,'disgust': 5,'fear': 6}
encoded_emotions = np.array([label_to_int[emotion] for emotion in emotions])

# Tokenize causes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(causes)
sequences = tokenizer.texts_to_sequences(causes)

# Pad sequences to ensure uniform length
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Convert data to numpy arrays
X = np.array(padded_sequences)
y = np.array(encoded_emotions)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Calculate and print the number of samples for each emotion after SMOTE
resampled_emotion_counts = Counter(y_resampled)
print("\nAfter SMOTE:")
for emotion_idx, count in resampled_emotion_counts.items():
    emotion = [key for key, value in label_to_int.items() if value == emotion_idx][0]
    print(f"{emotion}: {count} samples")

# Load pre-trained GloVe embeddings
glove_embeddings_index = load_glove_embeddings('/kaggle/input/glove-embeddings/glove.6B.300d.txt')

# Create embedding matrix
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = glove_embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Further split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define input layer
inputs = Input(shape=(max_len,))

# Embedding layer with GloVe embeddings
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_len,
                            trainable=False)(inputs)

# Bidirectional LSTM layer
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)

# Apply attention
attention_output = AttentionLayer()(lstm_layer)

# Dense layer for classification
output = Dense(7, activation='softmax')(attention_output)  # Change units to 7 for 7 classes

# Define model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])  # Change loss function

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Classification report on test data
y_pred_test = model.predict(X_test)
y_pred_classes_test = np.argmax(y_pred_test, axis=1)
print("Classification Report on Test Data:")
print(classification_report(y_test, y_pred_classes_test, zero_division=1))



Before SMOTE:
surprise: 2185 samples
anger: 2130 samples
sadness: 1443 samples
joy: 2760 samples
disgust: 534 samples
fear: 312 samples

After SMOTE:
surprise: 2760 samples
anger: 2760 samples
sadness: 2760 samples
joy: 2760 samples
disgust: 2760 samples
fear: 2760 samples
Epoch 1/20




[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 59ms/step - accuracy: 0.2331 - loss: 1.7736 - val_accuracy: 0.3460 - val_loss: 1.5742
Epoch 2/20
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 57ms/step - accuracy: 0.3654 - loss: 1.5355 - val_accuracy: 0.3842 - val_loss: 1.5160
Epoch 3/20
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 60ms/step - accuracy: 0.4124 - loss: 1.4430 - val_accuracy: 0.3996 - val_loss: 1.4937
Epoch 4/20
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 59ms/step - accuracy: 0.4539 - loss: 1.3848 - val_accuracy: 0.4109 - val_loss: 1.4862
Epoch 5/20
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 56ms/step - accuracy: 0.4740 - loss: 1.3353 - val_accuracy: 0.4211 - val_loss: 1.4887
Epoch 6/20
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 59ms/step - accuracy: 0.5132 - loss: 1.2501 - val_accuracy: 0.4200 - val_loss: 1.4798
Epoch 7/20
[1m332/332[0m 

In [18]:
# Classification report on test data with causes
print("Classification Report on Test Data:")
for pred_class, true_class, cause in zip(y_pred_classes_test, y_test, tokenizer.sequences_to_texts(X_test)):
    true_emotion = [key for key, value in label_to_int.items() if value == true_class][0]
    pred_emotion = [key for key, value in label_to_int.items() if value == pred_class][0]
    print(f"Cause: {cause}\nTrue Emotion: {true_emotion}\tPredicted Emotion: {pred_emotion}\n")


Classification Report on Test Data:
Cause: 7 i wish i did not have to move
True Emotion: surprise	Predicted Emotion: sadness

Cause: 11 yeah
True Emotion: surprise	Predicted Emotion: surprise

Cause: 6 i won it fair and square
True Emotion: anger	Predicted Emotion: sadness

Cause: 13 why do not we all go get something to eat
True Emotion: joy	Predicted Emotion: anger

Cause: gonna picked know 6 good
True Emotion: sadness	Predicted Emotion: disgust

Cause: 2 nice sidestep on the do do thing
True Emotion: joy	Predicted Emotion: joy

Cause: be that 11 finish like hear does be worried wait waiter us 14
True Emotion: disgust	Predicted Emotion: disgust

Cause: 1 kathy kathy hi kathy kathy
True Emotion: joy	Predicted Emotion: surprise

Cause: 2 you are right it is a ridiculous name
True Emotion: sadness	Predicted Emotion: disgust

Cause: have just go it centimeters
True Emotion: sadness	Predicted Emotion: sadness

Cause: 6 what do you got
True Emotion: surprise	Predicted Emotion: surprise

Ca

In [19]:
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_classes_test)

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[360  49  42 100  22   9]
 [ 39 246  75  52  98  74]
 [ 43  74 237  86  78  36]
 [120  66  79 165  61  43]
 [ 27  68  53  57 223 105]
 [ 17  75  32  38 138 225]]


# causal-model

In [32]:
import numpy as np

# Load GloVe embeddings into memory
def load_glove_vectors(glove_file):
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_vectors[word] = vector
    return word_vectors

# Function to embed text using GloVe
def embed_text_with_glove(text, word_vectors):
    words = text.split()
    vectors = [word_vectors[word] for word in words if word in word_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(len(next(iter(word_vectors.values()))))  # return zero vector if no words found in GloVe

glove_file_path = '/kaggle/input/glove-embeddings/glove.6B.300d.txt'
glove_vectors = load_glove_vectors(glove_file_path)

# Preprocess data and embed text using GloVe
X_embedded = []
y = []

for conversation in data:
    utterance_ids_in_pairs = [pair[1].split('_')[0] for pair in conversation['emotion-cause_pairs']]
    for utterance in conversation['conversation']:
        utterance_id = str(utterance['utterance_ID'])
        text = utterance['text']
        label = 1 if utterance_id in utterance_ids_in_pairs else 0
        embedded_text = embed_text_with_glove(text, glove_vectors)
        X_embedded.append(embedded_text)
        y.append(label)

# Convert lists to numpy arrays
X_embedded = np.array(X_embedded)
y = np.array(y)

print(X_embedded)
print(y)

np.save('X_embedded.npy', X_embedded)
np.save('y.npy', y)

[[-0.05166728  0.01978239  0.08015028 ... -0.15271837 -0.05106007
   0.14995442]
 [-0.09447813  0.06604666 -0.01660517 ... -0.16656166 -0.10229301
   0.30511767]
 [-0.21591417 -0.00461309 -0.00910984 ... -0.14511292 -0.05147767
   0.16813822]
 ...
 [-0.08328585  0.1775775  -0.1382305  ... -0.412725    0.06362925
   0.24444999]
 [-0.05025455  0.10541382  0.05623436 ... -0.20307216  0.12311244
   0.10726354]
 [-0.00326    -0.12505667  0.11440733 ... -0.11017466 -0.03874334
   0.35833466]]
[1 0 1 ... 0 1 0]


In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load the data
X_embedded = np.load('X_embedded.npy')
y = np.load('y.npy')

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X_embedded, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Predict probabilities on the test set
y_pred_proba = model.predict(X_test)

# Threshold probabilities to obtain predicted classes
y_pred = (y_pred_proba > 0.5).astype(int)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

Epoch 1/10


  ):


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5385 - loss: 0.6828 - val_accuracy: 0.6177 - val_loss: 0.6341
Epoch 2/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6032 - loss: 0.6477 - val_accuracy: 0.6559 - val_loss: 0.6233
Epoch 3/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6251 - loss: 0.6336 - val_accuracy: 0.6564 - val_loss: 0.6175
Epoch 4/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6333 - loss: 0.6366 - val_accuracy: 0.6608 - val_loss: 0.6150
Epoch 5/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6404 - loss: 0.6349 - val_accuracy: 0.6676 - val_loss: 0.6123
Epoch 6/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6501 - loss: 0.6278 - val_accuracy: 0.6608 - val_loss: 0.6150
Epoch 7/10
[1m298/298[0m [32m━━━━━━━