# LSTM model with POS

## Preprocessing

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import pandas as pd

# Example DataFrame columns: 
df = pd.read_csv('emotion_data_merged_4_POS_crf.csv')

# Tokenize sentences
sentence_tokenizer = Tokenizer()
sentence_tokenizer.fit_on_texts(df['sentence'])
X_sentences = sentence_tokenizer.texts_to_sequences(df['sentence'])
X_sentences = pad_sequences(X_sentences, maxlen=100)  # Adjust maxlen as needed

# Tokenize POS tags
pos_tokenizer = Tokenizer()
pos_tokenizer.fit_on_texts(df['POS_crf'])
X_pos = pos_tokenizer.texts_to_sequences(df['POS_crf'])
X_pos = pad_sequences(X_pos, maxlen=100)  # Ensure this matches the text sequence length

# Encode labels
y = pd.get_dummies(df['emotion']).values

# Split the dataset
X_sentences_train, X_sentences_test, X_pos_train, X_pos_test, y_train, y_test = train_test_split(X_sentences, X_pos, y, test_size=0.2, random_state=42)


## Building the model

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from tensorflow.keras.optimizers import Adam

# Define inputs
sentence_input = Input(shape=(100,), dtype='int32', name='sentence_input')  # Adjust shape based on your data
pos_input = Input(shape=(100,), dtype='int32', name='pos_input')  # Adjust shape as needed

# Embeddings
sentence_emb = Embedding(input_dim=len(sentence_tokenizer.word_index)+1, output_dim=128, input_length=100)(sentence_input)
pos_emb = Embedding(input_dim=len(pos_tokenizer.word_index)+1, output_dim=64, input_length=100)(pos_input)

# LSTM layers
sentence_lstm = LSTM(64)(sentence_emb)
pos_lstm = LSTM(32)(pos_emb)

# Concatenate the outputs
concatenated = concatenate([sentence_lstm, pos_lstm], axis=-1)

# Add a classifier
output = Dense(len(df['emotion'].unique()), activation='softmax')(concatenated)

# Build the model
model = Model(inputs=[sentence_input, pos_input], outputs=output)

# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sentence_input (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 pos_input (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 128)     10559616    ['sentence_input[0][0]']         
                                                                                                  
 embedding_1 (Embedding)        (None, 100, 64)      6059072     ['pos_input[0][0]']              
                                                                                              

## Training the model

In [3]:
model.fit([X_sentences_train, X_pos_train], y_train, batch_size=32, epochs=10, validation_data=([X_sentences_test, X_pos_test], y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c01f0a48e0>

## Evaluating the model

In [4]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate([X_sentences_test, X_pos_test], y_test)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Test Loss: 0.2122635692358017
Test Accuracy: 0.9616082906723022


## Kaggle dataset

In [11]:
# Load the dataset
kaggle_df = pd.read_csv('kaggle_data_POS_crf.csv', sep= ',')
kaggle_df.head()

Unnamed: 0,id,sentence,POS_crf
0,0,Girls are happy when they get flowers,"[('Girls', 'VB'), ('are', 'DT'), ('happy', 'JJ..."
1,1,His jaw dropped in disbelief when he saw the p...,"[('His', 'NNP'), ('jaw', 'NN'), ('dropped', 'N..."
2,2,Sometimes the ugly stench makes me wanna throw...,"[('Sometimes', 'NNP'), ('the', 'NN'), ('ugly',..."
3,3,The foul odor from the garbage bin was disgust...,"[('The', 'DT'), ('foul', 'JJ'), ('odor', 'NN')..."
4,4,"I can’t believe it, they lost the game in the ...","[('I', 'PRP'), ('can’t', 'VBP'), ('believe', '..."


## Preprocess the sentences and POS tags

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming sentence_tokenizer and pos_tokenizer were trained on the original dataset
X_new_sentences = sentence_tokenizer.texts_to_sequences(kaggle_df['sentence'])
X_new_sentences = pad_sequences(X_new_sentences, maxlen=100)

X_new_pos = pos_tokenizer.texts_to_sequences(kaggle_df['POS_crf'])
X_new_pos = pad_sequences(X_new_pos, maxlen=100)

## Predict labels

In [17]:
import numpy as np

# Predict
predictions = model.predict([X_new_sentences, X_new_pos])
predicted_classes = np.argmax(predictions, axis=1)

# Convert predicted classes back to labels using the mapping from training
unique_labels = df['emotion'].unique()  
label_to_index = {label: index for index, label in enumerate(unique_labels)}
index_to_label = {index: label for label, index in label_to_index.items()}

# Now use this mapping to convert predicted classes back to labels
predicted_labels = [index_to_label[k] for k in predicted_classes]


 1/45 [..............................] - ETA: 1s



In [24]:
# Create a DataFrame with the IDs and predicted labels
results_df = pd.DataFrame({
    'id': kaggle_df['id'],  # Make sure 'id' is the correct column name in your dataset
    'emotion': predicted_labels
})

# Save the DataFrame to a CSV file
results_df.to_csv('LSTM_POS.csv', index=False)


Predictions saved to predicted_labels.csv


In [25]:
results_df['emotion'].value_counts()

emotion
anger        374
surprise     288
sadness      221
fear         213
happiness    176
disgust      164
Name: count, dtype: int64

In [20]:
# Save the model
model.save('LSTM_POS_model.h5')