In [25]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np


In [36]:
# Load the datasets
train_data = pd.read_csv('Train.csv')
mapping_data = pd.read_csv('Mapping.csv')
test_data = pd.read_csv('Test.csv')

train_data.head(), mapping_data.head(), test_data.head()


(   Unnamed: 0                                               TEXT  Label
 0           0  Vacation wasted ! #vacation2017 #photobomb #ti...      0
 1           1  Oh Wynwood, you’re so funny! : @user #Wynwood ...      1
 2           2  Been friends since 7th grade. Look at us now w...      2
 3           3  This is what it looks like when someone loves ...      3
 4           4  RT @user this white family was invited to a Bl...      3,
    Unnamed: 0 emoticons  number
 0           0         😜       0
 1           1         📸       1
 2           2         😍       2
 3           3         😂       3
 4           4         😉       4,
    Unnamed: 0  id                                               TEXT
 0           0   0  Thought this was cool...#Repost (get_repost)・・...
 1           1   1  Happy 4th! Corte madera parade. #everytownusa ...
 2           2   2  Luv. Or at least something close to it. @ Unio...
 3           3   3  There's a slice of pie under that whipped crea...
 4          

In [29]:
# Parameters
vocab_size = 10000  # Maximum number of words in the vocabulary
embedding_dim = 100  # Size of the word embeddings
max_length = 100  # Maximum length of the input sequences
padding_type = 'post'
truncating_type = 'post'
oov_token = "<OOV>"  # Token for out of vocabulary words

# Initialize the tokenizer and fit on training texts
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_data['TEXT'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(train_data['TEXT'])

# Pad sequences to ensure uniform length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)

# Check the padded sequences
padded_sequences[:5]


array([[ 655, 6434,    1, 5442, 1773,    1,    1,  151, 1419,  314,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [ 214, 1364, 1192,   23,  633,    3, 1364,  200,    1, 1365,  510,
           1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [31]:
# Get labels from train data
labels = train_data['Label'].values

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Check the shape of train and validation sets
X_train.shape, X_val.shape


((56000, 100), (14000, 100))

In [33]:
# Build the sequential model
model = Sequential()

# Add an embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))

# Add an LSTM layer
model.add(LSTM(128, return_sequences=False))

# Add a dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Add a dense layer
model.add(Dense(64, activation='relu'))

# Add the final output layer (softmax for multi-class classification)
model.add(Dense(len(mapping_data), activation='softmax'))  # Output size matches the number of emoji labels

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




In [35]:
# Train the model on the training data
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)


Epoch 1/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 83ms/step - accuracy: 0.2064 - loss: 2.7740 - val_accuracy: 0.2178 - val_loss: 2.7347
Epoch 2/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 82ms/step - accuracy: 0.2137 - loss: 2.7438 - val_accuracy: 0.2178 - val_loss: 2.7341
Epoch 3/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 84ms/step - accuracy: 0.2146 - loss: 2.7399 - val_accuracy: 0.2178 - val_loss: 2.7336
Epoch 4/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 85ms/step - accuracy: 0.2164 - loss: 2.7384 - val_accuracy: 0.2178 - val_loss: 2.7336
Epoch 5/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 85ms/step - accuracy: 0.2165 - loss: 2.7358 - val_accuracy: 0.2178 - val_loss: 2.7312
Epoch 6/10
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 85ms/step - accuracy: 0.2169 - loss: 2.7319 - val_accuracy: 0.2178 - val_loss: 2.731

In [39]:
# Convert test text to sequences and pad them
test_sequences = tokenizer.texts_to_sequences(test_data['TEXT'])
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)

# Make predictions on the test data
predictions = model.predict(test_padded)

# Get the predicted emoji labels
predicted_labels = np.argmax(predictions, axis=1)

# Map the predicted labels back to emojis using mapping_data
predicted_emojis = mapping_data['emoticons'][predicted_labels]

# Display the first few predicted emojis for the test set
predicted_emojis[:5]


[1m812/812[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 38ms/step


9    ❤
9    ❤
9    ❤
9    ❤
9    ❤
Name: emoticons, dtype: object