# Links
- https://www.kaggle.com/code/ngyptr/lstm-sentiment-analysis-keras/notebook
- https://peyrone.medium.com/building-an-easy-sentiment-analysis-model-using-keras-89ec3d6308b8
- https://keras.io/examples/nlp/bidirectional_lstm_imdb/

# Imports

In [1]:
import pandas as pd

import numpy as np
import keras

from keras import layers
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Loading the data

In [2]:
# Read the preprocessed tweets (first CSV column is the index) and discard
# every row that has a missing value in any column.
df = (
    pd.read_csv("./data/twitter_training_cleaned_preprocessed.csv", index_col=0)
    .dropna()
)
df.head()

Unnamed: 0,tweet,sentiment,no_sw,no_sw_lem
0,booo,Negative,booo,booo
1,ok hear me out microsoft is going to pull a mi...,Neutral,hear microsoft pull mix ass prove choose xbox ...,hear microsoft pull mix ass prove choose xbox ...
2,hopping on the uzi is pretty good fortunate...,Positive,hopping uzi pretty fortunate time twitchtvjoke65,hopping uzi pretty fortunate time twitchtvjoke65
3,mr christoph sandrock best pubg teammate rescu...,Positive,mr christoph sandrock pubg teammate rescuer cr...,mr christoph sandrock pubg teammate rescuer cr...
4,eamaddennfl what is up with these player ratin...,Negative,eamaddennfl player ratings algorithm wrong,eamaddennfl player ratings algorithm wrong


# Preparing the data

In [3]:
# Binary target: 1.0 marks a "Negative" tweet, 0.0 marks every other sentiment.
sentiment_to_label = {"Positive": 0, "Negative": 1, "Neutral": 0, "Irrelevant": 0}

# Series.map turns any value that is missing from the dict into NaN, which
# would otherwise reach training as a NaN label -- fail loudly instead.
sentiment_label = df["sentiment"].map(sentiment_to_label)
if sentiment_label.isna().any():
    unmapped = sorted(df.loc[sentiment_label.isna(), "sentiment"].astype(str).unique())
    raise ValueError(f"Unmapped sentiment values: {unmapped}")

# Build a new frame (df itself is left untouched) holding only the raw tweet
# text and the float32 label; the original sentiment column and the
# stop-word-free / lemmatized variants are dropped.
dt = (
    df.assign(
        sentiment_label=sentiment_label.astype(np.float32),
        tweet=df["tweet"].astype("string"),
    )
    .drop(columns=["sentiment", "no_sw", "no_sw_lem"])
)
dt.head()

Unnamed: 0,tweet,sentiment_label
0,booo,1.0
1,ok hear me out microsoft is going to pull a mi...,0.0
2,hopping on the uzi is pretty good fortunate...,0.0
3,mr christoph sandrock best pubg teammate rescu...,0.0
4,eamaddennfl what is up with these player ratin...,1.0


In [4]:
# Vocabulary size limit shared by the tokenizer and the embedding layers.
max_features = 20000

# NOTE(review): the vocabulary is fitted on every tweet before the
# train/test split performed in the next cell, so held-out tweets also
# influence it -- confirm this is acceptable.
tweet_texts = dt['tweet'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# Convert each tweet to a list of integer token ids, then pad all of them
# to one common length so they stack into a single 2-D array.
X = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(tweet_texts)
)
y = dt['sentiment_label']

In [5]:
# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=30,
)

# Show (features shape, labels shape) for the train split, then the test split.
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(split_features.shape, split_labels.shape)

(40812, 62) (40812,)
(13605, 62) (13605,)


# Variant: binary target

## Model 1a: functional, bidirectional-LSTM

In [6]:
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model1 = keras.Model(inputs, outputs)
model1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,757,761
Trainable params: 2,757,761
Non-train

In [7]:
# Train the bidirectional model for 3 epochs with Adam and binary
# cross-entropy; the held-out test split is used as validation data.
model1.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model1.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=32,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x28812637c70>

## Model 1b: sequential-LSTM

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

In [15]:
# Hyperparameters for the sequential LSTM model (Model 1b).

# Input side: vocabulary size and padded tweet length fed to the embedding.
MAX_FEATURES = 20_000
MAX_SENTENCE_LENGTH = 62

# Network width: embedding dimension and LSTM hidden units.
EMBEDDING_SIZE = HIDDEN_LAYER_SIZE = 128

# Training schedule.
# NOTE(review): the visible model2.fit call hard-codes batch_size=32 and
# epochs=3 instead of reading these two constants -- confirm which values
# are intended.
BATCH_SIZE = 32
NUM_EPOCHS = 15

In [16]:
# Sequential model: embedding -> single LSTM (with input and recurrent
# dropout of 0.2) -> one sigmoid unit for the binary target.
model2 = Sequential(
    [
        Embedding(MAX_FEATURES, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH),
        LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ]
)

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 62, 128)           2560000   
                                                                 
 lstm_4 (LSTM)               (None, 128)               131584    
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train the sequential LSTM for 3 epochs with Adam and binary
# cross-entropy; the held-out test split is used as validation data.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=32,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2882b18e3e0>

# Variant: multi-class target

## Model 2a: functional, bidirectional-LSTM

In [18]:
# TODO: implement Model 2a -- the functional, bidirectional-LSTM model for the multi-class target.

## Model 2b: sequential-LSTM

In [None]:
# TODO: implement Model 2b -- the sequential-LSTM model for the multi-class target.