In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


from keras.models import Sequential
from keras.layers import Dense, Embedding,LSTM, SpatialDropout1D

In [32]:
twt = pd.read_csv('Tweets.csv')
twt.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)


In [33]:
twt.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [34]:
twt.query("airline_sentiment_confidence < 0.5").head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
114,569861209781989377,positive,0.3482,,0.0,Virgin America,,AdamJdubs,,0,@VirginAmerica come back to #PHL already. We n...,,2015-02-23 06:07:54 -0800,Earth,Eastern Time (US & Canada)


In [35]:
twt = twt.query("airline_sentiment_confidence > 0.8")

In [36]:
tokenizer = Tokenizer(num_words=300)
tokenizer.fit_on_texts(twt['text'])

X = tokenizer.texts_to_sequences(twt['text'])

for i in range(0,4):
    print(X[i])

[97, 62, 229]
[97, 99, 131, 1, 15, 20, 56, 57, 23]
[97, 9, 99, 4, 131, 200, 84, 21]
[97, 90, 270, 203, 4, 7, 5, 201, 28, 211, 23, 29, 99, 131, 2, 126, 200, 84, 134]


In [37]:
words = tokenizer.word_index
list(words.items())[0:5]

[('to', 1), ('the', 2), ('i', 3), ('a', 4), ('for', 5)]

In [38]:
len(tokenizer.word_index)

12802

In [39]:
X = pad_sequences(X,maxlen=300,padding='post')

for i in range(0,1):
    print(X[i])

[ 97  62 229   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

In [40]:
X.shape

(10459, 300)

In [41]:
print(X)

[[ 97  62 229 ...   0   0   0]
 [ 97  99 131 ...   0   0   0]
 [ 97   9  99 ...   0   0   0]
 ...
 [ 13  98 294 ...   0   0   0]
 [ 13  89 265 ...   0   0   0]
 [ 13   6  23 ...   0   0   0]]


In [42]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(twt['airline_sentiment'])
y.max()


2

In [43]:
y = to_categorical(y,num_classes=3)
y

# Alternative way !
# y2 = pd.get_dummies(twt.airline_sentiment).to_numpy()

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [44]:
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index),output_dim=128,input_shape=(300,))) 
# input_dim = Size of the vocabulary
# output_dim = Dimension of the dense embedding
# In Keras3 implementation the parameter input_length is removed as it is redundant as you can use input_shape as kwarg. 
# Even if you won't provide input_shape the API can infer it from the input being passed to model.
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196,dropout=0.2,recurrent_dropout=0,activation='tanh',recurrent_activation='sigmoid',unroll=False,use_bias=True))
model.add(Dense(units=3,activation="softmax"))

  super().__init__(**kwargs)


In [45]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

None


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [47]:
model.fit(x=X_train,
    y=y_train,
    batch_size=30,
    epochs=20,
    validation_data=(X_test,y_test),
    verbose=True)

Epoch 1/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 214ms/step - accuracy: 0.6878 - loss: 0.8379 - val_accuracy: 0.7136 - val_loss: 0.7985
Epoch 2/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 207ms/step - accuracy: 0.7095 - loss: 0.8055 - val_accuracy: 0.7136 - val_loss: 0.7995
Epoch 3/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 205ms/step - accuracy: 0.7065 - loss: 0.8116 - val_accuracy: 0.7136 - val_loss: 0.7978
Epoch 4/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 208ms/step - accuracy: 0.7113 - loss: 0.8024 - val_accuracy: 0.7136 - val_loss: 0.7995
Epoch 5/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 203ms/step - accuracy: 0.6995 - loss: 0.8211 - val_accuracy: 0.7136 - val_loss: 0.7974
Epoch 6/20
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 203ms/step - accuracy: 0.7063 - loss: 0.8108 - val_accuracy: 0.7136 - val_loss: 0.7978
Epoch 7/20

<keras.src.callbacks.history.History at 0x144ce80bbe0>

In [48]:
loss, acc = model.evaluate(X_test,y_test)

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 64ms/step - accuracy: 0.7157 - loss: 0.7951
