In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from google.colab import files
import numpy as np

In [7]:
tweets = pd.read_csv("Tweets.csv")

In [8]:
tweets

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [9]:
tweets.shape

(14640, 15)

In [10]:
tweets.dtypes

Unnamed: 0,0
tweet_id,int64
airline_sentiment,object
airline_sentiment_confidence,float64
negativereason,object
negativereason_confidence,float64
airline,object
airline_sentiment_gold,object
name,object
negativereason_gold,object
retweet_count,int64


In [11]:
tweets.groupby(['airline_sentiment']).size()

Unnamed: 0_level_0,0
airline_sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


In [12]:
#Filtrar os tweets que possuem uma confiança maior sobre o sentimento
tweets = tweets[tweets['airline_sentiment_confidence'] >= 0.8]

In [13]:
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [14]:
tweets.shape

(10459, 15)

# Pré-processamento

Tokenizando a mensagem

In [15]:
token = Tokenizer(num_words=100)
#cira um vocabulário atribuindo um índice único para cada palavra, com base na frequência de ocorrência. 100 palavras
token.fit_on_texts(tweets['text'].values)
print(token.word_counts)



In [16]:
#converte os textos em sequências numéricas
X = token.texts_to_sequences(tweets['text'].values)

#Todas as sequências são ajustadas para terem exatamente 100 tokens.
#Se um texto for menor, ele recebe zeros no final (padding='post').
#Se um texto for maior, ele será cortado para os primeiros 100 tokens.
X = pad_sequences(X, padding='post', maxlen=100)

In [17]:
print(X)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


Transformando a variável alvo

In [18]:
labelencoder = LabelEncoder()

In [19]:
y = labelencoder.fit_transform(tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


OneHotEncoding

In [20]:
y = to_categorical(y)

In [21]:
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [22]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
token.word_index

{'to': 1,
 'the': 2,
 'i': 3,
 'a': 4,
 'for': 5,
 'you': 6,
 'flight': 7,
 'united': 8,
 'and': 9,
 'on': 10,
 'my': 11,
 'usairways': 12,
 'americanair': 13,
 'is': 14,
 'in': 15,
 'southwestair': 16,
 'of': 17,
 'jetblue': 18,
 'me': 19,
 'your': 20,
 'it': 21,
 'not': 22,
 'have': 23,
 'no': 24,
 'was': 25,
 'with': 26,
 'at': 27,
 'that': 28,
 'this': 29,
 'get': 30,
 'from': 31,
 'cancelled': 32,
 'but': 33,
 'service': 34,
 'be': 35,
 'now': 36,
 'are': 37,
 'an': 38,
 't': 39,
 'co': 40,
 '2': 41,
 'we': 42,
 'been': 43,
 'can': 44,
 'http': 45,
 'thanks': 46,
 'just': 47,
 'customer': 48,
 'so': 49,
 'do': 50,
 'time': 51,
 'hours': 52,
 'help': 53,
 'hold': 54,
 'up': 55,
 'amp': 56,
 'they': 57,
 'us': 58,
 'out': 59,
 'plane': 60,
 'will': 61,
 'what': 62,
 'our': 63,
 'still': 64,
 'why': 65,
 'when': 66,
 'flights': 67,
 'how': 68,
 'delayed': 69,
 "i'm": 70,
 'all': 71,
 'call': 72,
 'hour': 73,
 'one': 74,
 'flightled': 75,
 'thank': 76,
 'gate': 77,
 'bag': 78,
 'if': 

LSTM é a sigla para Long Short-Term Memory, ou seja, **memória de curto e longo prazo**. É um tipo de rede neural recorrente (RNN) que consegue **memorizar informações por longos períodos de tempo**.

In [24]:
model = Sequential()
#transformar tokens numéricos (índices de palavras) em vetores densos de dimensão fixa. Fundamental para captura de relações semânticas
model.add(Embedding(input_dim=len(token.word_index),
                    output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196,
               dropout=0.22,
               recurrent_dropout=0,
               activation='tanh',
               recurrent_activation='sigmoid',
               unroll=False,
               use_bias=True))
model.add(Dense(units=3,
                activation="softmax"))

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
#cada epoch significa a passagem de dados pela rede
model.fit(X_train, y_train, epochs=10, batch_size=30, validation_data=(X_test, y_test), verbose=True)

Epoch 1/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.7003 - loss: 0.8351 - val_accuracy: 0.7192 - val_loss: 0.7953
Epoch 2/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.7038 - loss: 0.8173 - val_accuracy: 0.7192 - val_loss: 0.7890
Epoch 3/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.7022 - loss: 0.8193 - val_accuracy: 0.7192 - val_loss: 0.7959
Epoch 4/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.7123 - loss: 0.8023 - val_accuracy: 0.7192 - val_loss: 0.7933
Epoch 5/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.7065 - loss: 0.8129 - val_accuracy: 0.7192 - val_loss: 0.7895
Epoch 6/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.7004 - loss: 0.8199 - val_accuracy: 0.7192 - val_loss: 0.7884
Epoch 7/10
[1m245/24

<keras.src.callbacks.history.History at 0x7f584bc49550>

In [27]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7126 - loss: 0.7998
Loss:  0.7893103957176208
Accuracy:  0.7192479372024536
