In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from google.colab import files
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
tweets = pd.read_csv('Tweets2.csv')

In [5]:
tweets.shape

(74682, 4)

In [6]:
tweets.dtypes

Unnamed: 0,0
id,int64
local,object
sentiment,object
text,object


In [7]:
tweets

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [8]:
tweets.groupby(['sentiment']).size()

Unnamed: 0_level_0,0
sentiment,Unnamed: 1_level_1
Irrelevant,12990
Negative,22542
Neutral,18318
Positive,20832


In [9]:
tweets.loc[tweets['sentiment']=='Irrelevant','sentiment'] = 'Neutral'

In [10]:
tweets = tweets.dropna(subset=['text'])
tweets.reset_index(drop=True, inplace=True)

In [11]:
tweets.shape

(73996, 4)

# Supervisionado

In [12]:
token = Tokenizer(num_words=100)
token.fit_on_texts(tweets['text'].values)

In [13]:
X = token.texts_to_sequences(tweets['text'].values)
X = pad_sequences(X, padding="post", maxlen=100)

In [14]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(tweets['sentiment'])
print(y)

[2 2 2 ... 2 2 2]


In [15]:
y = to_categorical(y)
print(y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)
X_test

array([[10, 21,  0, ...,  0,  0,  0],
       [ 1, 77,  0, ...,  0,  0,  0],
       [27, 79,  0, ...,  0,  0,  0],
       ...,
       [ 2,  5,  4, ...,  0,  0,  0],
       [49, 50,  2, ...,  0,  0,  0],
       [ 2, 22, 33, ...,  0,  0,  0]], dtype=int32)

In [17]:
modelo = Sequential()
modelo.add(Embedding(input_dim= len(token.word_index), output_dim=128))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh',
                recurrent_activation='sigmoid', unroll=False, use_bias=True))
modelo.add(Dense(units=3,activation="softmax"))

In [18]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])


In [19]:
modelo.fit(X_train, y_train, epochs=5, batch_size=500,verbose=True)

Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.4135 - loss: 1.0844
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 45ms/step - accuracy: 0.4218 - loss: 1.0817
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.4245 - loss: 1.0808
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.4185 - loss: 1.0829
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.4178 - loss: 1.0828


<keras.src.callbacks.history.History at 0x7f82d15c0450>

In [20]:
_, accuracy = modelo.evaluate(X_test,y_test)
print("Accuracy: ", accuracy)

[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.4172 - loss: 1.0830
Accuracy:  0.41700732707977295


# Vader

In [30]:
mas = SentimentIntensityAnalyzer()

In [31]:
tweets['vader_sentiment'] = ''

for y in range(len(tweets.index)):
  x = mas.polarity_scores(tweets['text'].iloc[y])
  del x['compound']
  max_sentiment = max(x, key=x.get) #neg pos or neut

  tweets.loc[y, 'vader_sentiment'] = max_sentiment

KeyError: 'text'

In [23]:
tweets.head()

Unnamed: 0,id,local,sentiment,text,vader_sentiment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,neu
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,neu
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,neu
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,neu
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,neu


In [24]:
tweets.groupby(['sentiment']).size()

Unnamed: 0_level_0,0
sentiment,Unnamed: 1_level_1
Negative,22358
Neutral,30983
Positive,20655


In [25]:
tweets.groupby(['vader_sentiment']).size()

Unnamed: 0_level_0,0
vader_sentiment,Unnamed: 1_level_1
neg,3660
neu,65581
pos,4755


In [29]:
print(tweets.columns)

AttributeError: 'Series' object has no attribute 'columns'

In [28]:
tweets.loc[tweets['vader_sentiment'] == 'neg', 'vader_sentiment'] == 'Negative'
tweets.loc[tweets['vader_sentiment'] == 'neu', 'vader_sentiment'] == 'Neutral'
tweets.loc[tweets['vader_sentiment'] == 'pos', 'vader_sentiment'] == 'Positive'

KeyError: 'vader_sentiment'

In [None]:
tweets.groupby(['vader_sentiment']).size()

In [None]:
y_pred = tweets['vader_sentiment']
y_test = tweets['sentiment']

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)