# Imports

In [1]:
import sys
sys.path.insert(0,'..')
%load_ext autoreload
%autoreload 2

In [18]:
from src.utils import model_utils as mod_utils
from src.utils import twitter_utils as tt_utils
from src.models import build_model
import pandas as pd
import swifter
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, GRU
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [10, 5]

# Leitura e tratamento de dados

## Leitura da base para treinamento

In [9]:
df = pd.read_csv("../data/external/Tweets_Mg_2.csv", encoding='utf-8')
df.head(3)

Unnamed: 0.1,Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,Classificacao,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,0,Sun Jan 08 01:22:05 +0000 2017,ÔøΩÔøΩÔøΩ‚õ™ @ Catedral de Santo Ant√¥nio - Governador ...,,,Brasil,Leonardo C Schneider,LeoCSchneider,0,Neutro,...,,,,,,,,,,
1,1,Sun Jan 08 01:49:01 +0000 2017,"ÔøΩ @ Governador Valadares, Minas Gerais https:/...",-41.9333,-18.85,,W√¢ndell,klefnews,0,Neutro,...,,,,,,,,,,
2,2,Sun Jan 08 01:01:46 +0000 2017,"ÔøΩÔøΩ @ Governador Valadares, Minas Gerais https:...",-41.9333,-18.85,,W√¢ndell,klefnews,0,Neutro,...,,,,,,,,,,


In [5]:
df['Classificacao'].value_counts()

Positivo    3300
Neutro      2453
Negativo    2446
Name: Classificacao, dtype: int64

In [6]:
df = df[df['Classificacao']!= 'Neutro']

In [7]:
df['Classificacao'].value_counts()

Positivo    3300
Negativo    2446
Name: Classificacao, dtype: int64

In [10]:
X = df['Text'].values
y = df['Classificacao'].values
X

array(['ÔøΩÔøΩÔøΩ‚õ™ @ Catedral de Santo Ant√¥nio - Governador Valadares/MG https://t.co/JSbKamIqUJ',
       'ÔøΩ @ Governador Valadares, Minas Gerais https://t.co/B3ThIDJCSf',
       'ÔøΩÔøΩ @ Governador Valadares, Minas Gerais https://t.co/dPkgzVR2Qw',
       ...,
       'Trio √© preso suspeito de roubo, tr√°fico e abuso sexual em Uberl√¢ndia https://t.co/zaQbXRRJWc',
       'Trio √© preso suspeito de roubo, tr√°fico e abuso sexual em Uberl√¢ndia: Um dos autores teria molestado v√≠tima de‚Ä¶ https://t.co/lQ8cTSNftA',
       'Trio suspeito de roubo de cargas √© preso em Santa Luzia (MG) https://t.co/0INgJcMtZb #R7MG #RecordTVMinas'],
      dtype=object)

In [11]:
X = np.array([tt_utils.remove_url(tweet) for tweet in X])
X

array(['Catedral de Santo Antnio Governador ValadaresMG',
       'Governador Valadares Minas Gerais',
       'Governador Valadares Minas Gerais', ...,
       'Trio preso suspeito de roubo trfico e abuso sexual em Uberlndia',
       'Trio preso suspeito de roubo trfico e abuso sexual em Uberlndia Um dos autores teria molestado vtima de',
       'Trio suspeito de roubo de cargas preso em Santa Luzia MG R7MG RecordTVMinas'],
      dtype='<U138')

In [12]:
y = np.where(y=='Positivo', 1, 0)

## Utilizando o *tokenizer* nas palavras

In [13]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(X)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_x = t.texts_to_sequences(X)
print(encoded_x)
# pad documents to a max length of 4 words

[[3743, 1, 1883, 3744, 26, 803], [26, 68, 3, 14], [26, 68, 3, 14], [], [1884, 100, 2200, 804, 1, 848, 7, 216, 1, 115, 33, 52, 77, 5, 1, 3], [246, 1128, 3745, 1477, 1129, 1648, 7, 393, 1885, 28, 428, 51, 77, 5, 1, 3], [7, 1130, 40, 194, 3746, 12, 3747, 24, 2736, 3748, 893, 1, 79, 5, 21, 2737, 7, 1649, 1049], [8, 2201, 2202, 11, 338, 2, 1478, 12, 1479, 978, 11, 5, 1, 3], [63, 7, 9, 1, 3, 665, 622, 139, 134, 119], [21, 894, 115, 393, 3749, 169, 175, 3750, 3751, 8, 1480, 11, 5, 1, 6, 21, 136, 2203, 1, 1341, 3752, 3753], [27, 8, 699, 1481, 26, 3754, 979, 1342, 209, 3755, 8, 699, 34, 3756, 2738, 3757, 1482, 25, 200, 12, 2204, 2738], [2739, 7, 2739, 3758, 1886, 1650, 26, 68, 3, 14], [3759, 3760, 1483, 1887, 27, 144, 941, 1, 420, 13, 166, 805, 6, 12, 2740, 1, 157, 12, 26], [170, 170, 2205, 1888, 487, 2206, 12, 110, 1, 513, 5, 1, 3], [170, 170, 2205, 1888, 487, 2206, 12, 110, 1, 513, 5, 1, 3], [187, 1889, 1890, 113, 118, 5, 339, 1891, 1227, 3, 1892, 13, 1343, 10, 1893, 1894, 2741, 2742], [42, 3

In [14]:
max_length = 16
padded_x = pad_sequences(encoded_x, maxlen=max_length, padding='post')
print(padded_x)

[[3743    1 1883 ...    0    0    0]
 [  26   68    3 ...    0    0    0]
 [  26   68    3 ...    0    0    0]
 ...
 [ 408   45   78 ...    0    0    0]
 [  78    1   36 ... 7266 2063    1]
 [ 408   78    1 ...    0    0    0]]


## Leitura do embedding j√° treinado
Link: http://nilc.icmc.usp.br/embeddings

In [15]:
%%time
# load the whole embedding into memory
embeddings_index = mod_utils.load_embedding()

Loading Word2Vec Model...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 41.5 s, sys: 456 ms, total: 42 s
Wall time: 42.1 s


In [16]:
%%time
# create a weight matrix for words in training docs
embedding_matrix = mod_utils.create_embedding_matrix(embeddings_index, vocab_size, t)

CPU times: user 16.2 ms, sys: 5 ¬µs, total: 16.2 ms
Wall time: 15.9 ms


# Modelagem

## Kfold cross-validation

In [24]:
estimator = KerasClassifier(build_fn=build_model.baseline_model, epochs=20, batch_size=512, verbose=1)

In [25]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [26]:
results = cross_val_score(estimator, padded_x, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 8)             58136     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 16, 100)           2500      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 8, 100)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               200250    
_________________________________________________________________
dropout_1 (Dropout)  

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 16, 8)             58136     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 16, 100)           2500      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 8, 100)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               200250    
___________________________________________

Epoch 18/20
Epoch 19/20
Epoch 20/20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 16, 8)             58136     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 16, 100)           2500      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 8, 100)            0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 250)               200250    
_________________________________________________________________
dropout_6 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)  

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 16, 8)             58136     
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 16, 100)           2500      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 8, 100)            0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 250)               200250    
___________________________________________

Epoch 18/20
Epoch 19/20
Epoch 20/20
Baseline: 92.12% (1.16%)


## Treinando o Modelo

In [85]:
# define model
model = Sequential()
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(GRU(units=64, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_x, y, epochs=50, batch_size=128, validation_split=0.2, verbose=1)
# evaluate the model
loss, accuracy = model.evaluate(padded_x, y, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 16, 50)            363350    
_________________________________________________________________
gru_8 (GRU)                  (None, 64)                22080     
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 65        
Total params: 385,495
Trainable params: 22,145
Non-trainable params: 363,350
_________________________________________________________________
None
Train on 6559 samples, validate on 1640 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epo

## Avalia√ß√£o no Treino

In [86]:
y_pred_prob = model.predict(padded_x)
y_pred_bin = np.where(y_pred_prob>=0.5, 1, 0)

In [87]:
print("AUC: %.2f" % roc_auc_score(y, y_pred_prob))
print(classification_report(y, y_pred_bin, target_names=["Negativo", "Positivo"]))

AUC: 1.00
              precision    recall  f1-score   support

    Negativo       0.95      1.00      0.97      4899
    Positivo       1.00      0.92      0.96      3300

    accuracy                           0.97      8199
   macro avg       0.97      0.96      0.96      8199
weighted avg       0.97      0.97      0.97      8199



In [30]:
model.save("../models/keras/sentiment-analysis.keras")

# Utilizando o modelo treinado para classifica√ß√£o dos novos dados
Os dados foram adquiridos utilizando a API do Twitter, procurando por palavras chave: ambev, skol e brahma

## Leitura e tratamento dos dados para *scoring*

In [31]:
df_tweets = pd.read_csv("../data/processed/tweets_ambev.csv", sep=';', encoding='utf-8')
df_tweets.drop(columns=['Unnamed: 0'], inplace=True)
df_tweets.head(3)

Unnamed: 0,user,tweet,location
0,xuuuuuuuuuuulia,"Feliz dia dos namorados @BrahmaCerveja, @skol,...","Maring√°, Brasil"
1,PedroPawlowski,O quanto gosto de cada bebida:\n\ngin: 4/10\nv...,"S√£o Paulo, Brasilien"
2,rogerjagielski,O quanto gosto de cada bebida:\n\ngin: 0/10 \n...,"Blumenau, Brasil"


In [32]:
X = df_tweets['tweet'].values
X

array(['Feliz dia dos namorados @BrahmaCerveja, @skol, @Eisenbahn üòç‚ù§',
       'O quanto gosto de cada bebida:\n\ngin: 4/10\nvodka: 5/10\npinga: 0/10\nvinho: 3/10\ncerveja: 8/10\nwhisky: 4/10\ntequila: 2‚Ä¶ https://t.co/8UE6poNxTX',
       'O quanto gosto de cada bebida:\n\ngin: 0/10 \nvodka: 7/10\npinga: 6/10\nvinho: 7/10\ncerveja: 10/10\nwhisky: 10/10\ntequila‚Ä¶ https://t.co/VEaGdJx2X2',
       ..., 'Skol n√£o. Skol jamais!',
       '@Tatadebh Obrigada, amiga \nMeu buqu√™ era uma latinha de brahma\nMeu pai perdeu meu casamento pq encheu a cara de quent√£o\nEsse dia foi lindo',
       'RT @Itspedrito: MEU DEUS SKOL BEATS CONTE COMIGO PRA TUDO https://t.co/XQUUe85cNA'],
      dtype=object)

In [33]:
X_test = np.array([tt_utils.remove_url(tweet) for tweet in X])
X_test

array(['Feliz dia dos namorados BrahmaCerveja skol Eisenbahn',
       'O quanto gosto de cada bebidagin 410vodka 510pinga 010vinho 310cerveja 810whisky 410tequila 2',
       'O quanto gosto de cada bebidagin 010 vodka 710pinga 610vinho 710cerveja 1010whisky 1010tequila',
       ..., 'Skol no Skol jamais',
       'Tatadebh Obrigada amiga Meu buqu era uma latinha de brahmaMeu pai perdeu meu casamento pq encheu a cara de quentoEsse dia foi lindo',
       'RT Itspedrito MEU DEUS SKOL BEATS CONTE COMIGO PRA TUDO'],
      dtype='<U137')

In [34]:
# integer encode the documents
encoded_x_test = t.texts_to_sequences(X_test)
print(encoded_x_test)


[[1416, 179, 40, 4290], [10, 3524, 2960, 1, 589, 38], [10, 3524, 2960, 1, 589], [850, 8, 744, 1, 455, 8, 4290, 34, 10, 584, 1097, 5317], [4, 3048, 27, 3122, 4291, 35, 385, 35, 629, 7, 35, 631], [], [4290, 6410, 386, 1411], [4, 179, 40, 252, 200, 43, 206, 2225, 13, 341, 1164, 583, 7, 1156, 316, 738, 535, 27], [4, 584, 1162, 4290, 206, 575], [10, 25, 200, 2746, 4290, 21, 584, 7, 3312, 3756, 79, 7, 2899, 11, 994], [179, 40, 252, 200, 43, 206, 2225, 13, 341, 1164, 583, 7, 1156, 316, 738, 8], [10, 3524, 2960, 1, 589], [4290], [176, 51, 290, 27, 4290, 136, 4563, 25, 200, 355, 1187, 1906], [1240, 80, 4290, 142, 2204, 79, 104, 1145, 3196, 11, 98, 200, 997, 79, 2224, 200, 25], [4290, 12, 4925], [4290, 142, 13, 12, 11, 584, 149, 79], [79, 1, 4291, 1157, 256], [6894, 21, 233, 4290], [4, 179, 40, 252, 200, 43, 206, 2225, 13, 341, 1164, 583, 7, 1156, 316, 738, 535], [4291, 13, 192, 136, 1233, 818, 158, 2819], [179, 40, 252, 200, 43, 206, 2225, 13, 341, 1164, 583, 7, 1156, 316, 738, 8], [1664, 2262,

In [35]:
padded_x_test = pad_sequences(encoded_x_test, maxlen=max_length, padding='post')
print(padded_x_test)

[[1416  179   40 ...    0    0    0]
 [  10 3524 2960 ...    0    0    0]
 [  10 3524 2960 ...    0    0    0]
 ...
 [4290   12 4290 ...    0    0    0]
 [ 995 3139  584 ...  179  227 2450]
 [   4  584 1162 ...    0    0    0]]


In [91]:
y_pred_test = model.predict(padded_x_test)
y_pred_test_bin = np.where(y_pred_test>0.5, "Positivo", "Negativo")

# Agregando informa√ß√µes para o gr√°fico

In [95]:
pd.set_option('display.max_colwidth', 200)

In [102]:
df_tweets.groupby(by=["location", 'Sentiment'], as_index=False).count()[['location', 'Sentiment', 'tweet']]

Unnamed: 0,location,Sentiment,tweet
0,RJ‚Ä¢CG ZO,Positivo,1
1,Brasil,Negativo,1
2,Hope World| JK+JM ‚ô°,Negativo,1
3,Texas ‚ù§Ô∏è,Negativo,1
4,·µò·µà·µê ¬∞ À°‚Å±·∂ª,Negativo,1
5,‚äô‚ô°‚äô ·µò·µà·µê f(x) exo,Negativo,1
6,üá∞üá∑¬≥¬π,Positivo,1
7,#0 13RW stan,Negativo,1
8,#Brasil | fan account,Negativo,1
9,$C,Negativo,1


In [108]:
df_group_location = df_tweets.groupby(by=["location", 'Sentiment'], as_index=False).count()[['location', 'Sentiment', 'tweet']]
df_group_location = df_group_location[df_group_location['location'].astype(str).str.contains('[^a-zA-Z]')]
df_group_location[df_group_location['tweet']>10]

Unnamed: 0,location,Sentiment,tweet
42,021,Negativo,17
214,"Belo Horizonte, Brasil",Negativo,39
215,"Belo Horizonte, Brasil",Positivo,17
282,Bras√≠lia,Negativo,12
288,"Bras√≠lia, Brasil",Negativo,29
451,"Curitiba, Brasil",Negativo,14
513,"Esp√≠rito Santo, Brasil",Negativo,18
544,"Fortaleza, Cear√°",Negativo,15
545,"Fortaleza, Cear√°",Positivo,17
706,"Joinville, Brasil",Negativo,21


In [110]:
df_group_location[df_group_location['tweet']>10].to_csv("../data/processed/sentiment-region.csv", sep=',', encoding='utf-8-sig')

# Gr√°ficos dos resultados

## Porcentagem de positivos e negativos

Percentual de classifica√ß√£o ap√≥s filtragem
<img src="../reports/figures/pizza.png">

## Localiza√ß√£o

<img src="../reports/figures/barra.png">

In [96]:
!ls ../reports/figures

barra.png  pizza.png
