<a href="https://colab.research.google.com/github/paulodreher/data_science/blob/main/Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the '/content/drive/My Drive/...' CSV
# paths used further down resolve. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
# Fetch only the corpus this notebook actually uses (nltk.corpus.stopwords).
# A bare nltk.download() opens the interactive downloader, which blocks a
# non-interactive "Restart & Run All".
nltk.download('stopwords')

In [None]:
import os
import pandas as pd
import numpy as np
import bs4
import re
from unicodedata import normalize
from nltk.corpus import stopwords
import operator
from keras.preprocessing import sequence
from keras.utils import to_categorical
from tqdm import tqdm_notebook, tqdm
from sklearn.model_selection import train_test_split
from keras.callbacks import *
import re
import unicodedata

In [None]:
#sentimental_data = pd.read_csv('/content/drive/My Drive/Pos/Machine Learning II/Tripadvisor_SampleData.csv',delimiter=',')

In [None]:
# Load the first two columns of the review CSV for a quick inspection.
# preprocessing_data() below re-reads the file on its own and does not use this frame.
sentimental_data = pd.read_csv('/content/drive/My Drive/Pos/Topicos II/Review.csv', encoding = 'utf-8', delimiter=',', usecols=[0,1])

In [None]:
# Show column names, dtypes and non-null counts of the loaded frame.
sentimental_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
sentiment    10000 non-null object
review       10000 non-null object
dtypes: object(2)
memory usage: 156.4+ KB


In [None]:
def _tokenize_review(raw_review, stops):
    """Turn one raw review string into a list of lowercase, stopword-free tokens.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup and accented characters.
    stops : set of str
        Words to discard after tokenisation.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Strip HTML tags.
    text = bs4.BeautifulSoup(raw_review, 'html.parser').get_text()

    # Drop accents: decompose (NFKD), then discard every non-ASCII code point.
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Replace everything that is not an ASCII letter (digits, punctuation,
    # symbols) with a space. This single pass also covers the punctuation
    # characters that used to be replaced in a separate, redundant pass.
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Lowercase, split on whitespace and drop stopwords.
    return [word for word in text.lower().split() if word not in stops]


def preprocessing_data(filepath, topwords, maxlen):
    """Read a review CSV and turn it into padded id sequences and one-hot labels.

    The CSV must contain a 'review' column (text) and a 'sentiment' column.

    Parameters
    ----------
    filepath : str
        Path of the comma-separated file to read.
    topwords : int
        Number of most frequent words that receive their own id.
    maxlen : int
        Length passed to ``pad_sequences`` for every review.

    Returns
    -------
    processed_data : numpy.ndarray
        Output of ``sequence.pad_sequences``: one row of word ids per review.
    stars : numpy.ndarray
        One-hot labels with two columns: column 0 is set when the sentiment is
        exactly 'Negative', column 1 for any other value.
    word_to_id : dict
        Mapping from every word seen in the corpus to its integer id.
    """
    sentimental_data = pd.read_csv(filepath, delimiter=',')

    # Build the stopword set once; it does not change between reviews.
    stops = set(stopwords.words('english'))

    # Iterating through the progress bar (instead of calling update() by hand)
    # lets tqdm close the widget when the loop finishes.
    reviews = [
        _tokenize_review(review, stops)
        for review in tqdm_notebook(sentimental_data['review'],
                                    total=len(sentimental_data))
    ]

    # Word frequency dictionary over the whole corpus.
    freq_dict = {}
    for review in reviews:
        for word in review:
            freq_dict[word] = freq_dict.get(word, 0) + 1

    # Most frequent words first; ties keep first-seen order (stable sort).
    sorted_tup = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)

    # Top-K words get ids topwords-1, topwords-2, ... in frequency order.
    # NOTE(review): when the corpus has at least `topwords` distinct words, the
    # least frequent of the top-K words ends up with id 0, the same id used
    # for out-of-vocabulary words below.
    word_to_id = {
        word: topwords - 1 - rank
        for rank, (word, _) in enumerate(sorted_tup[:topwords])
    }
    # Every remaining word shares id 0.
    word_to_id.update((word, 0) for word, _ in sorted_tup[topwords:])

    # Map each review to its list of ids.
    processed_data = [[word_to_id[word] for word in review] for review in reviews]

    # Pad the reviews to a common length. The list of lists is passed directly:
    # wrapping ragged lists in np.asarray() raises on recent NumPy versions and
    # pad_sequences does not need it.
    processed_data = sequence.pad_sequences(processed_data, maxlen=maxlen)

    # 0 for 'Negative', 1 for anything else. `.values` replaces
    # Series.get_values(), which no longer exists in current pandas.
    labels = (sentimental_data['sentiment'] != 'Negative').values.astype(int)

    # num_classes is pinned so the label matrix always has two columns, even
    # when the file happens to contain a single class.
    stars = to_categorical(labels, num_classes=2)

    return processed_data, stars, word_to_id

In [None]:
#data, target, word_to_id = preprocessing_data('/content/drive/My Drive/Pos/Machine Learning II/Tripadvisor_SampleData.csv', 5000, 100)

HBox(children=(IntProgress(value=0, max=11149), HTML(value='')))



In [None]:
# Build the model inputs: 5000 is the number of top words that get their own id
# and 100 is the padded review length. The model cell below hard-codes the same
# two numbers (Embedding input_dim=5000, Input shape=(100,)), so keep them in sync.
data, target, word_to_id = preprocessing_data('/content/drive/My Drive/Pos/Topicos II/Review.csv', 5000, 100)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))



In [None]:
# Sanity check: one row per review, one column per padded position.
data.shape

(10000, 100)

In [None]:
# One-hot labels produced by to_categorical inside preprocessing_data.
target

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

<h1>Criando o Modelo</h1>

In [None]:
from keras.models import Model
from keras.layers import *

In [None]:
# Each sample is a sequence of 100 word ids (the maxlen passed to preprocessing_data).
input_node = Input(shape=(100,))

# Learn a 32-dimensional vector for each of the 5000 word ids
# (the topwords value passed to preprocessing_data).
embedding = Embedding(input_dim=5000,
                      input_length=100,
                      output_dim=32)(input_node)
# Dropout with rate 0.5 on the embedded sequence.
dropout = Dropout(0.5)(embedding)
# Single LSTM layer with 100 units.
lstm_1 = LSTM(100)(dropout)
# Second dropout on the LSTM output; note that the name `dropout` is reused.
dropout = Dropout(0.5)(lstm_1)
# Two softmax outputs, matching the two-column one-hot target.
fc1 = Dense(2, activation='softmax')(dropout)

model = Model(input_node, fc1)
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 32)           160000    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_8 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 202       
Total params: 213,402
Trainable params: 213,402
Non-trainable params: 0
_____________________________________________________

In [None]:
# Categorical cross-entropy pairs with the softmax output and one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
# Hold out 33% of the reviews. The fixed random_state makes the split, and
# therefore every number reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33, random_state=42)

In [None]:
# Stop training when val_loss has not improved by at least 1e-6 for 3 epochs in a row.
# NOTE(review): restore_best_weights is not set, so the model presumably keeps the
# weights of the last epoch rather than the best one -- confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=3)

In [None]:
# Callbacks handed to model.fit; early stopping is the only one.
cb_list = [early_stopping]

In [None]:
# Train for at most 20 epochs; the early-stopping callback may end training sooner.
# NOTE(review): the test split doubles as validation data here, so early stopping
# is tuned on the same samples that would be used for the final evaluation.
model.fit(X_train, y_train, batch_size=64, epochs=20, validation_data=(X_test, y_test), callbacks=cb_list)

Train on 6700 samples, validate on 3300 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x7f057cea7ac8>



```
```

<h2>Testar nova entrada</h2>

In [None]:
new_review = 'Ah bah'

# Strip HTML tags
review_text = bs4.BeautifulSoup(new_review, 'html.parser').get_text()

# Drop accents (NFKD decomposition, then discard non-ASCII), then replace
# punctuation, digits and any other non-letter with a space
content = unicodedata.normalize('NFKD', review_text)
encoded_content = content.encode('ASCII', 'ignore')
review_text = encoded_content.decode('utf-8')
review_text = re.sub(r'[!@#$:).;,?&]', ' ', review_text)
review_text = re.sub('[^a-zA-Z]', ' ', review_text)

# Lowercase
review_text = review_text.lower()

# Split the review into words
review_words = review_text.split()

# Remove stopwords. This must be the same list used when the training data was
# preprocessed (English); otherwise the model sees tokens at inference time
# that were filtered out during training.
stops = set(stopwords.words('english'))

meaningful_words = [word for word in review_words if word not in stops]

# Map words to ids. A word that never appeared in the training corpus is not
# in word_to_id; give it id 0, the id shared by all out-of-vocabulary words,
# instead of failing with a KeyError.
processed_new_reviews = [word_to_id.get(word, 0) for word in meaningful_words]

# A batch holding this single review, padded to the length the model expects
processed_data = sequence.pad_sequences([processed_new_reviews], 100)

In [None]:
# Inspect the padded id sequence that will be fed to the model.
processed_data

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 3882,
           0]], dtype=int32)

In [None]:
y_pred = model.predict(processed_data)[0]

# The model has exactly two outputs, and the training labels encode
# 'Negative' as class 0 and every other sentiment as class 1. There is no
# third class, so the previous 'neutro' / 'positivo' split mislabelled class 1
# and its final branch could never run.
if np.argmax(y_pred) == 0:
    sent = 'negativo'
else:
    sent = 'positivo'


print('A predição do sentimento para a entrada \"{}\" é {}'.format(new_review, sent))

A predição do sentimento para a entrada "Ah bah" é positivo
