# Projeto - Mineração de Texto e Web
## Residência Engenharia e Ciência de dados - Samsung/UFPE

### Lucas Couri - lncc2
### Mariama Oliveira - mcso

## Carregando Dados

In [1]:
#Imports
import string
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score

import numpy as np
import tensorflow as tf
#from keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Conv2D, Input
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("reviews_v2.csv")
df = df[df["reviews"].notna()]

In [3]:
df.head()


Unnamed: 0,reviews,stars,dates
0,Estou aqui para relatar uma experiência que ...,5,2021-06-05
1,"Já havia comprado a versão Lite, o que já ac...",5,2021-05-07
2,"Bom. Bem fluído, interessante e eficaz em cu...",5,2021-05-07
3,"Estou bem chateado, ja possuía o modelo anti...",3,2021-05-09
4,"Aparelho muito bom, está lidando muito bem c...",4,2021-05-06


In [4]:
df.dtypes

reviews    object
stars       int64
dates      object
dtype: object

## Pré-processamento (com e sem stemming)

In [5]:
#Global variables
other_punctuation = '—“”'  
stop_words = stopwords.words('portuguese')
stop_words.append('’')
stemmer = nltk.stem.RSLPStemmer()


#Function that removes punctuation 
def remove_punctuation(text):
    punctuation_free_doc = "".join([i for i in text if i not in string.punctuation+other_punctuation])
    return punctuation_free_doc


def remove_stopwords(list_words):
    filtered_words = [word for word in list_words if word not in stop_words]
    return filtered_words


def do_stemming(list_words):
    stem_text = [stemmer.stem(word) for word in list_words]
    return stem_text


def pre_process(doc, basic_processing = False, no_stopwords = False, stemming = False):

    final_doc = doc
    
    ## print(final_doc)

    if basic_processing == True:
        
        final_doc = remove_punctuation(doc)
        final_doc = final_doc.lower()

    final_doc = nltk.word_tokenize(final_doc)

    if no_stopwords == True:
        final_doc = remove_stopwords(final_doc)    

    if stemming == True:
        final_doc = do_stemming(final_doc)

    return final_doc

def pre_process_all(df, pre_processing_list):

    for param, index in zip(pre_processing_list, range(len(pre_processing_list))):
        
        df[f"reviews_pipeline_{index}"] = df["reviews"].apply(lambda x: pre_process(x, **param))

    return df

pre_processing_list = [
    {"basic_processing": True, "no_stopwords": True, "stemming": False},
    {"basic_processing": True, "no_stopwords": True, "stemming": True}]

df_pp = pre_process_all(df, pre_processing_list)

In [6]:
df_pp.head()

Unnamed: 0,reviews,stars,dates,reviews_pipeline_0,reviews_pipeline_1
0,Estou aqui para relatar uma experiência que ...,5,2021-06-05,"[aqui, relatar, experiência, visando, contribu...","[aqu, relat, experi, vis, contribu, amig, prob..."
1,"Já havia comprado a versão Lite, o que já ac...",5,2021-05-07,"[havia, comprado, versão, lite, achei, maravil...","[hav, compr, vers, lit, ach, maravilh, porém, ..."
2,"Bom. Bem fluído, interessante e eficaz em cu...",5,2021-05-07,"[bom, bem, fluído, interessante, eficaz, cumpr...","[bom, bem, flu, interess, eficaz, cumpr, prome..."
3,"Estou bem chateado, ja possuía o modelo anti...",3,2021-05-09,"[bem, chateado, ja, possuía, modelo, antigo, b...","[bem, chate, ja, possuí, model, antig, bem, co..."
4,"Aparelho muito bom, está lidando muito bem c...",4,2021-05-06,"[aparelho, bom, lidando, bem, home, tv, apenas...","[aparelh, bom, lid, bem, hom, tv, apen, porém,..."


In [7]:
df_pp[["reviews_pipeline_0", "reviews_pipeline_1"]]

Unnamed: 0,reviews_pipeline_0,reviews_pipeline_1
0,"[aqui, relatar, experiência, visando, contribu...","[aqu, relat, experi, vis, contribu, amig, prob..."
1,"[havia, comprado, versão, lite, achei, maravil...","[hav, compr, vers, lit, ach, maravilh, porém, ..."
2,"[bom, bem, fluído, interessante, eficaz, cumpr...","[bom, bem, flu, interess, eficaz, cumpr, prome..."
3,"[bem, chateado, ja, possuía, modelo, antigo, b...","[bem, chate, ja, possuí, model, antig, bem, co..."
4,"[aparelho, bom, lidando, bem, home, tv, apenas...","[aparelh, bom, lid, bem, hom, tv, apen, porém,..."
...,...,...
5002,"[chegou, super, rápido, atendeu, superou, toda...","[cheg, sup, rápid, atend, super, tod, expect]"
5003,"[facil, instalação, configuração, entrega, sup...","[facil, instal, configur, entreg, sup, rápid]"
5004,"[amei, produto, unico, problema, pra, mim, nao...","[ame, produt, unic, problem, pra, mim, nao, hbo]"
5005,"[funciona, beleza, rede, internet, sendo, boa,...","[func, bel, red, internet, send, boa, tud, bem]"


## Definindo classe

In [8]:
df_pp["class"] = df_pp["stars"].apply(lambda x : 1 if x >=4 else 0)

## Divisão train e test

In [9]:
X = df_pp[["reviews_pipeline_0", "reviews_pipeline_1"]]
y = df_pp["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [10]:
X_train.head()

Unnamed: 0,reviews_pipeline_0,reviews_pipeline_1
4688,"[chegou, antes, prazo, mostrou, excelente, pro...","[cheg, ant, praz, mostr, excel, produt, val, c..."
3802,"[melhor, produto, comprei, pra, tv, igual, fun...","[melhor, produt, compr, pra, tv, igual, func, ..."
4442,"[gostei, falta, globoplay, pra, mim, único, de...","[gost, falt, globoplay, pra, mim, únic, defeit..."
1146,"[fire, sitck, prático, funciona, bem, transfor...","[fir, sitck, prát, func, bem, transform, tv, c..."
4612,"[to, demais, super, fácil, praticovelocidade, ...","[to, demal, sup, fácil, praticoveloc, respost,..."


Transformando tokens em string

In [11]:
X_train_join = []
X_train_join.append(X_train["reviews_pipeline_0"].apply(" ".join))
X_train_join.append(X_train["reviews_pipeline_1"].apply(" ".join))
X_train_join[0] = X_train_join[0].to_numpy()
X_train_join[1] = X_train_join[1].to_numpy()

X_test_join = []
X_test_join.append(X_test["reviews_pipeline_0"].apply(" ".join))
X_test_join.append(X_test["reviews_pipeline_1"].apply(" ".join))
X_test_join[0] = X_test_join[0].to_numpy()
X_test_join[1] = X_test_join[1].to_numpy()

# Classificadores 

## Random Forest com BoW

In [12]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 2000) 

#List with BoWs (pipeline 0 and 1)
X_train_vec = []
X_train_vec.append(vectorizer.fit_transform(X_train_join[0]))
X_train_vec.append(vectorizer.fit_transform(X_train_join[1]))

X_test_vec = []
X_test_vec.append(vectorizer.fit_transform(X_test_join[0]))
X_test_vec.append(vectorizer.fit_transform(X_test_join[1]))


print(X_train_vec[0].shape)

(3502, 2000)


### Sem stemming

In [13]:
forest = RandomForestClassifier() 
forest = forest.fit(X_train_vec[0], y_train)

In [14]:
predictions = forest.predict(X_test_vec[0]) 
result = forest.predict(X_test_vec[0])
print(classification_report(y_test, result))


              precision    recall  f1-score   support

           0       0.20      0.11      0.14       170
           1       0.89      0.94      0.92      1331

    accuracy                           0.85      1501
   macro avg       0.55      0.53      0.53      1501
weighted avg       0.81      0.85      0.83      1501



Matriz de Confusão

In [15]:
confusion_matrix(y_test, result)

array([[  19,  151],
       [  76, 1255]], dtype=int64)

### Com Stemming

In [16]:
forest = RandomForestClassifier() 
forest = forest.fit(X_train_vec[1], y_train)

In [17]:
predictions = forest.predict(X_test_vec[1]) 
result = forest.predict(X_test_vec[1])
print(classification_report(y_test, result))


              precision    recall  f1-score   support

           0       0.24      0.14      0.17       170
           1       0.90      0.95      0.92      1331

    accuracy                           0.85      1501
   macro avg       0.57      0.54      0.55      1501
weighted avg       0.82      0.85      0.84      1501



In [18]:
confusion_matrix(y_test, result)

array([[  23,  147],
       [  71, 1260]], dtype=int64)

## Redes (CNN, LSTM e BERT)

In [19]:
# tf.keras.layers.TextVectorization(
#     max_tokens=None,
#     standardize='lower_and_strip_punctuation',
#     split='whitespace',
#     ngrams=None,
#     output_mode='int',
#     output_sequence_length=None,
#     pad_to_max_tokens=False,
#     vocabulary=None,
#     idf_weights=None,
#     sparse=False,
#     ragged=False,
#     **kwargs
# )




In [20]:
X_train_join[0]

array(['chegou antes prazo mostrou excelente produto vale compra',
       'melhor produto comprei pra tv igual funciona bem',
       'gostei falta globoplay pra mim único defeito momento', ...,
       'atendeu todas expectativas ótimo produto fácil instalação controle prático funciona perfeitamente',
       'funcionou tv sansung usar tv smart atendeu',
       'comprei pra dar upgrade tv ameinão fácil instalar tudo intuitivo rápidoestou amando'],
      dtype=object)

In [21]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE,
                                            standardize=None
                                            )
#encoder.adapt(train_dataset.map(lambda text, label: text))
encoder.adapt(X_train_join[0])


In [22]:
vectorized_text = encoder(X_train_join[0])
print(vectorized_text)

tf.Tensor(
[[ 45  67 155 ...   0   0   0]
 [ 31   3  37 ...   0   0   0]
 [ 26  52  34 ...   0   0   0]
 ...
 [ 99  93  54 ...   0   0   0]
 [ 84   2   1 ...   0   0   0]
 [ 37  23 269 ...   0   0   0]], shape=(3502, 180), dtype=int64)


In [23]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'tv', 'produto', 'controle', 'fácil', 'bem', 'bom',
       'alexa', 'instalar', 'fire', 'stick', 'aparelho', 'smart',
       'excelente', 'amazon', 'instalação', 'hbo', 'rápido', 'recomendo'],
      dtype='<U17')

### CNN

### LSTM

In [24]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [25]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [26]:
history = model.fit(X_train_join[0], y_train, epochs=10,
                    batch_size = 32,
                    validation_data= (X_test_join[0], y_test),
                    validation_steps=30
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
#test_dataset = tf.data.Dataset.from_tensor_slices((X_test_join[0].to_list(), y_test.to_list()))

#test_loss, test_acc = model.evaluate(test_dataset)
test_loss, test_acc = model.evaluate((X_test_join[0], y_test))


print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

ValueError: in user code:

    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\engine\training.py", line 1525, in test_function  *
        return step_function(self, iterator)
    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\engine\training.py", line 1514, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\engine\training.py", line 1507, in run_step  **
        outputs = model.test_step(data)
    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\engine\training.py", line 1471, in test_step
        y_pred = self(x, training=False)
    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\lucas\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "sequential" expects 1 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=int64>]


In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

In [None]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))

### BERT