# Projeto - Mineração de Texto e Web
## Residência Engenharia e Ciência de dados - Samsung/UFPE

### Lucas Couri - lncc2
### Mariama Oliveira - mcso

## Carregando Dados

In [1]:
#Imports
import string
import nltk
from nltk.corpus import stopwords
import pandas as pd
# from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.sparse import csr_matrix
from scipy.cluster.vq import vq

import numpy as np
import tensorflow as tf
#from keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Conv2D, Input
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

import dataset_split 
import networks
import bert
import rm_forest

In [2]:
df = pd.read_csv("reviews_v2.csv")
df = df[df["reviews"].notna()]

## Pré-processamento (com e sem stemming)

In [3]:
#Global variables
other_punctuation = '—“”'  
stop_words = stopwords.words('portuguese')
stop_words.append('’')
stemmer = nltk.stem.RSLPStemmer()


#Function that removes punctuation 
def remove_punctuation(text):
    punctuation_free_doc = "".join([i for i in text if i not in string.punctuation+other_punctuation])
    return punctuation_free_doc


def remove_stopwords(list_words):
    filtered_words = [word for word in list_words if word not in stop_words]
    return filtered_words


def do_stemming(list_words):
    stem_text = [stemmer.stem(word) for word in list_words]
    return stem_text


def pre_process(doc, basic_processing = False, no_stopwords = False, stemming = False):

    final_doc = doc
    
    ## print(final_doc)

    if basic_processing == True:
        
        final_doc = remove_punctuation(doc)
        final_doc = final_doc.lower()

    final_doc = nltk.word_tokenize(final_doc)

    if no_stopwords == True:
        final_doc = remove_stopwords(final_doc)    

    if stemming == True:
        final_doc = do_stemming(final_doc)

    return final_doc

def pre_process_all(df, pre_processing_list):

    for param, index in zip(pre_processing_list, range(len(pre_processing_list))):
        
        df[f"reviews_pipeline_{index}"] = df["reviews"].apply(lambda x: pre_process(x, **param))

    return df

pre_processing_list = [
    {"basic_processing": True, "no_stopwords": True, "stemming": False},
    {"basic_processing": True, "no_stopwords": True, "stemming": True}]

df_pp = pre_process_all(df, pre_processing_list)

## Definindo classe

In [4]:
df_pp["class"] = df_pp["stars"].apply(lambda x : 1 if x >=4 else 0)

## Divisão Train, Validation e Test

In [5]:
#Unbalanced dataset
X_train, X_valid, X_test, y_train, y_valid, y_test = dataset_split.split_data(df_pp)

#Balanced dataset
X_train_b, X_valid_b, X_test_b, y_train_b, y_valid_b, y_test_b = dataset_split.split_data(df_pp, True)

Transformando tokens em string

In [6]:
def tokens_to_string(df):
    X_train_join = []
    X_train_join.append(df["reviews_pipeline_0"].apply(" ".join))
    X_train_join.append(df["reviews_pipeline_1"].apply(" ".join))
    X_train_join[0] = X_train_join[0].to_numpy()
    X_train_join[1] = X_train_join[1].to_numpy()

    return X_train_join

In [7]:
#Unbalanced data
X_train_join = tokens_to_string(X_train)
X_test_join = tokens_to_string(X_test)
X_valid_join = tokens_to_string(X_valid)

#Balanced data
X_train_join_b = tokens_to_string(X_train_b)
X_test_join_b = tokens_to_string(X_test_b)
X_valid_join_b = tokens_to_string(X_valid_b)


# Classificadores 

## Random Forest com BoW

In [16]:
def get_bag_of_words(X_train_join, X_valid_join, X_test_join):
    vectorizer = CountVectorizer(analyzer = "word",   
                                tokenizer = None,    
                                preprocessor = None, 
                                stop_words = None,   
                                max_features = 2000) 

    #List with BoWs (pipeline 0 and 1)
    X_train_vec = []
    X_train_vec.append(vectorizer.fit_transform(X_train_join[0]))
    X_train_vec.append(vectorizer.fit_transform(X_train_join[1]))

    X_test_vec = []
    X_test_vec.append(vectorizer.fit_transform(X_test_join[0]))
    X_test_vec.append(vectorizer.fit_transform(X_test_join[1]))

    X_valid_vec = []
    X_valid_vec.append(vectorizer.fit_transform(X_valid_join[0]))
    X_valid_vec.append(vectorizer.fit_transform(X_valid_join[1]))


    return X_train_vec, X_valid_vec, X_test_vec

In [17]:
#Unbalanced data
X_train_vec, X_valid_vec, X_test_vec = get_bag_of_words(X_train_join, X_valid_join, X_test_join)

#Balanced data
X_train_vec_b, X_valid_vec_b, X_test_vec_b = get_bag_of_words(X_train_join_b, X_valid_join_b, X_test_join_b)

In [14]:
def pipeline_rf(X_train_vec, y_train, X_valid_vec, y_valid, X_test_vec, y_test):
    #Get best parameters
    best_params = rm_forest.get_best_parameters(X_train_vec, X_valid_vec, y_train, y_valid)
    
    #Train model with best parameters
    forest = RandomForestClassifier(**best_params) 
    forest = forest.fit(X_train_vec, y_train)

    #Predict test dataset
    predictions = forest.predict(X_test_vec) 
    result = forest.predict(X_test_vec)
    print(classification_report(y_test, result))
    print(confusion_matrix(y_test, result))

### Sem stemming

In [24]:
#Unbalanced data
pipeline_rf(X_train_vec[0], y_train, X_valid_vec[0], y_valid, X_test_vec[0], y_test)

[32m[I 2022-03-31 15:58:40,000][0m A new study created in memory with name: no-name-70094e89-a24f-4ef8-b5c8-d35bc31321be[0m
[32m[I 2022-03-31 15:58:41,246][0m Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 721, 'max_depth': 9, 'criterion': 'gini'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:58:43,208][0m Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 544, 'max_depth': 34, 'criterion': 'entropy'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:58:44,579][0m Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 959, 'max_depth': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:58:45,153][0m Trial 3 finished with value: 0.032 and parameters: {'n_estimators': 199, 'max_depth': 29, 'criterion': 'gini'}. Best is trial 3 with value: 0.032.[0m
[32m[I 2022-03-31 15:58:46,487][0m Trial 4 finished with value: 0.0 and parameters: {'n_estimators': 890, 'max_depth': 3, 'criteri

              precision    recall  f1-score   support

           0       0.21      0.10      0.13       167
           1       0.89      0.96      0.92      1334

    accuracy                           0.86      1501
   macro avg       0.55      0.53      0.53      1501
weighted avg       0.82      0.86      0.84      1501



In [23]:
#Balanced data
pipeline_rf(X_train_vec_b[0], y_train_b, X_valid_vec_b[0], y_valid_b, X_test_vec_b[0], y_test_b)

[32m[I 2022-03-31 15:56:41,990][0m A new study created in memory with name: no-name-9de3d36b-1fe0-4c01-9c52-7a3a69ac6990[0m
[32m[I 2022-03-31 15:56:43,439][0m Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 940, 'max_depth': 5, 'criterion': 'entropy'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:56:43,539][0m Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 64, 'max_depth': 4, 'criterion': 'gini'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:56:43,612][0m Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 49, 'max_depth': 2, 'criterion': 'gini'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:56:43,939][0m Trial 3 finished with value: 0.0 and parameters: {'n_estimators': 239, 'max_depth': 2, 'criterion': 'gini'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-03-31 15:56:45,077][0m Trial 4 finished with value: 0.0 and parameters: {'n_estimators': 742, 'max_depth': 3, 'criterion': 'en

              precision    recall  f1-score   support

           0       0.16      0.26      0.20       167
           1       0.90      0.82      0.86      1334

    accuracy                           0.76      1501
   macro avg       0.53      0.54      0.53      1501
weighted avg       0.82      0.76      0.78      1501



### Com stemming

In [25]:
pipeline_rf(X_train_vec[1], y_train, X_valid_vec[1], y_valid, X_test_vec[1], y_test)

[32m[I 2022-03-31 15:59:16,350][0m A new study created in memory with name: no-name-251c84ab-1d55-476f-9b1c-4e8fbcaf2858[0m
[32m[I 2022-03-31 15:59:20,691][0m Trial 0 finished with value: 0.07407407407407407 and parameters: {'n_estimators': 965, 'max_depth': 53, 'criterion': 'entropy'}. Best is trial 0 with value: 0.07407407407407407.[0m
[32m[I 2022-03-31 15:59:22,864][0m Trial 1 finished with value: 0.032520325203252036 and parameters: {'n_estimators': 598, 'max_depth': 35, 'criterion': 'entropy'}. Best is trial 0 with value: 0.07407407407407407.[0m
[32m[I 2022-03-31 15:59:25,115][0m Trial 2 finished with value: 0.016666666666666666 and parameters: {'n_estimators': 797, 'max_depth': 25, 'criterion': 'entropy'}. Best is trial 0 with value: 0.07407407407407407.[0m
[32m[I 2022-03-31 15:59:26,030][0m Trial 3 finished with value: 0.05882352941176471 and parameters: {'n_estimators': 286, 'max_depth': 33, 'criterion': 'gini'}. Best is trial 0 with value: 0.07407407407407407.[0

              precision    recall  f1-score   support

           0       0.21      0.07      0.11       167
           1       0.89      0.97      0.93      1334

    accuracy                           0.87      1501
   macro avg       0.55      0.52      0.52      1501
weighted avg       0.82      0.87      0.84      1501



In [26]:
pipeline_rf(X_train_vec_b[1], y_train_b, X_valid_vec_b[1], y_valid_b, X_test_vec_b[1], y_test_b)

[32m[I 2022-03-31 15:59:48,118][0m A new study created in memory with name: no-name-33cfd233-a2da-4307-9265-c269fb574187[0m
[32m[I 2022-03-31 15:59:55,098][0m Trial 0 finished with value: 0.15228426395939085 and parameters: {'n_estimators': 995, 'max_depth': 118, 'criterion': 'gini'}. Best is trial 0 with value: 0.15228426395939085.[0m
[32m[I 2022-03-31 15:59:56,074][0m Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 410, 'max_depth': 13, 'criterion': 'entropy'}. Best is trial 0 with value: 0.15228426395939085.[0m
[32m[I 2022-03-31 16:00:00,484][0m Trial 2 finished with value: 0.07547169811320754 and parameters: {'n_estimators': 936, 'max_depth': 42, 'criterion': 'gini'}. Best is trial 0 with value: 0.15228426395939085.[0m
[32m[I 2022-03-31 16:00:04,273][0m Trial 3 finished with value: 0.028368794326241134 and parameters: {'n_estimators': 871, 'max_depth': 34, 'criterion': 'entropy'}. Best is trial 0 with value: 0.15228426395939085.[0m
[32m[I 2022-03-3

## Redes (CNN, LSTM e BERT)

In [None]:
def pipeline_cnn_lstm(tuner_name,X_train, y_train, X_valid, y_valid, X_test, y_test):

    type_nn = tuner_name[:3]
   
    #Encoding sentences
    encoder = networks.sentence_encoder(X_train)

    #Finding best parameters
    tuner = networks.network_tuner(type_nn, encoder, tuner_name)
    
    best_hp = networks.search_network(tuner, X_train, y_train, X_valid, y_valid)

    #Loading best model
    model = networks.get_model(tuner, best_hp)
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(X_train, y_train, epochs=50, validation_data = (X_valid, y_valid), callbacks=[stop_early])

    #Getting test results
    networks.get_test_metrics(model, X_test, y_test)
    networks.plot_acuracy_loss(history)

### CNN

#### Sem stemming

In [None]:
# Desbalanceado
pipeline_cnn_lstm("cnn",X_train_join[0], y_train, X_valid_join[0], y_valid, X_test_join[0], y_test)

In [None]:
# Balanceado
pipeline_cnn_lstm("cnn_b",X_train_join_b[0], y_train_b, X_valid_join_b[0], y_valid_b, X_test_join_b[0], y_test_b)

#### Com stemming

In [None]:
#Desbalanceado
pipeline_cnn_lstm("cnn_st",X_train_join[1], y_train, X_valid_join[1], y_valid, X_test_join[1], y_test)

In [None]:
#Balanceado
pipeline_cnn_lstm("cnn_b_st",X_train_join_b[1], y_train_b, X_valid_join_b[1], y_valid_b, X_test_join_b[1], y_test_b)

### LSTM

#### Sem stemming

In [None]:
# Desbalanceado
pipeline_cnn_lstm("lstm",X_train_join[0], y_train, X_valid_join[0], y_valid, X_test_join[0], y_test)

In [None]:
# Balanceado
pipeline_cnn_lstm("lstm_b",X_train_join_b[0], y_train_b, X_valid_join_b[0], y_valid_b, X_test_join_b[0], y_test_b)

#### Com stemming

In [None]:
#Desbalanceado
pipeline_cnn_lstm("lstm_st",X_train_join[1], y_train, X_valid_join[1], y_valid, X_test_join[1], y_test)

In [None]:
#Balanceado
pipeline_cnn_lstm("lstm_b_st",X_train_join_b[1], y_train_b, X_valid_join_b[1], y_valid_b, X_test_join_b[1], y_test_b)

### BERT

In [None]:
from transformers import TFBertForSequenceClassification

def pipeline_bert(X_train, y_train, X_valid, y_valid, X_test, y_test):

    learning_rate = 2e-5
    number_of_epochs = 1
    ds_train, ds_valid, ds_test = bert.get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test)
    
    # model initialization
    model = TFBertForSequenceClassification.from_pretrained('bert-base-portuguese-cased', from_pt = True)

    # choosing Adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    #Training model
    bert_history = model.fit(ds_train, epochs=number_of_epochs, validation_data=ds_valid)

    #Predict test data
    bert.get_test_metrics(model, ds_test, y_test)
    

#### Sem stemming

In [None]:
#Unbalanced data
pipeline_bert(X_train_join[0], y_train, X_valid_join[0], y_valid, X_test_join[0], y_test)

In [None]:
#Balanced data
pipeline_bert(X_train_join_b[0], y_train_b, X_valid_join_b[0], y_valid_b, X_test_join_b[0]y_test_b)

#### Com stemming

In [None]:
#Unbalanced data
pipeline_bert(X_train_join[1], y_train, X_valid_join[1], y_valid, X_test_join[1], y_test)

In [None]:
#Balanced data
pipeline_bert(X_train_join_b[1], y_train_b, X_valid_join_b[1], y_valid_b, X_test_join_b[1], y_test_b)

## Clustering

In [None]:
df_pp[["reviews", "reviews_pipeline_0", "class"]].head()

In [None]:
df_cluster = df_pp[["reviews", "reviews_pipeline_0", "class"]]

X_join = df_cluster["reviews_pipeline_0"].apply(" ".join)
X_join = X_join.to_numpy()
#df_cluster["reviews_join"] = X_join

In [None]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 2000) 

#List with BoWs (pipeline 0 and 1)
bow_vec = vectorizer.fit_transform(X_join)

In [None]:
words = vectorizer.get_feature_names()
#words = vectorizer.get_feature_names_out()

#setup kmeans clustering
kmeans = KMeans(n_clusters = 2, random_state = 420)
#fit the data 
kmeans.fit(bow_vec)

In [None]:
kmeans.cluster_centers_

Achar os n mais proximos (https://stackoverflow.com/questions/26795535/output-50-samples-closest-to-each-cluster-center-using-scikit-learn-k-means-libr)

- n Reviews mais próximas do centróide da classe 0 (Ruim)

In [None]:
n=2
d = kmeans.transform(csr_matrix.toarray(bow_vec))[:, 0] #distancia de cada ponto ao centroide 0
ind0 = np.argsort(d)[::][:n]
ind0
#csr_matrix.toarray(bow_vec)[ind]

In [None]:
df_pp.iloc[ind0[0]]["reviews"]

In [None]:
df_pp.iloc[ind0[1]]["reviews"]

In [None]:
lista_neg = []
for i in ind0.tolist():
    lista_neg.append(df_pp.iloc[i]["reviews"])
lista_neg

- n Reviews mais próximas do centróide da classe 1 (Bom)

In [None]:
n = 2
d = kmeans.transform(csr_matrix.toarray(bow_vec))[:, 1] #distancia de cada ponto ao centroide 1
ind1 = np.argsort(d)[::][:n]
ind1

In [None]:
df_pp.iloc[ind1[0]]["reviews"]

In [None]:
df_pp.iloc[ind1[1]]["reviews"]

In [None]:
lista_posi = []
for i in ind1.tolist():
    lista_posi.append(df_pp.iloc[i]["reviews"])
lista_posi

Salvando em um df para incluir no dashboard

In [None]:
df_rep = pd.DataFrame(data={"negativo": lista_neg, "positivo": lista_posi})
df_rep.to_csv("reviews_rep.csv", index=False)

## Função para salvar predições do melhor classificador

In [None]:
#X_test["pred"] = result
#df_pred = pd.merge(X_test[["class", "pred"]], df_pp[["reviews", "stars", "dates"]], left_index=True, right_index=True)
#df_pred.to_csv("best_pred.csv", index=False)

def save_results(result, X_test, df_pp, dfname="best_pred.csv"):
    X_test["pred"] = result
    df_pred = pd.merge(X_test[["class", "pred"]], df_pp[["reviews", "stars", "dates"]], left_index=True, right_index=True)
    df_pred.to_csv(dfname, index=False)

save_results(result, X_test, df_pp)

In [None]:
pd.read_csv("best_pred.csv")