# Projeto - Mineração de Texto e Web
## Residência Engenharia e Ciência de dados - Samsung/UFPE

### Lucas Couri - lncc2
### Mariama Oliveira - mcso

## Carregando Dados

In [None]:
#Imports
import string
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin_min
from scipy.sparse import csr_matrix
from scipy.cluster.vq import vq

import numpy as np
import tensorflow as tf
#from keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Conv2D, Input
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Draw the training and validation curves of one metric on the current axes.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value sequences (e.g. the value returned by ``model.fit``).
    metric: name of the training metric; ``'val_' + metric`` must be
      present in ``history.history`` as well.
  """
  val_metric = 'val_' + metric
  plt.plot(history.history[metric])
  plt.plot(history.history[val_metric], '')
  # Label both axes and name the two curves in plotting order.
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])


In [None]:
# Read the scraped reviews and keep only the rows that have review text.
reviews_raw = pd.read_csv("reviews_v2.csv")
has_review_text = reviews_raw["reviews"].notna()
df = reviews_raw[has_review_text]

In [None]:
df.dtypes

In [None]:
df.head()


## Pré-processamento (com e sem stemming)

In [None]:
#Global variables
# Typographic punctuation removed in addition to string.punctuation.
other_punctuation = '—“”'  
# NLTK's Portuguese stop-word list, extended with the curly apostrophe token.
stop_words = stopwords.words('portuguese')
stop_words.append('’')
# NLTK RSLP stemmer instance shared by do_stemming.
stemmer = nltk.stem.RSLPStemmer()


# Strip punctuation characters from a text
def remove_punctuation(text):
    """Return ``text`` without any character found in ``string.punctuation``
    or in the module-level ``other_punctuation`` string."""
    excluded = set(string.punctuation + other_punctuation)
    kept_chars = []
    for char in text:
        if char not in excluded:
            kept_chars.append(char)
    return "".join(kept_chars)


def remove_stopwords(list_words):
    """Return the tokens of ``list_words`` that are not in the module-level
    ``stop_words`` list, preserving their order."""
    kept = []
    for token in list_words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def do_stemming(list_words):
    """Return a list with the stem of every token, computed by the
    module-level ``stemmer``."""
    return list(map(stemmer.stem, list_words))


def pre_process(doc, basic_processing = False, no_stopwords = False, stemming = False):
    """Tokenize one review, optionally cleaning it before and after.

    Args:
        doc: raw review text.
        basic_processing: when truthy, remove punctuation and lowercase the
            text before tokenizing.
        no_stopwords: when truthy, drop tokens found in ``stop_words``.
        stemming: when truthy, replace each remaining token by its stem.

    Returns:
        The list of tokens left after the requested steps.
    """
    final_doc = doc

    if basic_processing:
        final_doc = remove_punctuation(final_doc).lower()

    # NOTE(review): word_tokenize runs with its default language here; confirm
    # whether language='portuguese' is wanted for these reviews.
    final_doc = nltk.word_tokenize(final_doc)

    if no_stopwords:
        final_doc = remove_stopwords(final_doc)

    if stemming:
        final_doc = do_stemming(final_doc)

    return final_doc

def pre_process_all(df, pre_processing_list):
    """Apply every pre-processing configuration to the ``reviews`` column.

    Args:
        df: DataFrame with a ``reviews`` text column.
        pre_processing_list: list of keyword-argument dicts for
            ``pre_process``; entry ``i`` produces column ``reviews_pipeline_i``.

    Returns:
        A copy of ``df`` with one token-list column per configuration.
        The input frame is left untouched, so the function is safe to call on
        a filtered slice and to re-run.
    """
    processed = df.copy()

    for index, param in enumerate(pre_processing_list):
        # Bind the current configuration as a default so the lambda does not
        # depend on the loop variable.
        processed[f"reviews_pipeline_{index}"] = processed["reviews"].apply(
            lambda text, options=param: pre_process(text, **options))

    return processed

# Keyword arguments for pre_process, one dict per pipeline:
#   index 0 -> punctuation removal + lowercasing + stop-word removal
#   index 1 -> the same steps followed by stemming
pre_processing_list = [
    {"basic_processing": True, "no_stopwords": True, "stemming": False},
    {"basic_processing": True, "no_stopwords": True, "stemming": True}]

# Produces the reviews_pipeline_0 and reviews_pipeline_1 token-list columns.
df_pp = pre_process_all(df, pre_processing_list)

In [None]:
df_pp.head()

In [None]:
df_pp[["reviews_pipeline_0", "reviews_pipeline_1"]]

## Definindo classe

In [None]:
# Binary target: 1 when the review has 4 or more stars, 0 otherwise.
df_pp["class"] = df_pp["stars"].apply(lambda x : 1 if x >=4 else 0)

## Balanceamento

In [None]:
# Function that resamples the negative class of a given dataframe
def perform_oversample(df, random_state=None):
    """Resample class 0 with replacement to one third of the class-1 size.

    Args:
        df: DataFrame with a binary ``class`` column (0 or 1).
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced; ``None`` keeps the draw random.

    Returns:
        DataFrame with every class-1 row followed by
        ``round(n_class_1 / 3)`` rows drawn with replacement from class 0.
        When ``df`` has no class-0 row there is nothing to draw from and only
        the class-1 rows are returned.
    """
    negatives = df[df["class"] == 0]
    positives = df[df["class"] == 1]

    # Count by label. value_counts() is ordered by frequency, so unpacking it
    # positionally swaps the counts when class 0 is the majority and raises
    # when only one class is present.
    target_size = int(round(len(positives) / 3))

    if negatives.empty:
        return pd.concat([positives, negatives], axis=0)

    resampled_negatives = negatives.sample(target_size, replace=True, random_state=random_state)
    oversampled_df = pd.concat([positives, resampled_negatives], axis=0)

    return oversampled_df

## Divisão train e test

In [None]:
#Define if it will perform oversampling
OVERSAMPLE = True

In [None]:
# The class column is kept inside X so that perform_oversample can resample
# features and labels together; it is split off again further down.
X = df_pp[["reviews_pipeline_0", "reviews_pipeline_1", "class"]]
y = df_pp["class"]
# Stratified 70/30 split into (train + validation) and test ...
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# ... then a stratified 70/30 split of the first part into train and validation.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.3, random_state = 42, stratify=y_train_valid)

# Only the training part is resampled; validation and test keep their original rows.
if OVERSAMPLE:
    X_train = perform_oversample(X_train)
    
# Rebuild the training labels from the (possibly resampled) frame.
y_train =  X_train["class"]    
X_train = X_train[["reviews_pipeline_0", "reviews_pipeline_1"]]

# y_valid and y_train_valid are left as pandas Series.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

Transformando tokens em string

In [None]:
# For each split, build a two-element list (index 0: no stemming, index 1:
# stemming) of numpy arrays holding the tokens joined back into one string.
PIPELINE_COLUMNS = ("reviews_pipeline_0", "reviews_pipeline_1")

X_train_join = [X_train[column].apply(" ".join).to_numpy() for column in PIPELINE_COLUMNS]

X_test_join = [X_test[column].apply(" ".join).to_numpy() for column in PIPELINE_COLUMNS]

X_valid_join = [X_valid[column].apply(" ".join).to_numpy() for column in PIPELINE_COLUMNS]

X_train_valid_join = [X_train_valid[column].apply(" ".join).to_numpy() for column in PIPELINE_COLUMNS]

# Classificadores 

## Random Forest com BoW

In [None]:
def make_bow_vectorizer():
    """Return a fresh bag-of-words vectorizer limited to 2000 features."""
    return CountVectorizer(analyzer = "word",
                           tokenizer = None,
                           preprocessor = None,
                           stop_words = None,
                           max_features = 2000)

# One vectorizer per pipeline (0: no stemming, 1: stemming), so that fitting
# the second vocabulary does not overwrite the first.
bow_vectorizers = [make_bow_vectorizer(), make_bow_vectorizer()]

#List with BoWs (pipeline 0 and 1): the vocabulary is learned on train+validation only
X_train_vec = [bow.fit_transform(docs) for bow, docs in zip(bow_vectorizers, X_train_valid_join)]

# The test set is only transformed. Refitting on it would build a different
# vocabulary, so its columns would not line up with what the forest was trained on.
X_test_vec = [bow.transform(docs) for bow, docs in zip(bow_vectorizers, X_test_join)]


print(X_train_vec[0].shape)

### Sem stemming

In [None]:
from sklearn import model_selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

cv = model_selection.StratifiedKFold(n_splits=10)

In [None]:
import optuna
import sklearn
from sklearn import datasets
def objective(trial):
      """Optuna objective: mean 3-fold cross-validation score of a random forest.

      Samples ``n_estimators`` (2 to 20) and ``max_depth`` (1 to 32, sampled
      log-uniformly and truncated to int) from the trial.

      NOTE(review): the forest is scored on the Iris dataset, not on the review
      bag-of-words matrices, and the resulting study is not read by any later
      cell visible here. This looks like leftover example code; confirm
      whether it should tune on the review data or be removed.
      """
      iris = sklearn.datasets.load_iris()
      n_estimators = trial.suggest_int('n_estimators', 2, 20)
      max_depth = int(trial.suggest_loguniform('max_depth', 1, 32))
      clf = sklearn.ensemble.RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
      return sklearn.model_selection.cross_val_score(clf, iris.data, iris.target, 
           n_jobs=-1, cv=3).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
def val_rf(X_train, y_train, parameters, cv, SEED):
    """Grid-search a random forest by accuracy and print the best result.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        parameters: parameter grid passed to ``GridSearchCV``.
        cv: cross-validation splitter (or fold count) used by the search.
        SEED: ``random_state`` of the forest being tuned.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    grid = GridSearchCV(
        RandomForestClassifier(random_state = SEED),
        parameters,
        scoring = "accuracy",
        n_jobs = -1,
        cv = cv,
    )
    result_rf = grid.fit(X_train, y_train)

    report_lines = (
        '=========Resultados do Grid Search para Random Forest==========',
        f'Melhor Score: {result_rf.best_score_}',
        f'Melhores Hiperparâmetros: {result_rf.best_params_}',
    )
    for line in report_lines:
        print(line)

    return result_rf

parameters = dict()
parameters['n_estimators'] = range(10, 101, 10)
parameters['criterion'] = ["gini", "entropy"]
#parameters['max_features'] = ["auto", "sqrt", "log2"]
#parameters['min_samples_leaf'] = [1, 4]
#parameters['min_samples_split'] = [2, 10]
#parameters['max_depth'] = [10, 100, None]#[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]

result_rf_0 = val_rf(X_train_vec[0], y_train_valid, parameters, cv, 42)

In [None]:
# Refit with the best hyperparameters and the same seed used during the grid
# search, so the reported test metrics are reproducible.
forest = RandomForestClassifier(random_state=42, **result_rf_0.best_params_)
forest = forest.fit(X_train_vec[0], y_train_valid)

In [None]:
# Predict once and reuse the array for the report and the confusion matrix below
predictions = forest.predict(X_test_vec[0])
result = predictions
print(classification_report(y_test, result))


Matriz de Confusão

In [None]:
confusion_matrix(y_test, result)

### Com Stemming

In [None]:
parameters = dict()
parameters['n_estimators'] = range(10, 101, 10)
parameters['criterion'] = ["gini", "entropy"]
#parameters['max_features'] = ["auto", "sqrt", "log2"]
#parameters['min_samples_leaf'] = [1, 4]
#parameters['min_samples_split'] = [2, 10]
#parameters['max_depth'] = [10, 100, None]#[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]

result_rf_1 = val_rf(X_train_vec[1], y_train_valid, parameters, cv, 42)

In [None]:
# Refit with the best hyperparameters and the same seed used during the grid
# search, so the reported test metrics are reproducible.
forest = RandomForestClassifier(random_state=42, **result_rf_1.best_params_)
forest = forest.fit(X_train_vec[1], y_train_valid)

In [None]:
# Predict once and reuse the array for the report and the confusion matrix below
predictions = forest.predict(X_test_vec[1])
result = predictions
print(classification_report(y_test, result))


In [None]:
confusion_matrix(y_test, result)

## Redes (CNN, LSTM e BERT)

In [None]:
# tf.keras.layers.TextVectorization(
#     max_tokens=None,
#     standardize='lower_and_strip_punctuation',
#     split='whitespace',
#     ngrams=None,
#     output_mode='int',
#     output_sequence_length=None,
#     pad_to_max_tokens=False,
#     vocabulary=None,
#     idf_weights=None,
#     sparse=False,
#     ragged=False,
#     **kwargs
# )

In [None]:
X_train_join[0]

### Encoder (Sem stemming)

In [None]:
# Vocabulary size cap for the text encoder.
VOCAB_SIZE = 1000
# standardize=None: the text was already lowercased and stripped of
# punctuation by the pre-processing pipeline, so the layer only splits and indexes.
encoder_0 = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE,
                                            standardize=None
                                            )
# Learn the vocabulary from the no-stemming training texts only.
encoder_0.adapt(X_train_join[0])


In [None]:
vectorized_text = encoder_0(X_train_join[0])
print(vectorized_text)

In [None]:
vocab = np.array(encoder_0.get_vocabulary())
vocab[:20]

### Encoder (Com stemming)

In [None]:
# Vocabulary size cap for the text encoder.
VOCAB_SIZE = 1000
# standardize=None: the text was already lowercased and stripped of
# punctuation by the pre-processing pipeline, so the layer only splits and indexes.
encoder_1 = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE,
                                            standardize=None
                                            )
# Learn the vocabulary from the stemmed training texts only.
encoder_1.adapt(X_train_join[1])


In [None]:
vectorized_text = encoder_1(X_train_join[1])
print(vectorized_text)

In [None]:
vocab = np.array(encoder_1.get_vocabulary())
vocab[:20]

### CNN

#### Sem stemming

In [None]:
import keras_tuner as kt
##Testando keras tuner

def model_builder_0(hp):
    """Build and compile the no-stemming CNN for one Keras Tuner trial.

    Tuned hyperparameters:
        dropout_1: dropout rate between 0.0 and 0.5 in steps of 0.05 (default 0.25).
        learning_rate: Adam learning rate, one of 1e-2, 1e-3, 1e-4.

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``tf.keras.Sequential`` model that vectorizes raw strings
        with ``encoder_0`` and outputs one sigmoid probability per review.
    """
    hp_rate = hp.Float('dropout_1', min_value=0.0, max_value=0.5, default=0.25, step=0.05)

    model_CNN_0 = tf.keras.Sequential([
        encoder_0,
        tf.keras.layers.Embedding(
            input_dim=len(encoder_0.get_vocabulary()),
            output_dim=64,
            # Use masking to handle the variable sequence lengths
            mask_zero=True),
        tf.keras.layers.Conv1D(filters=32, kernel_size=8, activation='relu'),
        tf.keras.layers.MaxPool1D(pool_size=2),
        tf.keras.layers.Dropout(rate=hp_rate),
        tf.keras.layers.GlobalMaxPool1D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # The last layer already applies a sigmoid, so the loss receives
    # probabilities, not logits: from_logits must be False.
    model_CNN_0.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
              metrics=['accuracy'])

    return model_CNN_0

In [None]:
# model_CNN_0 = tf.keras.Sequential([
#     encoder_0,
#     tf.keras.layers.Embedding(
#         input_dim=len(encoder_0.get_vocabulary()),
#         output_dim=64,
#         # Use masking to handle the variable sequence lengths
#         mask_zero=True),
#     tf.keras.layers.Conv1D(filters=32, kernel_size=8, activation='relu'),
#     tf.keras.layers.MaxPool1D(pool_size=2),
#     #tf.keras.layers.Flatten(),    
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.GlobalMaxPool1D(),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

In [None]:
# print(model_CNN_0.summary())

In [None]:
# model_CNN_0.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

In [None]:
# history = model_CNN_0.fit(X_train_join[0], y_train, epochs=30,
#                     batch_size = 32,
#                     validation_data= (X_valid_join[0], y_valid),
#                     validation_steps=30
#                     )

In [None]:
# Instantiate the Hyperband tuner for the no-stemming CNN
hyperband_options_0 = dict(
    objective='val_accuracy',   # objective to optimize
    max_epochs=50,
    factor=3,                   # Hyperband reduction factor
    directory='tuner',          # directory to save logs
    project_name='cnn_0_bal_2',
)
tuner = kt.Hyperband(model_builder_0, **hyperband_options_0)

In [None]:
# hypertuning settings
tuner.search_space_summary() 


In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# Perform hypertuning on the no-stemming pipeline (index 0) for both training
# and validation, matching the vocabulary of encoder_0 used by model_builder_0.
tuner.search(X_train_join[0], y_train, epochs=50, validation_data = (X_valid_join[0], y_valid), callbacks=[stop_early])
best_hp = tuner.get_best_hyperparameters()[0]

In [None]:
# Build the model with the optimal hyperparameters
model_CNN_0 = tuner.hypermodel.build(best_hp)
model_CNN_0.summary()

In [None]:
history = model_CNN_0.fit(X_train_join[0], y_train, epochs=50, validation_data = (X_valid_join[0], y_valid), callbacks=[stop_early])

In [None]:
test_loss, test_acc = model_CNN_0.evaluate(X_test_join[0], y_test)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
result = model_CNN_0.predict(X_test_join[0])
result = np.where(result > 0.5, 1, 0)
result

print(classification_report(y_test, result))

In [None]:
confusion_matrix(y_test, result)

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

#### Com stemming

In [None]:
# model_CNN_1 = tf.keras.Sequential([
#     encoder_1,
#     tf.keras.layers.Embedding(
#         input_dim=len(encoder_1.get_vocabulary()),
#         output_dim=64,
#         # Use masking to handle the variable sequence lengths
#         mask_zero=True),
#     tf.keras.layers.Conv1D(filters=32, kernel_size=8, activation='relu'),
#     tf.keras.layers.MaxPool1D(pool_size=2),
#     #tf.keras.layers.Flatten(),    
#     tf.keras.layers.Dense(32, activation='relu'),
#     tf.keras.layers.GlobalMaxPool1D(),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

In [None]:
import keras_tuner as kt
##Testando keras tuner

def model_builder_1(hp):
    """Build and compile the stemming CNN for one Keras Tuner trial.

    Tuned hyperparameters:
        units: width of the intermediate dense layer, 32 to 512 in steps of 32.
        learning_rate: Adam learning rate, one of 1e-2, 1e-3, 1e-4.

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``tf.keras.Sequential`` model that vectorizes raw strings
        with ``encoder_1`` and outputs one sigmoid probability per review.
    """
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)

    model_CNN_1 = tf.keras.Sequential([
        encoder_1,
        tf.keras.layers.Embedding(
            input_dim=len(encoder_1.get_vocabulary()),
            output_dim=64,
            # Use masking to handle the variable sequence lengths
            mask_zero=True),
        tf.keras.layers.Conv1D(filters=32, kernel_size=8, activation='relu'),
        tf.keras.layers.MaxPool1D(pool_size=2),
        tf.keras.layers.Dense(units=hp_units, activation='relu'),
        tf.keras.layers.GlobalMaxPool1D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # The last layer already applies a sigmoid, so the loss receives
    # probabilities, not logits: from_logits must be False.
    model_CNN_1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
              metrics=['accuracy'])

    return model_CNN_1

In [None]:
# Instantiate the Hyperband tuner for the stemming CNN
hyperband_options_1 = dict(
    objective='val_accuracy',   # objective to optimize
    max_epochs=10,
    factor=3,                   # Hyperband reduction factor
    directory='tuner',          # directory to save logs
    project_name='cnn_1',
)
tuner = kt.Hyperband(model_builder_1, **hyperband_options_1)

In [None]:
# hypertuning settings
tuner.search_space_summary() 


In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# Perform hypertuning
tuner.search(X_train_join[1], y_train, epochs=10, validation_data = (X_valid_join[1], y_valid), callbacks=[stop_early])

In [None]:
best_hp = tuner.get_best_hyperparameters()[0]

In [None]:
# Build the model with the optimal hyperparameters
model_CNN_1 = tuner.hypermodel.build(best_hp)
model_CNN_1.summary()


In [None]:
history = model_CNN_1.fit(X_train_join[1], y_train, epochs=10, validation_data = (X_valid_join[1], y_valid))

In [None]:
# print(model_CNN_1.summary())

In [None]:
# model_CNN_1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

In [None]:
# history = model_CNN_1.fit(X_train_join[1], y_train, epochs=20,
#                     batch_size = 32,
#                     validation_data= (X_valid_join[1], y_valid),
#                     validation_steps=3
#                     )

In [None]:
#test_loss, test_acc = model_CNN_1.evaluate(X_test_join[1], y_test)
test_loss, test_acc = model_CNN_1.evaluate(X_test_join[1], y_test)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# Predict with the stemming CNN trained above (h_model was never defined)
# and threshold the sigmoid output at 0.5 to get class labels.
result = model_CNN_1.predict(X_test_join[1])
result = np.where(result > 0.5, 1, 0)

print(classification_report(y_test, result))


In [None]:
confusion_matrix(y_test, result)

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

### LSTM

In [None]:
model_LSTM0 = tf.keras.Sequential([
    encoder_0,
    tf.keras.layers.Embedding(
        input_dim=len(encoder_0.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# The model ends in a sigmoid layer, so the loss receives probabilities:
# from_logits must be False.
model_LSTM0.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
print(model_LSTM0.summary())

#### Sem stemming

Treinando modelo

In [None]:
history = model_LSTM0.fit(X_train_join[0], y_train, epochs=20,
                    batch_size = 32,
                    validation_data= (X_valid_join[0], y_valid),
                    validation_steps=30
                    )

Avaliando modelo

In [None]:
test_loss, test_acc = model_LSTM0.evaluate(X_test_join[0], y_test)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
result = model_LSTM0.predict(X_test_join[0])
result = np.where(result > 0.5, 1, 0)
result

print(classification_report(y_test, result))

In [None]:
confusion_matrix(y_test, result)

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

#### Com stemming

In [None]:
model_LSTM1 = tf.keras.Sequential([
    encoder_1,
    tf.keras.layers.Embedding(
        input_dim=len(encoder_1.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# The model ends in a sigmoid layer, so the loss receives probabilities:
# from_logits must be False.
model_LSTM1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

Treinando modelo

In [None]:
history = model_LSTM1.fit(X_train_join[1], y_train, epochs=10,
                    batch_size = 32,
                    validation_data= (X_valid_join[1], y_valid),
                    validation_steps=30
                    )

Avaliando modelo

In [None]:
test_loss, test_acc = model_LSTM1.evaluate(X_test_join[1], y_test)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
result = model_LSTM1.predict(X_test_join[1])
result = np.where(result > 0.5, 1, 0)
result

print(classification_report(y_test, result))

In [None]:
confusion_matrix(y_test, result)

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

### BERT

In [None]:
from transformers import BertTokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=True)

In [None]:
def convert_example_to_feature(review):
  """Encode one review into fixed-length BERT inputs.

  Uses the module-level ``tokenizer`` and ``max_length``. Every encoding is
  padded and truncated to exactly ``max_length`` tokens so that the lists can
  be stacked into a rectangular tensor.

  Args:
    review: review text.

  Returns:
    The tokenizer output with ``input_ids``, ``token_type_ids`` and
    ``attention_mask``.
  """
  return tokenizer.encode_plus(review,
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                padding = 'max_length', # add [PAD] tokens up to max_length
                truncation = True, # cut reviews longer than max_length
                return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

In [None]:
# can be up to 512 for BERT
max_length = 256
batch_size = 1

In [None]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Pack the three BERT inputs into a feature dict and pair it with the label.

  Returns:
    ``(features, label)`` where ``features`` has the keys ``input_ids``,
    ``token_type_ids`` and ``attention_mask``.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

In [None]:
def encode_examples(texts, labels, limit=-1):
  """Encode texts and labels into a ``tf.data.Dataset`` of BERT inputs.

  Args:
    texts: iterable of review strings.
    labels: iterable of labels aligned with ``texts``.
    limit: when greater than 0, only the first ``limit`` pairs are encoded;
      any other value encodes everything.

  Returns:
    A dataset of ``(features, label)`` pairs, where ``features`` is the dict
    built by ``map_example_to_dict`` and each label is a one-element list.
  """
  # prepare lists, so that we can build up the final TensorFlow dataset from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []
  for position, (text, label) in enumerate(zip(texts, labels)):
    # Stop once the requested number of examples has been encoded.
    if 0 < limit <= position:
      break
    bert_input = convert_example_to_feature(text)
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

#### Sem stemming

In [None]:
# train dataset
ds_train_encoded_0 = encode_examples(X_train_join[0], y_train).shuffle(3).batch(batch_size)

# test dataset
ds_test_encoded_0 = encode_examples(X_test_join[0], y_test).batch(batch_size)

#validation dataset
ds_valid_encoded_0 = encode_examples(X_valid_join[0], y_valid).batch(batch_size)

In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# a few epochs are used for fine-tuning; more might help as long as the model does not overfit
number_of_epochs = 3
# model initialization: same checkpoint id as the tokenizer, so weights and vocabulary match
model_bert_0 = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', from_pt = True)

In [None]:
# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# labels are integer class ids (not one-hot vectors), so use sparse categorical
# cross-entropy and accuracy; from_logits=True because the model output is
# passed through a softmax only at prediction time
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model_bert_0.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
bert_history = model_bert_0.fit(ds_train_encoded_0, epochs=number_of_epochs, validation_data=ds_valid_encoded_0)

In [None]:
tf_output = model_bert_0.predict(ds_test_encoded_0)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive'] #(0:negative, 1:positive)
label = tf.argmax(tf_prediction, axis=1)
label_pred = label.numpy()
print(label_pred)

In [None]:
unique, counts = np.unique(label_pred, return_counts=True)
dict(zip(unique, counts))

In [None]:
print(classification_report(y_test, label_pred))

In [None]:
confusion_matrix(y_test, label_pred)

#### Com stemming

In [None]:
# train dataset
ds_train_encoded_1 = encode_examples(X_train_join[1], y_train).shuffle(3).batch(batch_size)

# test dataset
ds_test_encoded_1 = encode_examples(X_test_join[1], y_test).batch(batch_size)

#validation dataset
ds_valid_encoded_1 = encode_examples(X_valid_join[1], y_valid).batch(batch_size)

In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# a few epochs are used for fine-tuning; more might help as long as the model does not overfit
number_of_epochs = 3
# model initialization: same checkpoint id as the tokenizer, so weights and vocabulary match
model_bert_1 = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', from_pt = True)

In [None]:
# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# labels are integer class ids (not one-hot vectors), so use sparse categorical
# cross-entropy and accuracy; from_logits=True because the model output is
# passed through a softmax only at prediction time
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model_bert_1.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
bert_history = model_bert_1.fit(ds_train_encoded_1, epochs=number_of_epochs, validation_data=ds_valid_encoded_1)

In [None]:
tf_output = model_bert_1.predict(ds_test_encoded_1)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive'] #(0:negative, 1:positive)
label = tf.argmax(tf_prediction, axis=1)
label_pred = label.numpy()
print(label_pred)

In [None]:
unique, counts = np.unique(label_pred, return_counts=True)
dict(zip(unique, counts))

In [None]:
print(classification_report(y_test, label_pred))

In [None]:
confusion_matrix(y_test, label_pred)

## Clustering

In [None]:
df_pp[["reviews", "reviews_pipeline_0", "class"]].head()

In [None]:
df_cluster = df_pp[["reviews", "reviews_pipeline_0", "class"]]

X_join = df_cluster["reviews_pipeline_0"].apply(" ".join)
X_join = X_join.to_numpy()
#df_cluster["reviews_join"] = X_join

In [None]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 2000) 

#List with BoWs (pipeline 0 and 1)
bow_vec = vectorizer.fit_transform(X_join)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

#setup kmeans clustering
kmeans = KMeans(n_clusters = 2, random_state = 420)
#fit the data 
kmeans.fit(bow_vec)

In [None]:
kmeans.cluster_centers_

https://towardsdatascience.com/k-means-clustering-chardonnay-reviews-using-scikit-learn-nltk-9df3c59527f3

In [None]:
### Most common words in each cluster
# Feature indices of the 10 largest centroid coordinates, largest first.
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    top_terms = [words[word] for word in centroid]
    print(str(num) + ' : ' + ', '.join(top_terms))

Usando distância

In [None]:
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, csr_matrix.toarray(bow_vec))
closest

Usando vector quantization (https://stackoverflow.com/questions/21660937/get-nearest-point-to-centroid-scikit-learn)

In [None]:
# centroids: N-dimensional array with your centroids
# points:    N-dimensional array with your data points
closest, distances = vq(kmeans.cluster_centers_, csr_matrix.toarray(bow_vec))
closest

https://stackoverflow.com/questions/39766593/get-element-closest-to-cluster-centroid

In [None]:
from scipy.spatial import cKDTree

def find_k_closest(centroids, data, k=1, distance_norm=2):
    """Find, for each centroid, its k-th nearest data point.

    Arguments:
    ----------
        centroids: (M, d) ndarray
            M - number of clusters
            d - number of data dimensions
        data: (N, d) ndarray
            N - number of data points
        k: int (default 1)
            rank of the nearest neighbour to return (1 = closest)
        distance_norm: int (default 2)
            Minkowski p-norm passed to the KD-tree query:
            1: Manhattan distance (|x| + |y|)
            2: Euclidean distance (sqrt(x^2 + y^2))
            np.inf: maximum distance in any dimension (max(|x|, |y|))

    Returns:
    -------
        indices: (M,) ndarray
            row index in ``data`` of the selected neighbour of each centroid
        values: (M, d) ndarray
            the corresponding rows of ``data``
    """
    tree = cKDTree(data)
    _, neighbour_idx = tree.query(centroids, k, p=distance_norm)
    # For k > 1 the query returns one column per neighbour rank; keep only
    # the last column, i.e. the k-th nearest point.
    indices = neighbour_idx[:, -1] if k > 1 else neighbour_idx
    return indices, data[indices]

indices, values = find_k_closest(kmeans.cluster_centers_, csr_matrix.toarray(bow_vec))

In [None]:
indices

Achar os n mais proximos (https://stackoverflow.com/questions/26795535/output-50-samples-closest-to-each-cluster-center-using-scikit-learn-k-means-libr)

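- As n reviews mais próximas do centróide do cluster 0 (interpretado como "Ruim"; a numeração dos clusters do KMeans é arbitrária e não corresponde necessariamente à coluna `class`)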

In [None]:
n=2
d = kmeans.transform(csr_matrix.toarray(bow_vec))[:, 0] #distancia de cada ponto ao centroide 0
ind0 = np.argsort(d)[::][:n]
ind0
#csr_matrix.toarray(bow_vec)[ind]

In [None]:
df_pp.iloc[ind0[0]]["reviews"]

In [None]:
df_pp.iloc[ind0[1]]["reviews"]

In [None]:
lista_neg = []
for i in ind0.tolist():
    lista_neg.append(df_pp.iloc[i]["reviews"])
lista_neg

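- As n reviews mais próximas do centróide do cluster 1 (interpretado como "Bom"; a numeração dos clusters do KMeans é arbitrária e não corresponde necessariamente à coluna `class`)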

In [None]:
n = 2
d = kmeans.transform(csr_matrix.toarray(bow_vec))[:, 1] #distancia de cada ponto ao centroide 1
ind1 = np.argsort(d)[::][:n]
ind1

In [None]:
df_pp.iloc[ind1[0]]["reviews"]

In [None]:
df_pp.iloc[ind1[1]]["reviews"]

In [None]:
lista_posi = []
for i in ind1.tolist():
    lista_posi.append(df_pp.iloc[i]["reviews"])
lista_posi

Salvando em um df para incluir no dashboard

In [None]:
df_rep = pd.DataFrame(data={"negativo": lista_neg, "positivo": lista_posi})
df_rep.to_csv("reviews_rep.csv", index=False)

## Função para salvar predições do melhor classificador

In [None]:
#X_test["pred"] = result
#df_pred = pd.merge(X_test[["class", "pred"]], df_pp[["reviews", "stars", "dates"]], left_index=True, right_index=True)
#df_pred.to_csv("best_pred.csv", index=False)

def save_results(result, X_test, df_pp, dfname="best_pred.csv"):
    """Write the test predictions next to the original review data as CSV.

    Args:
        result: predicted labels for the rows of ``X_test``, in the same
            order; an (n, 1) array, as returned by the networks, is flattened.
        X_test: test-split DataFrame containing a ``class`` column. It is not
            modified.
        df_pp: full DataFrame with ``reviews``, ``stars`` and ``dates``
            columns, sharing its index with ``X_test``.
        dfname: path of the CSV file to write.
    """
    # Attach the predictions to a copy so the caller's test frame stays unchanged.
    labelled = X_test[["class"]].assign(pred=np.asarray(result).ravel())
    df_pred = pd.merge(labelled, df_pp[["reviews", "stars", "dates"]], left_index=True, right_index=True)
    df_pred.to_csv(dfname, index=False)

save_results(result, X_test, df_pp)

In [None]:
pd.read_csv("best_pred.csv")