<a href="https://colab.research.google.com/github/m4nko/TT050-MineracaoDeTexto/blob/main/TarefaMineracaoDeTexto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tarefa Mineração de Texto - Reviews de Hotéis
### Carlos Eduardo de Andrade Pereira - 168321

In [5]:
# Load the hotel reviews dataset and preview its first rows

import pandas as pd

REVIEWS_CSV = 'tripadvisor_hotel_reviews.csv'
dt = pd.read_csv(REVIEWS_CSV)
dt.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [6]:
# Number of rows in the dataset

dt.shape[0]

20491

## Passo 1: Preparar as reviews

In [7]:
# Lower-case the text and blank out every word that contains a digit
# re = Regular Expression

import re

_WORD_WITH_DIGIT = re.compile(r"""\w*\d\w*""")


def lower_alpha(x):
    """Lower-case ``x`` and replace each word containing a digit with one space."""
    return _WORD_WITH_DIGIT.sub(' ', x.lower())
# Overwrite the Review column with its lower-cased, digit-free text and preview it
dt['Review'] = dt['Review'].apply(lower_alpha)

dt.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not * experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [8]:
# Strip punctuation

import string

_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))


def without_ponc(x):
    """Replace every ``string.punctuation`` character in ``x`` with one space."""
    return _PUNCTUATION.sub(' ', x)
# Overwrite the Review column with its punctuation-free text and preview it
dt['Review'] = dt['Review'].apply(without_ponc)

dt.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not experience hotel monaco seat...,3
3,unique great stay wonderful time hotel monac...,5
4,great stay great stay went seahawk game aweso...,5


In [9]:
# Spot-check one full review to confirm no 'br' fragments are left in the text

dt.loc[1, 'Review']

'ok nothing special charge diamond member hilton decided chain shot   anniversary seattle  start booked suite paid extra website description not  suite bedroom bathroom standard hotel room  took printed reservation desk showed said things like tv couch ect desk clerk told oh mixed suites description kimpton website sorry free breakfast  got kidding  embassy suits sitting room bathroom bedroom unlike kimpton calls suite    day stay offer correct false advertising  send kimpton preferred guest website email asking failure provide suite advertised website reservation description furnished hard copy reservation printout website desk manager duty did not reply solution  send email trip guest survey did not follow email mail  guess tell concerned guest the staff ranged indifferent not helpful  asked desk good breakfast spots neighborhood hood told no hotels  gee best breakfast spots seattle     block away convenient hotel does not know exist  arrived late night   pm inside run bellman busy c

In [10]:
# Tokenize each review into a list of word tokens
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Fetch it here so this cell also
# runs on a fresh kernel instead of relying on a download made in an earlier session.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

dt['Token'] = dt.Review.map(word_tokenize)

In [11]:
# Remove English stop words from the token lists
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# `stop_words` is a list, so `in` scans it linearly for every token of every
# review. A frozenset gives constant-time membership tests with identical results.
stop_word_set = frozenset(stop_words)


def stop_lambda(x):
    """Return the tokens of ``x`` that are not English stop words, in order."""
    return [y for y in x if y not in stop_word_set]


dt['Tokens_stWords'] = dt.Token.apply(stop_lambda)
dt.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Review,Rating,Token,Tokens_stWords
0,nice hotel expensive parking got good deal sta...,4,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d..."
1,ok nothing special charge diamond member hilto...,2,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member..."
2,nice rooms not experience hotel monaco seat...,3,"[nice, rooms, not, experience, hotel, monaco, ...","[nice, rooms, experience, hotel, monaco, seatt..."
3,unique great stay wonderful time hotel monac...,5,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ..."
4,great stay great stay went seahawk game aweso...,5,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."


In [12]:
# Part-of-speech tagging

from nltk.tag import pos_tag


def pos_lambda(x):
    """Tag every token in the list ``x`` with its part of speech."""
    return pos_tag(x)


dt['Tokens_pos'] = dt['Tokens_stWords'].apply(pos_lambda)
dt.head()

Unnamed: 0,Review,Rating,Token,Tokens_stWords,Tokens_pos
0,nice hotel expensive parking got good deal sta...,4,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d...","[(nice, JJ), (hotel, NN), (expensive, JJ), (pa..."
1,ok nothing special charge diamond member hilto...,2,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member...","[(ok, JJ), (nothing, NN), (special, JJ), (char..."
2,nice rooms not experience hotel monaco seat...,3,"[nice, rooms, not, experience, hotel, monaco, ...","[nice, rooms, experience, hotel, monaco, seatt...","[(nice, JJ), (rooms, NNS), (experience, VBP), ..."
3,unique great stay wonderful time hotel monac...,5,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ...","[(unique, JJ), (great, JJ), (stay, NN), (wonde..."
4,great stay great stay went seahawk game aweso...,5,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game...","[(great, JJ), (stay, NN), (great, JJ), (stay, ..."


In [13]:
# Stemming
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')


def stem_lambda(x):
    """Return the Snowball stem of every token in the list ``x``."""
    return list(map(stemmer.stem, x))


dt['Tokens_stem'] = dt['Tokens_stWords'].apply(stem_lambda)
dt.head()

Unnamed: 0,Review,Rating,Token,Tokens_stWords,Tokens_pos,Tokens_stem
0,nice hotel expensive parking got good deal sta...,4,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d...","[(nice, JJ), (hotel, NN), (expensive, JJ), (pa...","[nice, hotel, expens, park, got, good, deal, s..."
1,ok nothing special charge diamond member hilto...,2,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member...","[(ok, JJ), (nothing, NN), (special, JJ), (char...","[ok, noth, special, charg, diamond, member, hi..."
2,nice rooms not experience hotel monaco seat...,3,"[nice, rooms, not, experience, hotel, monaco, ...","[nice, rooms, experience, hotel, monaco, seatt...","[(nice, JJ), (rooms, NNS), (experience, VBP), ...","[nice, room, experi, hotel, monaco, seattl, go..."
3,unique great stay wonderful time hotel monac...,5,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ...","[(unique, JJ), (great, JJ), (stay, NN), (wonde...","[uniqu, great, stay, wonder, time, hotel, mona..."
4,great stay great stay went seahawk game aweso...,5,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game...","[(great, JJ), (stay, NN), (great, JJ), (stay, ...","[great, stay, great, stay, went, seahawk, game..."


In [14]:
# Lematização
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

# Translate Treebank POS tags into the POS constants that WordNet recognises
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for ``treebank_tag``, defaulting to noun.

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb;
    every other tag (noun tags included) maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()


def lemmatizer_fun(x):
    """Lemmatize one ``(word, wordnet_pos)`` pair."""
    word, pos = x
    return lemmatizer.lemmatize(word, pos)


# Convert each tag to its WordNet equivalent and lemmatize the word in one pass
dt['Tokens_lemma'] = dt['Tokens_pos'].apply(
    lambda tagged: [lemmatizer_fun((word, get_wordnet_pos(tag))) for word, tag in tagged]
)


dt.head().T

Unnamed: 0,0,1,2,3,4
Review,nice hotel expensive parking got good deal sta...,ok nothing special charge diamond member hilto...,nice rooms not experience hotel monaco seat...,unique great stay wonderful time hotel monac...,great stay great stay went seahawk game aweso...
Rating,4,2,3,5,5
Token,"[nice, hotel, expensive, parking, got, good, d...","[ok, nothing, special, charge, diamond, member...","[nice, rooms, not, experience, hotel, monaco, ...","[unique, great, stay, wonderful, time, hotel, ...","[great, stay, great, stay, went, seahawk, game..."
Tokens_stWords,"[nice, hotel, expensive, parking, got, good, d...","[ok, nothing, special, charge, diamond, member...","[nice, rooms, experience, hotel, monaco, seatt...","[unique, great, stay, wonderful, time, hotel, ...","[great, stay, great, stay, went, seahawk, game..."
Tokens_pos,"[(nice, JJ), (hotel, NN), (expensive, JJ), (pa...","[(ok, JJ), (nothing, NN), (special, JJ), (char...","[(nice, JJ), (rooms, NNS), (experience, VBP), ...","[(unique, JJ), (great, JJ), (stay, NN), (wonde...","[(great, JJ), (stay, NN), (great, JJ), (stay, ..."
Tokens_stem,"[nice, hotel, expens, park, got, good, deal, s...","[ok, noth, special, charg, diamond, member, hi...","[nice, room, experi, hotel, monaco, seattl, go...","[uniqu, great, stay, wonder, time, hotel, mona...","[great, stay, great, stay, went, seahawk, game..."
Tokens_lemma,"[nice, hotel, expensive, parking, get, good, d...","[ok, nothing, special, charge, diamond, member...","[nice, room, experience, hotel, monaco, seattl...","[unique, great, stay, wonderful, time, hotel, ...","[great, stay, great, stay, go, seahawk, game, ..."


## Passo 2: Análise de Sentimentos

In [15]:
# Reshape the dataset for sentiment analysis: drop 3-star reviews and label the rest

import numpy as np

# Take an explicit copy of the filtered rows. Adding a column to a boolean-mask
# slice of the original frame triggers pandas' SettingWithCopyWarning.
dt = dt[dt.Rating != 3].copy()
dt['Feeling'] = np.where(dt['Rating'] >= 4, 'positive', 'negative')

dt = dt[['Review', 'Feeling']]
dt.head()

Unnamed: 0,Review,Feeling
0,nice hotel expensive parking got good deal sta...,positive
1,ok nothing special charge diamond member hilto...,negative
3,unique great stay wonderful time hotel monac...,positive
4,great stay great stay went seahawk game aweso...,positive
5,love monaco staff husband stayed hotel crazy w...,positive


In [16]:
# The reviews skew positive: only about 17.6% of the remaining reviews are labelled negative.
dt.Feeling.value_counts(normalize=True)

positive    0.824439
negative    0.175561
Name: Feeling, dtype: float64

In [17]:
# Prepare the model inputs: split the reviews and their labels into train / test sets

reviews = dt['Review']
feeling = dt['Feeling']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    reviews,
    feeling,
    test_size=0.2,
    random_state=40,
)

# Shapes of the train / test partitions
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

((14645,), (14645,), (3662,), (3662,))

In [18]:
# Build the first document-term matrix (term counts, English stop words removed)

from sklearn.feature_extraction.text import CountVectorizer

matrix1 = CountVectorizer(stop_words='english')

X_train_matrix1 = matrix1.fit_transform(X_train)
X_test_matrix1  = matrix1.transform(X_test)

# Read the shape straight from the sparse matrix. Calling .toarray() first would
# allocate the full dense documents-by-vocabulary array just to report its size.
print(X_train_matrix1.shape)

(14645, 41473)


In [19]:
# Print the stored entries of the sparse training matrix
print(X_train_matrix1)

  (0, 16148)	1
  (0, 17695)	2
  (0, 12945)	1
  (0, 32555)	1
  (0, 34749)	1
  (0, 24558)	1
  (0, 7359)	1
  (0, 20925)	1
  (0, 15953)	1
  (0, 20156)	1
  (0, 15886)	1
  (0, 14834)	1
  (0, 11416)	1
  (0, 694)	1
  (0, 12788)	2
  (0, 7449)	1
  (0, 10139)	1
  (0, 5181)	1
  (0, 434)	1
  (0, 8473)	1
  (0, 21347)	1
  (0, 22056)	2
  (0, 22906)	1
  (0, 34719)	1
  (0, 11293)	1
  :	:
  (14644, 34592)	1
  (14644, 18254)	1
  (14644, 8986)	1
  (14644, 3806)	1
  (14644, 32600)	1
  (14644, 6586)	1
  (14644, 35815)	1
  (14644, 26479)	1
  (14644, 36292)	1
  (14644, 24181)	1
  (14644, 34414)	1
  (14644, 17137)	2
  (14644, 28232)	1
  (14644, 22384)	1
  (14644, 37628)	1
  (14644, 8400)	1
  (14644, 18260)	1
  (14644, 36552)	1
  (14644, 39119)	1
  (14644, 39118)	1
  (14644, 17005)	1
  (14644, 31743)	3
  (14644, 810)	1
  (14644, 35025)	1
  (14644, 228)	1


In [1]:
# Build the second document-term matrix: unigrams and bigrams, binary indicators
matrix2 = CountVectorizer(ngram_range=(1,2), binary=True, stop_words='english')

X_train_matrix2 = matrix2.fit_transform(X_train)
X_test_matrix2  = matrix2.transform(X_test)

# Read the shape straight from the sparse matrix. Calling .toarray() first would
# allocate the full dense documents-by-vocabulary array just to report its size.
print(X_train_matrix2.shape)

NameError: ignored

In [2]:
# Print the stored entries of the second sparse training matrix
print(X_train_matrix2)

NameError: ignored

In [3]:
# Create a logistic-regression model with the estimator's default arguments
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Fit the model on the first matrix and predict the test set
y_pred_matrix1 = lr.fit(X_train_matrix1, Y_train).predict(X_test_matrix1)

In [None]:
# Refit the same `lr` estimator on the second matrix and predict the test set
y_pred_matrix2 = lr.fit(X_train_matrix2, Y_train).predict(X_test_matrix2)

In [None]:
# Função para calcular as métricas de erro, já que faremos multiplas vezes

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from __future__ import division
%matplotlib inline

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def conf_matrix(actual, predicted):
    """Plot a binary confusion matrix and return summary classification metrics.

    Draws the confusion matrix of ``actual`` versus ``predicted`` as an annotated
    seaborn heatmap on the current axes, prints the unrounded accuracy, and
    returns the metrics rounded to three decimals.

    Args:
        actual: True labels, as accepted by ``sklearn.metrics.confusion_matrix``.
        predicted: Predicted labels, aligned with ``actual``.

    Returns:
        list: ``[accuracy, precision, recall, f1]``. A metric whose denominator
        is zero (for example precision when nothing is predicted positive) is
        reported as 0.0 instead of nan or a ZeroDivisionError.

    Raises:
        ValueError: If the confusion matrix is not 2x2, because unpacking its
            four cells fails.
    """
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, xticklabels=['predicted_negative', 'predicted_positive'],
                yticklabels=['actual_negative', 'actual_positive'], annot=True,
                fmt='d', annot_kws={'fontsize':20}, cmap="YlGnBu")

    # confusion_matrix puts actual classes on rows and predicted classes on
    # columns, with labels in sorted order. The tick labels above therefore
    # assume the negative class sorts first (true for 'negative' / 'positive').
    # Plain ints keep the arithmetic below free of numpy nan/inf results.
    true_neg, false_pos, false_neg, true_pos = (int(count) for count in cm.ravel())

    correct = true_pos + true_neg
    total = correct + false_pos + false_neg

    accuracy = _safe_ratio(correct, total)
    print(accuracy)

    precision = _safe_ratio(true_pos, true_pos + false_pos)
    recall = _safe_ratio(true_pos, true_pos + false_neg)
    # Use the unrounded precision and recall so rounding error is not compounded
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    cm_results = [round(metric, 3) for metric in (accuracy, precision, recall, f1)]
    return cm_results

In [None]:
# Compute the error metrics of both models and compile them into one dataframe for comparison

# `cm1` and `cm2` were never assigned anywhere in the notebook, so compute them here.
# A fresh figure per call keeps the two heatmaps from being drawn on the same axes.
plt.figure()
cm1 = conf_matrix(Y_test, y_pred_matrix1)
plt.figure()
cm2 = conf_matrix(Y_test, y_pred_matrix2)

results = pd.DataFrame(
    {'LogReg1': cm1, 'LogReg2': cm2},
    index=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)
results