In [1]:
%%capture
!pip install -r ../requirements.txt

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import unicodedata
import fasttext
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from wordcloud import WordCloud

In [3]:
# Load the raw review dataset. Later cells read its 'rating', 'title' and
# 'review_text' columns.
df_raw = pd.read_csv('../data/raw/Big_AHR.csv')

In [4]:
# Negative reviews only: keep the rows whose 'rating' is 2 or lower.
df_negative = df_raw.loc[df_raw['rating'].le(2)]

In [5]:
%%capture
!pip install keybert

In [6]:
from keybert import KeyBERT

In [7]:
# Keyword extractor used by key_phrases() further down, configured with the
# 'all-mpnet-base-v2' model. The "Downloading" progress bars in this cell's
# output appear to be the model files being fetched on first use.
kw_model = KeyBERT(model='all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [8]:
def normalize_str(text_string):
    """Return ``text_string`` reduced to plain ASCII.

    The text is decomposed with Unicode NFD so accented letters split into a
    base letter plus combining marks; every non-ASCII character (including
    those marks) is then dropped. ``None`` is passed through unchanged.
    """
    if text_string is None:
        return None
    decomposed = unicodedata.normalize('NFD', text_string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()

In [9]:
def non_alphanumeric(texto):
    """Replace each run of non-word characters in ``texto`` with one space.

    "Word characters" follow the regex ``\\w`` class, so letters, digits and
    underscores are kept; everything else is treated as a separator.
    """
    separator_run = re.compile("(\\W)+")
    return separator_run.sub(" ", texto)

In [10]:
# Fetch the NLTK resources used below: the 'stopwords' corpus (read through
# stopwords.words) and the 'punkt' tokenizer package (presumably what
# word_tokenize relies on; confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/gitpod/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Start from NLTK's Spanish stop-word list.
stop_words = stopwords.words('spanish')

# Take 'no', 'sin' and 'nada' out of the list so they survive stop-word
# filtering.
for negation in ('no', 'sin', 'nada'):
    stop_words.remove(negation)

# Treat 'hotel' as a stop word as well.
stop_words.append('hotel')

# Strip accents from the stop words so they match text that has already been
# passed through normalize_str.
stop_words = [normalize_str(entry) for entry in stop_words]

In [12]:
def remove_stopwords(text_string):
    """Return ``text_string`` with every token found in ``stop_words`` removed.

    The text is split with ``word_tokenize`` and the surviving tokens are
    joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text_string):
        if token in stop_words:
            continue
        kept_tokens.append(str(token))
    return ' '.join(kept_tokens)

In [13]:
# Pull the title and body of every negative review into plain Python lists,
# aligned by position.
title, review = (df_negative[column].tolist() for column in ('title', 'review_text'))

In [14]:
def key_phrases(title, review):
    """Extract key phrases from each review's title and body.

    Each title/review pair is joined with a space, lower-cased, stripped of
    accents and non-word characters, filtered for stop words, and passed to
    ``kw_model.extract_keywords`` asking for the top 2 phrases of 1 to 3 words.

    Parameters
    ----------
    title : sequence
        Review titles, aligned by position with ``review``.
    review : sequence
        Review bodies.

    Returns
    -------
    list
        The extracted phrases of all documents, in input order.

    Notes
    -----
    Fields that are not strings (for example the float NaN pandas produces for
    an empty CSV cell) are ignored instead of raising ``TypeError`` during
    concatenation. A pair left with no text after cleaning contributes no
    phrases. Pairs are formed positionally; surplus entries in the longer
    sequence are ignored.
    """
    phrases = []
    for doc_title, doc_review in zip(title, review):
        # Keep only real text; with two strings this is exactly
        # `title + ' ' + review`.
        parts = [part for part in (doc_title, doc_review) if isinstance(part, str)]
        text = remove_stopwords(non_alphanumeric(normalize_str(' '.join(parts).lower())))
        if not text.strip():
            # Nothing left to analyse for this document.
            continue
        keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), highlight=False, top_n=2)
        # `keywords` holds (phrase, score) pairs; iterating the dict yields the
        # phrases, de-duplicated, in their original order.
        phrases.extend(dict(keywords))
    return phrases

In [16]:
# Run the extraction over every negative review.
# NOTE: this rebinds the name `key_phrases` from the function to its result
# list, so re-running this cell without first re-running the cell that defines
# the function raises "TypeError: 'list' object is not callable".
key_phrases = key_phrases(title=title, review=review)

In [17]:
# Store the extracted phrases in a single-column frame ('text') and write it
# to the processed-data folder. The default integer index is written as the
# first CSV column.
df_key_phrases = pd.DataFrame(data=key_phrases, columns=['text'])
df_key_phrases.to_csv(path_or_buf='../data/processed/negative_key_phrases.csv')