In [1]:
import json 
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
from nltk.tokenize import word_tokenize
import gensim.corpora as corpora

stopwords = set(nltk.corpus.stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')



# We will now define preprocessing functions for text. These will reduce the huge size of the vocabulary for vectorization. Each review will be converted to a vector where each component will be the tf-idf corresponding to a token in this reduced vocabulary. 

In [2]:
def preprocess(document):
    """preprocess each review by 1)converting to lowercase,2) removing punctuation symbols and non-alphabetic characters"
    3) removing spanish stopwords and non-alphabetic characters
    4) keeping only the stems of the words ("buenos"->"buen")
    """
    document = document.lower()
    document = document.translate(str.maketrans('', '', string.punctuation))
    stopwordremoval = " ".join(
        [i for i in document.lower().split() if i not in stopwords and i.isalpha()])
    processed_text = [stemmer.stem(i) for i in word_tokenize(stopwordremoval)]
    return processed_text

def tokenize_reviews(df_reviews):
    # generating documents in a tokenized-stemmed format
    docs = df_reviews["body"].apply(preprocess)

    docs = docs.tolist()
    # Create Dictionary
    id2word = corpora.Dictionary(docs)
    print("length of vocab before filtering", len(id2word))

    # vocab_list contains the vocabulary
    vocab_list = [k for k in id2word.token2id.keys()]
    print("length of vocab_list before filtering", len(vocab_list))

    id2word.filter_extremes(no_below=3, no_above=0.90)
    print("length of vocab after filtering extremes", len(id2word))

    # vocab_list contains the filtered vocabulary
    vocab_list = [k for k in id2word.token2id.keys()]

    print("length of vocab after filtering extreme-words", len(vocab_list))

    # removing very short token stems
    vocab_list = [x for x in vocab_list if len(x) > 2]
    print("length of vocab after very short tokens", len(vocab_list))

    # keeping in docs only words in vocab_list (remove the filtered extreme words)
    filtered_docs = []
    for doc in docs:
        filtered_doc = []
        for elem in doc:
            if elem in vocab_list:
                filtered_doc.append(elem)
        filtered_docs.append(filtered_doc)

    reviews_tokenized = filtered_docs
    return reviews_tokenized



# Next we will read the reviews file

In [3]:
path_reviews = os.path.join("resources", "reviews.json")

with open(path_reviews) as json_file:
    reviews = json.load(json_file)

df_reviews = pd.DataFrame(reviews)

In [4]:
reviews_tokenized = tokenize_reviews(df_reviews)

length of vocab before filtering 4074
length of vocab_list before filtering 4074
length of vocab after filtering extremes 1737
length of vocab after filtering extreme-words 1737
length of vocab after very short tokens 1717


# Before filtering the vocabulary word-stock there were 4074 vocabulary terms, with our approach (stemming,removing stopwords,etc) these have been reduced to 1717 tokens ; thus reducing the huge dimensionality of the review vectors

In [5]:
df_reviews["review_tokens"] = reviews_tokenized

In [6]:
def list_to_string(list_text):
    string_text = ' '.join(list_text)
    return string_text
    

In [7]:
df_reviews["review_tokens"] = df_reviews["review_tokens"].apply(list_to_string)

In [8]:
df_reviews

Unnamed: 0,body,uid,review_tokens
0,Buena selección de ostras.,00000f8808a9789cfe57be5884ff1ad5c3b96580,buen seleccion ostras
1,"Tiene gran variedad de tapas a 2,50 de gran ca...",000010f29b5d65ad7c073acc31e327dc3ff9af54,gran varied tap gran calid
2,"Buen ambiente, trato excelente y jamón exquisito.",0000341606a7b258a202b225bb60bb615171fd18,buen ambient trat excelent jamon exquisit
3,Las pasta correcta pero es cara para las racio...,00003bb8ac6d31908a02cff8e372fd3434545d9a,past correct car racion
4,"Si he estado unas 50 veces, NUNCA, he salido m...",0000c9cca3bca013e9fd7afcc7f7bc3312dfb917,unas vec nunc sal intent nombr eleg buen excel...
...,...,...,...
4995,"El clásico bar de tapas de toda la vida, con e...",f6aa4efe7ea814ce71c5697432375a42669a4f1f,clasic bar tap tod vid excelent materi prim la...
4996,Hoy hemos repetido y he querido compartir mi o...,617f4bbdf90ecd26b24fb2c7952b75e7d0181f8d,hoy repet quer compart opinion leid comentari ...
4997,"No quiero ni pensar, sin descuento: 35-40 euro...",76deeb2b327c2f1ff5340357092d15d540e055cc,quier pens descuent eur sal practic igual entr...
4998,"Quedaron como lo que son, unos señores del pal...",13332e163f0ff8263d7836f106186b15a22d1ede,qued señor palad buen hac trabaj bien junt art...


In [9]:
vectorizer = TfidfVectorizer()
X = df_reviews["review_tokens"]
X_vect = vectorizer.fit_transform(X)

In [10]:
vectorizer.get_feature_names()[600:610]

['ejempl',
 'elabor',
 'eleccion',
 'eleg',
 'elev',
 'elig',
 'ello',
 'embarg',
 'embut',
 'empan']

In [11]:
def cosine_similarity(index1,index2):
    """this function takes as input the indices of two reviews,returns the similarity score between them"""
    dot_product = np.dot(X_vect[index1],np.transpose(X_vect[index2]))
    return round(dot_product.todense()[0,0],3)

# Given that the number of reviews is roughly 5000, comparing all these reviews one to one is a bit cumbersome computationally. Thus we will check our vectorization by computing the similarities between the first 500 reviews only, creating a table that for each pair of reviews gives their similarity score

In [12]:
similarity = []
for i in range(len(df_reviews.head(500))):
    for j in range(i,len(df_reviews.head(500))):
        if i!=j:
            d_ij = cosine_similarity(i,j)
            row = (i,j,d_ij)
            similarity.append(row)
    

In [13]:
similarity = pd.DataFrame(similarity,columns=["i","j","similarity"])

Let us rank the table by similarity and let us see whether the most similar pairs of reviews in this sample share a likeness in their words

In [14]:
similarity.sort_values(by="similarity",ascending=False).head(20)

Unnamed: 0,i,j,similarity
69485,167,181,0.851
122199,428,434,0.842
68493,164,188,0.84
110319,329,434,0.839
19392,40,253,0.836
120146,403,456,0.832
106729,309,434,0.828
13670,28,105,0.822
52087,118,227,0.789
35504,77,85,0.787


# Let us compare for instance reviews 118 and 354 which have a cosine-similarity score of 0.773

In [15]:
i = 118
print("original text : ",df_reviews.body.iloc[i])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[i])

original text :  Además tiene unas vistas fantásticas al mar.
processed-tokenized text : ademas unas vist fantast mar


In [16]:
j= 354
print("original text : ",df_reviews.body.iloc[j])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[j])

original text :  Y unas vistas fantásticas.
processed-tokenized text : unas vist fantast


# One can clearly see that the two reviews are indeed very alike. They both mention the "fantastic views" 

In [17]:
i= 167
print("original text : ",df_reviews.body.iloc[i])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[i])

original text :  Se come estupendamente y a precios asequibles.
processed-tokenized text : com estupend preci asequ


In [18]:
j = 181
print("original text : ",df_reviews.body.iloc[j])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[j])

original text :  Estupendas tapas a precios asequibles.
processed-tokenized text : estupend tap preci asequ


# This second example, for reviews i = 161 and 187 (with cosine similarity=0.852) also shows that they are indeed quite similar, both of them mentioning the "affordable prices" and using the root "estupendo"

In [19]:
similarity.sort_values(by="similarity",ascending=False).tail(5)

Unnamed: 0,i,j,similarity
43341,95,497,0.0
43340,95,496,0.0
43338,95,494,0.0
43337,95,493,0.0
124749,498,499,0.0


# On the other hand let us check that reviews with similarity_score equal to zero are indeed totally unlike. In the example below for instance reviews 95 and 497 use completely different words in their description.

In [20]:
i= 95
print("original text : ",df_reviews.body.iloc[i])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[i])

original text :  Encargamos una tarta aquí y el resultado fue espectacular.
processed-tokenized text : encarg tart aqu result espectacul


In [21]:
j= 497
print("original text : ",df_reviews.body.iloc[j])
print("processed-tokenized text :",df_reviews.review_tokens.iloc[j])

original text :  ..solo cumple con lo de pudin.
processed-tokenized text : sol cumpl pudin
