### <div align="center">***RECOMENDADOR***</div>
***

In [1]:
import numpy as np
import pandas as pd
import scipy
import sys
import re


import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:

#!pip install unidecode
sys.path.append(r"Utilities")
import Utilities 

# Limpieza dataset

In [3]:
dataset=pd.read_csv('../data/df_comerios.csv')


In [4]:
dataset=dataset.drop(['image_url','is_closed','review_count','display_phone','location'], axis=1)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           958 non-null    object 
 1   name         958 non-null    object 
 2   rating       958 non-null    float64
 3   address      958 non-null    object 
 4   description  956 non-null    object 
 5   review       463 non-null    object 
 6   alias        958 non-null    object 
 7   latitude     958 non-null    float64
 8   longitude    958 non-null    float64
 9   price        620 non-null    object 
 10  distrito     958 non-null    object 
dtypes: float64(3), object(8)
memory usage: 82.5+ KB


In [6]:
dataset.head(2)

Unnamed: 0,id,name,rating,address,description,review,alias,latitude,longitude,price,distrito
0,GV-WXC3F4MUzwjhAH_f_XA,El Rincón Asturiano,1.0,"Calle de las Delicias, 26, 28045 Madrid, Spain",Spanish Asturian Tapas Bars,We are here for dinner again on two consecutiv...,el-rincón-asturiano-madrid-2,40.403985,-3.692258,€€,Arganzuela
1,7zpK35tqV8uFtg9BGwfbRg,Donde da la Vuelta el Viento,5.0,"Calle de Mesón de Paredes, 81, 28012 Madrid, S...",Tapas Bars Spanish Modern European,Great place with friendly staff. I came for ta...,donde-da-la-vuelta-el-viento-madrid,40.40619,-3.701441,€,Arganzuela


In [7]:
dataset['combinacion'] = dataset[['description', 'review', 'alias','distrito']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [8]:
dataset=dataset.drop(['description', 'review', 'alias','distrito'], axis=1)

In [9]:
dataset.head(2)

Unnamed: 0,id,name,rating,address,latitude,longitude,price,combinacion
0,GV-WXC3F4MUzwjhAH_f_XA,El Rincón Asturiano,1.0,"Calle de las Delicias, 26, 28045 Madrid, Spain",40.403985,-3.692258,€€,Spanish Asturian Tapas Bars We are here for d...
1,7zpK35tqV8uFtg9BGwfbRg,Donde da la Vuelta el Viento,5.0,"Calle de Mesón de Paredes, 81, 28012 Madrid, S...",40.40619,-3.701441,€,Tapas Bars Spanish Modern European Great plac...


# Limpieza de columna analisis

Lo primero separamos el texto en palabras


In [10]:
#Tokenization
dataset['Descripcion_tok'] = dataset['combinacion'].apply(lambda x: Utilities.tokenizacion(x)) 
dataset['Descripcion_tok'].head(10)

0    [spanish, asturian, tapas, bars, are, here, fo...
1    [tapas, bars, spanish, modern, european, great...
2    [asturian, tapas, bars, the, size, everything,...
3    [tapas, bars, tapas/small, plates, beer, bar, ...
4    [spanish, mediterranean, tapas/small, plates, ...
5    [indian, south, asian, cuisine, bangladesh, in...
6    [spanish, tapas/small, plates, stayed, marriot...
7    [thai, legit, thai, food, from, (, thai, ), ju...
8    [chinese, asian, fusion, would, definitely, re...
9    [social, clubs, coffee, &, tea, former, tobacc...
Name: Descripcion_tok, dtype: object

Ahora eliminamos palabras que aportan poco significado: articulos, preposiciones.

In [11]:
dataset['Descripcion_tok'] = dataset['Descripcion_tok'].apply(lambda x:Utilities.removeStopwords(x)) 
dataset['Descripcion_tok'].head(10)

0    [spanish, asturian, tapas, bars, are, here, fo...
1    [tapas, bars, spanish, modern, european, great...
2    [asturian, tapas, bars, the, size, everything,...
3    [tapas, bars, tapas/small, plates, beer, bar, ...
4    [spanish, mediterranean, tapas/small, plates, ...
5    [indian, south, asian, cuisine, bangladesh, in...
6    [spanish, tapas/small, plates, stayed, marriot...
7    [thai, legit, thai, food, from, (, thai, ), ju...
8    [chinese, asian, fusion, would, definitely, re...
9    [social, clubs, coffee, &, tea, former, tobacc...
Name: Descripcion_tok, dtype: object

Ahora eliminamos signos de puntuación 

In [12]:

dataset['Descripcion_tok'] = dataset['Descripcion_tok'].apply(lambda x:Utilities.removePunctuation(x)) 
dataset['Descripcion_tok'].head(10) 

0    [spanish, asturian, tapas, bars, are, here, fo...
1    [tapas, bars, spanish, modern, european, great...
2    [asturian, tapas, bars, the, size, everything,...
3    [tapas, bars, tapas/small, plates, beer, bar, ...
4    [spanish, mediterranean, tapas/small, plates, ...
5    [indian, south, asian, cuisine, bangladesh, in...
6    [spanish, tapas/small, plates, stayed, marriot...
7    [thai, legit, thai, food, from, thai, just, th...
8    [chinese, asian, fusion, would, definitely, re...
9    [social, clubs, coffee, tea, former, tobacco, ...
Name: Descripcion_tok, dtype: object

Ahora unimos todas las palabras, lo convertimos a string.

In [13]:

dataset['Descripcion_clean'] = dataset['Descripcion_tok'].apply(lambda x:Utilities.arrayToString(x))
dataset['Descripcion_clean'].head(10)

0    spanish asturian tapas bars are here for dinne...
1    tapas bars spanish modern european great place...
2    asturian tapas bars the size everything you or...
3    tapas bars tapas/small plates beer bar delicio...
4    spanish mediterranean tapas/small plates the s...
5    indian south asian cuisine bangladesh influenc...
6    spanish tapas/small plates stayed marriott ato...
7    thai legit thai food from thai just the items ...
8    chinese asian fusion would definitely recommen...
9    social clubs coffee tea former tobacco factory...
Name: Descripcion_clean, dtype: object

# Analisis importancia palabras

In [14]:
# Contamos cada palabra y creamos con esta cuenta una matriz

vectorizer = CountVectorizer(encoding='iso-8859-1')  
MatrizFrecuencias = vectorizer.fit_transform(dataset['Descripcion_clean'])
MatrizFrecuencias

<958x4442 sparse matrix of type '<class 'numpy.int64'>'
	with 24284 stored elements in Compressed Sparse Row format>

In [16]:
# Visualizacion matriz de Frecuencias

X = pd.DataFrame(data=MatrizFrecuencias.toarray(), index= dataset['id'].values,
                columns=vectorizer.get_feature_names())
X

Unnamed: 0,abarca,abascal,ability,able,about,above,abroad,absolute,absolutely,abuela,...,zoko,zona,zones,zongzi,zuppa,águila,álbora,ángel,índalo,ñeru
GV-WXC3F4MUzwjhAH_f_XA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7zpK35tqV8uFtg9BGwfbRg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cxypKAKs_zzJ8kvB_6G2Bw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
J6Mq8jWYD9ntHd0u4OQr9A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
om6h-4trsKlw9cOp53QXcg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CoqMKp12tEbL8BBMYujYQw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V05g1M-QgokTD3eEAjBkcw,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ve8ThB2NbLWp_lEKzqQ_3w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
i_0Icrf_U4tRuyC2zDki_A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Porcentaje de ceros por palabras, nos hacemos una idea de que palabras se repiten mucho y tienen menos importancia para el análisis

PorecntajeCerosPorPalabra = X.apply(lambda x: 100*len(np.where(x ==0)[0])/len(x), axis=0).sort_values(ascending=True)
PorecntajeCerosPorPalabra.head(10)

madrid      3.549061
the        54.697286
and        57.202505
spanish    65.344468
tapas      67.849687
was        69.624217
food       70.772443
bars       70.981211
this       72.233820
for        73.590814
dtype: float64

Calculamos el término frecuencia inversa de documento, para calcular la proporción de las veces que aparece cada palabra en relación con el resto de palabras ( matriz de correlación).


In [24]:

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(MatrizFrecuencias)
tfidf

<958x4442 sparse matrix of type '<class 'numpy.float64'>'
	with 24284 stored elements in Compressed Sparse Row format>

Ahora que ya tenemos la similitud entre las palabras, podemos clacular la similitud entre los restaurantes, en funcion de las veces que aparecen estas palabras en cada uno de ellos.

In [25]:
tdm = tfidf.transpose()
dtm = tfidf
Simil = dtm.dot(tdm) #TRANSPONES LA MATRIZ PARA MULTIPLICARLA POR SI MISMA (MATRIZ AL CUADRADO)

Visualizamos el resultado de la matriz de similitud


In [26]:
SimilDF = pd.DataFrame(data = Simil.toarray(), index=dataset['id'].values,columns=dataset['id'].values)
SimilDF

Unnamed: 0,GV-WXC3F4MUzwjhAH_f_XA,7zpK35tqV8uFtg9BGwfbRg,cxypKAKs_zzJ8kvB_6G2Bw,J6Mq8jWYD9ntHd0u4OQr9A,om6h-4trsKlw9cOp53QXcg,q6BFfigGUsHK7PoQi-r2_w,6NsFBZG9XtYOT2ZL5Od80A,z8GLIMlrmt_m_dIdfj0BIQ,NG4lZn1VmumeF8kj9iUXCQ,jUpmOhgn-MsKBS72FunUfQ,...,YVhLhQa3PHRH1Av_lhy_8A,rHMCX-xfuYYtZFr8XB5Y8g,cRlQmWJHU2kcEQPdO9xjfw,ckqHMom0dFxZQ7vHg22BcA,XXH-OUFG9hP011oFElTc2w,CoqMKp12tEbL8BBMYujYQw,V05g1M-QgokTD3eEAjBkcw,ve8ThB2NbLWp_lEKzqQ_3w,i_0Icrf_U4tRuyC2zDki_A,laZCuDjNQqe7AVWUxaYcyA
GV-WXC3F4MUzwjhAH_f_XA,1.000000,0.193134,0.182942,0.220973,0.233619,0.166053,0.257421,0.096760,0.173613,0.049975,...,0.202598,0.117195,0.191519,0.160550,0.179609,0.137176,0.185774,0.083966,0.088614,0.148463
7zpK35tqV8uFtg9BGwfbRg,0.193134,1.000000,0.161779,0.221953,0.174451,0.131867,0.167799,0.106750,0.163643,0.042339,...,0.187897,0.069700,0.168372,0.117557,0.198114,0.059659,0.134033,0.080981,0.107968,0.122360
cxypKAKs_zzJ8kvB_6G2Bw,0.182942,0.161779,1.000000,0.202295,0.144500,0.109684,0.200493,0.084393,0.151952,0.036004,...,0.185488,0.112772,0.108985,0.120911,0.198977,0.085828,0.206149,0.144813,0.107291,0.127792
J6Mq8jWYD9ntHd0u4OQr9A,0.220973,0.221953,0.202295,1.000000,0.232345,0.133987,0.205405,0.118912,0.132505,0.046063,...,0.187574,0.108549,0.126142,0.128369,0.238641,0.071105,0.163239,0.111570,0.085626,0.159794
om6h-4trsKlw9cOp53QXcg,0.233619,0.174451,0.144500,0.232345,1.000000,0.178831,0.219727,0.089371,0.164768,0.050461,...,0.185389,0.067980,0.208355,0.125323,0.185668,0.082750,0.130292,0.058377,0.120942,0.155000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CoqMKp12tEbL8BBMYujYQw,0.137176,0.059659,0.085828,0.071105,0.082750,0.064903,0.103311,0.026581,0.041340,0.007491,...,0.091858,0.075555,0.075895,0.130652,0.066290,1.000000,0.113106,0.064169,0.057906,0.106775
V05g1M-QgokTD3eEAjBkcw,0.185774,0.134033,0.206149,0.163239,0.130292,0.084849,0.120013,0.057859,0.087983,0.030264,...,0.207485,0.098385,0.097367,0.122952,0.263041,0.113106,1.000000,0.122440,0.096687,0.141700
ve8ThB2NbLWp_lEKzqQ_3w,0.083966,0.080981,0.144813,0.111570,0.058377,0.095173,0.108175,0.026705,0.067693,0.058725,...,0.098223,0.078468,0.089474,0.076733,0.127612,0.064169,0.122440,1.000000,0.113765,0.109549
i_0Icrf_U4tRuyC2zDki_A,0.088614,0.107968,0.107291,0.085626,0.120942,0.090665,0.113202,0.070979,0.068878,0.023440,...,0.099107,0.051673,0.071936,0.106821,0.168046,0.057906,0.096687,0.113765,1.000000,0.143745


In [27]:
Top = 10
nombre_restaurante="i_0Icrf_U4tRuyC2zDki_A"
Num_restaurante = SimilDF.index.get_loc(nombre_restaurante)
print('restaurante:',SimilDF.columns[Num_restaurante])
RecomendacionItemItem = SimilDF.iloc[(-SimilDF.iloc[:, Num_restaurante]).argsort()[1:(Top+1)].values, Num_restaurante] 
print ('\n Los restaurantes más similares son:')
RecomendacionItemItem

restaurante: i_0Icrf_U4tRuyC2zDki_A

 Los restaurantes más similares son:


im9SjhagjTKX1Qv5rtK7Ng    0.219304
Oun6nvhWxc7ZOB-a33wdgw    0.204783
ztL1IZFwgMudPl7-RTsq5A    0.203723
_MAM5qBIskt15RxLb8DmMw    0.184562
1F8YgN91nkcj4aGSnBpOPQ    0.183662
afVDx6d0ZTh4cZvsiSzF1g    0.175082
i6WJB9WVGKItw2o7-vEEpw    0.174910
v0nC3M8QUBO7pIEHQf-zeA    0.172783
pIJsygG-dMFs8xu7PJeJSA    0.171392
lDpp4WL4Mngq2fjgY7xKWg    0.171229
Name: i_0Icrf_U4tRuyC2zDki_A, dtype: float64

In [28]:
import utm
import folium
from folium import plugins 


In [29]:
map_madrid = folium.Map(location=[40.427919,-3.680877], zoom_start=14)

for (index, row) in dfDistritoX.iterrows():
    folium.Marker(location = [row.loc["latitude"], row.loc["longitude"]],
    #popup = row.loc["rotulo"] + " " + row["desc_distrito_local"],
    tooltip = "click").add_to(map_madrid)

map_madrid

NameError: name 'dfDistritoX' is not defined