In [49]:
import numpy as np
import pandas as pd

df = pd.read_csv('restaurants.csv', low_memory = False)


Our goal is to cluster the data into groups that have minimal separation to better understand what differentiates restaurants.

In [50]:
df.iloc[1]

Unnamed: 0                                                            7
webUrl                https://www.tripadvisor.com/Restaurant_Review-...
website                                 http://www.herve-restaurant.com
features              Reservations, Seating, Serves Alcohol, Accepts...
review_Tags           gastronomic experience, pairing, sommelier, cu...
cuisines              French, Mediterranean, Greek, Japanese Fusion,...
meal_Types                              Dinner, nan, nan, nan, nan, nan
dishes                nan, nan, nan, nan, nan, nan, nan, nan, nan, n...
address               Trion Ierarchon 170 May not be visible from th...
description           Chef's table restaurant providing a 16 course ...
email                                         info@herve-restaurant.com
name                                                   Herve Restaurant
menuWebUrl                https://herve-restaurant.com/menu-philosophy/
numberOfReviews                                                 

Above we can see the data we have for each restaurant. Some potentially useful

In [51]:
df.iloc[1]['review_Tags']

'gastronomic experience, pairing, sommelier, cuisine, dishes, ambience, greece, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan'

In [52]:
df.iloc[1]['features']

'Reservations, Seating, Serves Alcohol, Accepts Credit Cards, Table Service, Outdoor Seating, Street Parking, Wheelchair Accessible, Full Bar, Accepts Mastercard, Accepts Visa, Digital Payments, Free Wifi, Family style, Non-smoking restaurants, Gift Cards Available, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan'

In [53]:
df.iloc[1]['description']

"Chef's table restaurant providing a 16 course tasting experience highlighting season's best ingredients from Greece and abroad."

In [54]:
df.iloc[1]['description']

"Chef's table restaurant providing a 16 course tasting experience highlighting season's best ingredients from Greece and abroad."

In [55]:
### Here we get rid of all the rows that for which the following columns have no data
###

df = df.dropna(subset=['description', 'review_Tags', 'cuisines', 'priceLevel'])

In [56]:
df = df.fillna('')
df = df.replace(to_replace='nan', value='')
df = df.replace(to_replace='nan,', value='')

In [57]:
##Here we go through each column and remove any variations of the string 'nan'

for col in df.columns:
    if pd.api.types.is_string_dtype(df[col]):  # Check for string type
        df[col] = df[col].str.replace(r' nan,', '', regex=True)
        
for col in df.columns:
    if pd.api.types.is_string_dtype(df[col]):  # Check for string type
        df[col] = df[col].str.replace(r' nan', '', regex=True)
        
        
df['cuisines'].iloc[1]


'Italian, Mediterranean, Greek, Healthy, Vegetarian Friendly, Vegan Options, Gluten Free Options,'

In [58]:
df['cuisines'].iloc[15]

'Lebanese, Fast Food, Mediterranean, Healthy, Middle Eastern, Street Food, Vegetarian Friendly, Vegan Options, Halal, Gluten Free Options,'

In [59]:
df.to_csv('restaurants_narrow.csv')

In [31]:
####Now let's try doc2vec to embed these columns:

from gensim.models import Doc2Vec

from gensim.models.doc2vec import TaggedDocument


cols = ['description', 'review_Tags', 'cuisines', 'priceLevel']


##This function takes a particular column and creates an embedding for it usnig the parameters specified

def embed_doc2vec(df, col, vector_size, window, min_count=2, workers=4):
    
    # Prepare a list of tagged documents for Doc2Vec
    documents = [TaggedDocument(doc.split(), [col + '_' + str(idx)]) for idx, doc in df[col].iteritems()]

    # Train a Doc2Vec model for this column
    model = Doc2Vec(documents, vector_size = vector_size, window = window, min_count = min_count, workers = workers, epochs=40)  # Adjust hyperparameters as needed

    # Embed each document in the column
    column_embeddings = [model.infer_vector(doc.words) for doc in documents]

    # Add a new column for the embeddings
    df[col + '_embeddings'] = column_embeddings
    
    return df, model

    
    

In [34]:
df, model = embed_doc2vec(df, 'review_Tags', vector_size = 5, window = 3, min_count=2, workers=4)


In [21]:
#### Let's try some clustering:

from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# Extract embeddings as a NumPy array
embeddings = np.vstack(df['review_Tags_embeddings'].tolist()) 
embeddings = normalize(embeddings)
# Calculate cosine similarity matrix (1 - cosine distance)
dist_matrix = 1 - squareform(pdist(embeddings, metric='cosine'))

# Perform hierarchical clustering with desired linkage
#clustering = AgglomerativeClustering(affinity='precomputed', linkage='average')  # Adjust linkage as needed
#clusters = clustering.fit_predict(dist_matrix)



## K means below for example

kmeans = KMeans(n_clusters=15, random_state=42)  # Set a random state for reproducibility
clusters = kmeans.fit_predict(embeddings)


# Assign cluster labels to the DataFrame
df['cluster'] = clusters

In [22]:
df['cluster'].describe()

count    485.000000
mean       6.573196
std        4.052510
min        0.000000
25%        4.000000
50%        6.000000
75%       10.000000
max       14.000000
Name: cluster, dtype: float64

In [23]:
embeddings.shape

(485, 5)

In [None]:
len(dist_matrix)

In [24]:
dist_matrix

array([[1.        , 0.69705918, 0.46801083, ..., 0.61045663, 0.58840672,
        0.67931719],
       [0.69705918, 1.        , 0.6978383 , ..., 0.92295954, 0.76316313,
        0.94346835],
       [0.46801083, 0.6978383 , 1.        , ..., 0.72954281, 0.94568774,
        0.68904336],
       ...,
       [0.61045663, 0.92295954, 0.72954281, ..., 1.        , 0.70998731,
        0.98998499],
       [0.58840672, 0.76316313, 0.94568774, ..., 0.70998731, 1.        ,
        0.71370894],
       [0.67931719, 0.94346835, 0.68904336, ..., 0.98998499, 0.71370894,
        1.        ]])

In [None]:
df

In [25]:
df['cluster'].value_counts()

4     57
1     51
5     46
8     42
10    35
7     32
9     32
2     31
13    28
11    26
6     24
14    23
3     21
12    20
0     17
Name: cluster, dtype: int64

In [None]:
### Here we try to evaluate the number of clusters and their inertia to try and locate the 'elbow'

sse = {}
for k in range(1, 25):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(embeddings)
    df["cluster"] = kmeans.labels_
    #print(data["clusters"])
    sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

In [30]:
with pd.option_context('display.max_colwidth', None,
                       'display.max_columns', None,
                       'display.max_rows', None):




    display(df[df['cluster'] == 0]['review_Tags'])

33                                              tagine, talagani cheese, meatballs, hummus, short ribs, kataifi, lamb, shrimp, cherry sauce, delicious food, our waiter, excellent restaurant, christos, gazi, dish, euphoria, greece, ingredients, greek, district, twist, heart, acropolis,
40                                                                                                                                      souvlaki, burger, fries, salad, great vegan, quality and quantity, great taste, tasty food, large portions, delivery, meat, vegetarian, pita, waiter,
85                        brisket, smoked meat, mac and cheese, spare ribs, duck, scones, corn on the cob, short ribs, fine dining, an amazing dinner, amazing taste, delicious food, attentive service, outdoor seating, bbq, sides, greece, bite, mouth, twist, neighborhood, heart, euros,
93     falafel, pastitsio, salad, burger, sandwiches, bread, vegan food, great vegan, non vegan, eating meat, greek classics, gluten free, vis

#### Above we tried to cluster using document embeddings that we trained ourselves, but this didn't seem to be successful, the clusters don't seem to reflect much and this could be for several reasons e.g. the inherent data wasn't rich enough, cluster size fine tuning, or even our trained embeddings(not enough data). 

In [36]:
tokens = "vegan vegetarian and live music".split()

new_vector = model.infer_vector(tokens)
sims = model.dv.most_similar([new_vector]) #gives you top 10 document tags and their cosine similarity

In [37]:
sims

[('review_Tags_93', 0.9968696236610413),
 ('review_Tags_218', 0.9967239499092102),
 ('review_Tags_115', 0.9942142963409424),
 ('review_Tags_436', 0.9933612942695618),
 ('review_Tags_787', 0.9903804063796997),
 ('review_Tags_495', 0.9901872873306274),
 ('review_Tags_399', 0.9895277619361877),
 ('review_Tags_910', 0.989235520362854),
 ('review_Tags_489', 0.9888647198677063),
 ('review_Tags_143', 0.9885537624359131)]

In [48]:
df.iloc[218]['description']

'The enchanting ‘’Peacock’’ Roof Garden Restaurant, you can enjoy your dinner or drink year- round regardless weather conditions, overlooking the amazing Acropolis & Acropolis museum.Our guests have an excellent opportunity to try the homemade dishes and desserts, all prepared with natural and pure products that we use for preparing the local cuisine dishes.'