### Building a recommendation system with restaurants in London

### Ideas

#### < Maybe find a map dataset or something to get geo data for each restaurant, would be nice to have a cuisine datapoint too

#### < NLP angle with the reviews textual data

#### < Can use date as a factor: the more recent the review, the more weight it should have

#### < What does good look like? Like what is a 'good' recommendation?

_______________________________________________________________________

### Factor Release Plan

#### Release 1

##### 1. Initial recommender based on review value and volume
##### 2. Involve date as a factor of review reliability
##### 3. Introduce textual NLP analysis of reviews as another way of weighting
##### 4. Bring in geo data for people to select areas or something?
##### 5. Would be nice to webscrape cuisine as a datapoint too
##### 6. Could also include author id as a factor - potentially someone who posts lots of reviews is more reliable?

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import dateutil.parser as parser
from datetime import datetime, date, timedelta
import torch
import skorch
import scipy
import torch.nn as nn
import torch
import torch.nn.functional as F
import sys
from skorch.helper import DataFrameTransformer
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer
from skorch.callbacks import EarlyStopping
from sklearn.pipeline import Pipeline
from skorch import NeuralNetRegressor
import pickle
import emoji
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the raw London reviews CSV (the path is relative to the current working directory).
initial_df = pd.read_csv('Data/London_reviews.csv')

  initial_df = pd.read_csv('Data/London_reviews.csv')


In [3]:
# Print the loaded frame to check its columns and row count.
print(initial_df)

       Unnamed: 0 parse_count       restaurant_name rating_review    sample  \
0               0           1  Cocotte_Notting_Hill           5.0  Positive   
1               1           2  Cocotte_Notting_Hill           5.0  Positive   
2               2           3  Cocotte_Notting_Hill           5.0  Positive   
3               3           4  Cocotte_Notting_Hill           5.0  Positive   
4               4           5  Cocotte_Notting_Hill           5.0  Positive   
...           ...         ...                   ...           ...       ...   
996562     999995      999996       The_Old_Brewery           4.0  Positive   
996563     999996      999997       The_Old_Brewery           2.0  Negative   
996564     999997      999998       The_Old_Brewery           5.0  Positive   
996565     999998      999999       The_Old_Brewery           5.0  Positive   
996566     999999     1000000       The_Old_Brewery           3.0  Negative   

               review_id                           

In [4]:
# Remove the 'Unnamed: 0' and 'parse_count' columns, then keep only the
# reviews that have a restaurant URL.
df = (
    initial_df
    .drop(columns=['Unnamed: 0', 'parse_count'])
    .dropna(subset=['url_restaurant'])
)
# Readable restaurant name: every underscore is replaced by a space.
df['restaurant_name_clean'] = df['restaurant_name'].map(
    lambda raw_name: str(raw_name).replace('_', ' ')
)


In [7]:
# Load the three Google Maps CSV files, dropping fully duplicated rows from
# each one (the first occurrence is kept).
gmaps1 = pd.read_csv('Data/Restaurant_Location_Details_2.0.csv').drop_duplicates(keep='first')
gmaps2 = pd.read_csv('Data/Restaurant_Location_Additional_Details.csv').drop_duplicates(keep='first')
gmaps3 = pd.read_csv('Data/Restaurant_Cuisine_and_Loc.csv').drop_duplicates(keep='first')

# Left-join the location details onto the additional details via place_id.
gmapsdata = pd.merge(gmaps2, gmaps1, how='left', on='place_id')

# Left-join that combined table onto the cuisine/location table, matching
# gmaps3's 'id' column against 'place_id'.
gmapsdata2 = pd.merge(
    gmaps3,
    gmapsdata,
    how='left',
    left_on='id',
    right_on='place_id',
)


In [8]:
# Left-join the Google Maps table onto the reviews by exact name match
# (cleaned review name against the 'TA_Names' column); reviews without a
# match are kept.
df_merged = pd.merge(
    df,
    gmapsdata2,
    how='left',
    left_on='restaurant_name_clean',
    right_on='TA_Names',
)

# Write the merged dataset to disk.
df_merged.to_csv('Data/Merged_TA_Gmaps_Dataset_2.0.csv')

In [9]:
# Count how many rows of df_merged carry each primaryType value.
df_merged.groupby('primaryType').agg({'primaryType': 'count'})

Unnamed: 0_level_0,primaryType
primaryType,Unnamed: 1_level_1
american_restaurant,15443
art_gallery,271
bakery,1835
bar,104554
barbecue_restaurant,4110
beauty_salon,298
brazilian_restaurant,1849
breakfast_restaurant,3439
brunch_restaurant,847
cafe,15107


In [13]:
# merging the main Tripadvisor dataset with the new Google Maps data sourced from the API scripts

# as some of the restaurant names in the Tripadvisor dataset differ slightly from those recorded
# on Google Maps (for example, "Genzo Greek" on Tripadvisor is "GENZO" on Google Maps), we need to
# perform some fuzzy matching between the two datasets in order to merge them.
import difflib

# gmapsdata['fuzzy_name'] = gmapsdata['name'].apply(lambda x: difflib.get_close_matches(x, df['restaurant_name_clean'].unique(), n=1)[0])

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

# following function adapted from the below link:
# https://stackoverflow.com/questions/13636848/is-it-possible-to-do-fuzzy-match-merge-with-python-pandas
def fuzzy_merge(df_1, df_2, key1, key2, threshold=95, limit=1):
    """
    Fuzzy-match the names of df_1 against the values in df_2[key2].

    :param df_1: the left table to join; either a DataFrame (names are read from
        column key1) or a Series of names (key1 is then not used)
    :param df_2: the right table to join
    :param key1: key column of the left table (only used when df_1 is a DataFrame)
    :param key2: key column of the right table
    :param threshold: minimum score from process.extract that a candidate must reach
        to be kept as a match
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: df_1 itself (modified in place), with the comma-joined matches stored
        under the label 'matches'
    """
    s = df_2[key2].tolist()

    # A DataFrame has to be narrowed to its key column first: DataFrame.apply
    # would otherwise hand whole columns, not single names, to process.extract.
    # A Series of names is used as it is.
    names = df_1[key1] if isinstance(df_1, pd.DataFrame) else df_1

    m = names.apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m

    # Keep only the candidates scoring at least `threshold`; names with no
    # such candidate end up with an empty string.
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2

    return df_1

# Fuzzy-match each distinct cleaned review name against the 'name' column of gmapsdata.
# NOTE(review): dfnames is a Series, so the 'restaurant_name_clean' argument (key1) is
# not used for the lookup; threshold=80 overrides the function's default of 95.
dfnames = pd.Series(df['restaurant_name_clean'].unique())
merged = fuzzy_merge(dfnames, gmapsdata, 'restaurant_name_clean', 'name', threshold=80)


In [14]:
# fuzzy_merge() was called with a Series, so `merged` is that Series with one
# extra entry, labelled 'matches', that holds the Series of matched names.
# Select the two parts by label instead of the hard-coded position 1834,
# which is only right while there are exactly 1834 unique names.
GM_names = merged['matches']
TA_names = merged.drop('matches')

# Put the original names and their matches side by side on the shared index.
merged_names = pd.concat([TA_names, GM_names], axis=1, join='outer')
merged_names = merged_names.rename(columns={0: "Tripadvisor", 1: "Gmaps"})

# Write the name mapping to disk.
merged_names.to_csv('Merged_Names2.csv')

In [11]:
# Attach the fuzzy-matched name ('Gmaps' column) to every review whose cleaned
# name appears in merged_names (default inner join).
df2 = pd.merge(
    df,
    merged_names,
    left_on='restaurant_name_clean',
    right_on='Tripadvisor',
)

# Add the gmapsdata columns via the matched name; how='left' keeps reviews
# that have no matching 'name' in gmapsdata.
df_merged = pd.merge(df2, gmapsdata, how='left', left_on='Gmaps', right_on='name')

# The same join as an inner join: only reviews with a matching 'name' remain.
test = pd.merge(df2, gmapsdata, how='inner', left_on='Gmaps', right_on='name')