In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
import seaborn as sns
#df=pd.read_excel('restaurants.xlsx')

In [2]:
# Read the restaurant workbook (default sheet) into the working DataFrame.
df = pd.read_excel(io='restaurant_info_6Districts_Ju.xlsx')

In [3]:
# Remove title rows: rows whose 'happy' cell holds the literal text "happy"
# (presumably column headers repeated inside the sheet).
offender_index = df[df['happy'] == "happy"].index
df = df.drop(offender_index)

# Remove Asia Miles rows, recognised by the phrase "eligible transactions"
# in 'food_type'.
# na=False treats a missing food_type as "no match", so the mask is always
# purely boolean (a mask containing NaN cannot be used for indexing);
# regex=False matches the phrase literally.
asia_miles_mask = df['food_type'].str.contains("eligible transactions", na=False, regex=False)
asia_miles_index = df[asia_miles_mask].index
df = df.drop(asia_miles_index)

# Renumber the remaining rows 0..n-1, discarding the old labels directly
# instead of materialising them as an 'index' column and dropping it.
df = df.reset_index(drop=True)

In [4]:
# One row per (name, location) pair: when a pair appears more than once,
# only its last occurrence is kept.
df = df.drop_duplicates(subset=['name', 'location'], keep='last')

In [5]:
# Cast the three count columns to float in a single call.
df = df.astype({'happy': float, 'sad': float, 'bookmark': float})

In [6]:
# Make 'price' categorical with the bands listed from cheapest to most
# expensive, so that sorting by price follows this order.
df['price'] = pd.Categorical(
    values=df['price'],
    categories=[
        'Below $50',
        '$51-100',
        '$101-200',
        '$201-400',
        '$401-800',
        'Above $801',
    ],
)

In [7]:
# Add 'price_int': a single numeric stand-in for each price band.
# The lookup uses Series.map on an object view of the column rather than
# Series.replace on the categorical itself: replace on a categorical column
# is a deprecated code path in recent pandas and yields another categorical,
# which would keep price_int out of numeric summaries such as describe().
# Missing prices stay missing (NaN).
df['price_int'] = df['price'].astype(object).map({
    "Below $50": 25,
    "$51-100": 75,
    "$101-200": 150,
    "$201-400": 300,
    "$401-800": 600,
    "Above $801": 1000,
})



In [8]:
# Cuisines (food types) the analysis is restricted to.
restaurant_type = [
    'Japanese',
    'Western',
    'Hong Kong Style',
    'Guangdong',
    'International',
    'Taiwan',
    'Korean',
    'Thai',
    'Italian',
    'Sichuan',
]
# Alternation pattern ("A|B|...") assembled from the list above, so the
# pattern and the list cannot drift apart.
restaurant_type_regex = "|".join(restaurant_type)

In [9]:
# Locations (districts) of interest, one per line.
Places = [
    "Causeway Bay",
    "Central",
    "Yuen Long",
    "Tsuen Wan",
    "Tsim Sha Tsui",
    "Mong Kok",
]

In [10]:
# Keep only restaurants whose food_type mentions at least one of the selected
# cuisines in restaurant_type_regex; every other row is discarded.
# na=False makes a missing food_type count as "no match" so the mask stays
# purely boolean instead of raising on NaN.
df = df[df['food_type'].str.contains(restaurant_type_regex, na=False)]
# Renumber rows 0..n-1 without keeping the old labels as a column.
df = df.reset_index(drop=True)

In [11]:
# Encode each distinct food_type string as a number in a new 'food_int' column.
food_model = OrdinalEncoder()
# fit_transform returns a 2-D array with one column; flatten it and assign by
# position. Unlike concatenating on index labels with join='inner', this
# cannot silently drop or misalign rows when the index is not exactly 0..n-1.
df['food_int'] = food_model.fit_transform(df[['food_type']]).ravel()
df.shape

(1180, 10)

In [12]:
# Encode each distinct location string as a number in a new 'location_int' column.
location_model = OrdinalEncoder()
# fit_transform returns a 2-D array with one column; flatten it and assign by
# position. Unlike concatenating on index labels with join='inner', this
# cannot silently drop or misalign rows when the index is not exactly 0..n-1.
df['location_int'] = location_model.fit_transform(df[['location']]).ravel()
df.shape

(1180, 11)

In [13]:
# happy_vs_sad = happy / sad, where a sad count of zero is treated as one so
# the ratio falls back to the happy count itself.
# Keying the rule on sad == 0 (rather than patching +inf after the division)
# also covers happy == 0 with sad == 0, which plain division turns into NaN.
df['happy_vs_sad'] = df['happy'] / df['sad'].mask(df['sad'] == 0, 1)



In [14]:
# Summary statistics restricted to the bookmark, rating and price measures.
df[['bookmark', 'happy', 'sad', 'price_int', 'happy_vs_sad']].describe()

Unnamed: 0,bookmark,happy,sad,price_int,happy_vs_sad
count,1180.0,1180.0,1180.0,1180.0,1180.0
mean,11937.527119,211.50678,11.955932,195.190678,39.797761
std,11891.572353,197.520064,17.293596,193.352377,50.345918
min,147.0,3.0,0.0,25.0,1.520833
25%,3788.0,74.0,2.0,75.0,11.802826
50%,8590.5,140.5,6.0,150.0,23.0
75%,15656.25,282.0,15.0,300.0,46.678571
max,86503.0,1238.0,192.0,1000.0,701.0


In [15]:
# Summary statistics for the frame using describe()'s defaults.
# NOTE(review): the recorded output includes an 'Unnamed: 0' column,
# presumably a saved row index read back from the workbook; confirm whether
# it should be dropped before summarising.
df.describe()

Unnamed: 0.1,Unnamed: 0,bookmark,happy,sad,price_int,food_int,location_int,happy_vs_sad
count,1180.0,1180.0,1180.0,1180.0,1180.0,1180.0,1180.0,1180.0
mean,691.868644,11937.527119,211.50678,11.955932,195.190678,4.54661,2.509322,39.797761
std,403.904349,11891.572353,197.520064,17.293596,193.352377,2.98575,1.730556,50.345918
min,0.0,147.0,3.0,0.0,25.0,0.0,0.0,1.520833
25%,335.75,3788.0,74.0,2.0,75.0,2.0,1.0,11.802826
50%,693.5,8590.5,140.5,6.0,150.0,4.0,2.0,23.0
75%,1043.25,15656.25,282.0,15.0,300.0,7.0,4.0,46.678571
max,1387.0,86503.0,1238.0,192.0,1000.0,9.0,5.0,701.0


In [16]:
# Same summary statistics, computed separately for each location.
df.groupby('location').describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,bookmark,bookmark,...,location_int,location_int,happy_vs_sad,happy_vs_sad,happy_vs_sad,happy_vs_sad,happy_vs_sad,happy_vs_sad,happy_vs_sad,happy_vs_sad
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Causeway Bay,212.0,125.259434,72.003908,0.0,61.75,126.5,188.25,248.0,212.0,13536.646226,...,0.0,0.0,212.0,48.572796,48.788281,3.75,16.538235,29.566667,60.0,347.0
Central,165.0,596.721212,60.129088,491.0,547.0,595.0,644.0,704.0,165.0,12482.963636,...,1.0,1.0,165.0,40.135868,53.451151,2.611111,12.2,21.47619,44.0,345.5
Mong Kok,222.0,368.563063,70.109566,249.0,308.25,365.5,429.75,490.0,222.0,15632.373874,...,2.0,2.0,222.0,44.582702,68.091894,3.212766,11.703475,22.46875,52.017857,701.0
Tsim Sha Tsui,180.0,813.272222,64.187211,705.0,756.75,812.5,867.25,923.0,180.0,20435.511111,...,3.0,3.0,180.0,44.334741,53.120662,2.926829,13.103448,25.458333,49.9,424.0
Tsuen Wan,193.0,1277.233161,65.598626,1163.0,1222.0,1280.0,1334.0,1387.0,193.0,5802.378238,...,4.0,4.0,193.0,31.043793,36.208216,1.823529,9.142857,18.5,35.8,282.0
Yuen Long,208.0,1041.706731,69.57841,924.0,980.75,1041.5,1101.25,1162.0,208.0,4270.125,...,5.0,5.0,208.0,29.675199,29.068173,1.520833,10.807143,22.416667,38.25,220.0
