In [102]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Part B: Predictive Modelling
## I. Feature engineering

This section repeats the data-cleaning steps from above and then adds further feature engineering.

In [103]:
# Load the Zomato CSV and report the number of missing values in each column.
DATA_PATH = 'data/zomato_df_final_data.csv'
df = pd.read_csv(DATA_PATH)
df.isna().sum()

address             0
cost              346
cuisine             0
lat               192
link                0
lng               192
phone               0
rating_number    3316
rating_text      3316
subzone             0
title               0
type               48
votes            3316
groupon             0
color               0
cost_2            346
cuisine_color       0
dtype: int64

In [104]:
# Drop the columns that this section does not carry forward.
# `columns=` already selects the column axis, so no `axis` argument is passed.
# Re-running this cell on the already-trimmed frame raises KeyError (the labels
# are gone); re-run the loading cell first.
unused_columns = [
    "address", "link", "phone", "title", "color",
    "cuisine_color", "type", "rating_text", "lat", "lng",
]
df = df.drop(columns=unused_columns)
df.head(3)

Unnamed: 0,cost,cuisine,rating_number,subzone,votes,groupon,cost_2
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",4.0,CBD,1311.0,False,5.243902
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",4.6,"The Grounds of Alexandria, Alexandria",3236.0,False,7.560976
2,120.0,['Japanese'],4.9,"The Star, Pyrmont",1227.0,False,10.650407


In [105]:
# Store the boolean `groupon` flag as 0/1 integers.
df['groupon'] = df['groupon'].astype(int)

# Clean up the subzone column: keep only the text after the last comma
# (e.g. "The Star, Pyrmont" -> "Pyrmont") and trim surrounding whitespace on
# every value, so a padded name cannot become a separate category.
df['subzone'] = df['subzone'].str.rsplit(',', n=1).str[-1].str.strip()

# Encode each distinct subzone as an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder; the
# codes follow sorted label order and carry no real ordering. Confirm the
# downstream model tolerates that, or switch to OrdinalEncoder / one-hot.
label_encoder = LabelEncoder()
df['subzone_encoded'] = label_encoder.fit_transform(df['subzone'])

In [106]:
# Preview the first three rows to check the cleaned `subzone` values and the new `subzone_encoded` column.
df.head(3)

Unnamed: 0,cost,cuisine,rating_number,subzone,votes,groupon,cost_2,subzone_encoded
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",4.0,CBD,1311.0,0,5.243902,48
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",4.6,Alexandria,3236.0,0,7.560976,1
2,120.0,['Japanese'],4.9,Pyrmont,1227.0,0,10.650407,245


In [107]:
# Count the missing values that remain in each column before imputing them.
df.isna().sum()

cost                346
cuisine               0
rating_number      3316
subzone               0
votes              3316
groupon               0
cost_2              346
subzone_encoded       0
dtype: int64

In [108]:
# Handling the missing values in `rating_number` and `votes`.
# Step 1: fill each gap with the mean of the rows sharing its `subzone_encoded`.
# Step 2: a subzone with no observed value at all is still NaN after step 1, so
#         it falls back to the mean of the column as it stands after step 1.
# NOTE(review): if `rating_number` is the prediction target, imputing it (and
# doing so before any train/test split) can leak information - confirm.
for column in ('rating_number', 'votes'):
    subzone_filled = df.groupby('subzone_encoded')[column].transform(
        lambda values: values.fillna(values.mean())
    )
    df[column] = subzone_filled.fillna(subzone_filled.mean())
df.isna().sum()

cost               346
cuisine              0
rating_number        0
subzone              0
votes                0
groupon              0
cost_2             346
subzone_encoded      0
dtype: int64

In [109]:
# Impute `cost` and `cost_2`: fill each gap with the mean of the rows sharing
# its `subzone_encoded`, then give any value that is still missing (a subzone
# with no observed value) the mean of the column as it stands after that fill.
for column in ('cost', 'cost_2'):
    subzone_filled = df.groupby('subzone_encoded')[column].transform(
        lambda values: values.fillna(values.mean())
    )
    df[column] = subzone_filled.fillna(subzone_filled.mean())

# Drop the text `subzone` column; `subzone_encoded` stays in the frame.
# `columns=` already selects the column axis, so no `axis` argument is passed.
df = df.drop(columns=["subzone"])
df.isna().sum()

cost               0
cuisine            0
rating_number      0
votes              0
groupon            0
cost_2             0
subzone_encoded    0
dtype: int64