In [117]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Reading the File 

In [12]:
df = pd.read_csv(r'/Users/csuftitan/Downloads/newAdbProject/combined_csv.csv')

## Dropping Duplicates

In [13]:
df.drop_duplicates(inplace=True)
df.shape

(51718, 17)

### Preparing Approx Cost Numerical Feature

In [14]:
# before transformation
df['approx_cost(for two people)'].unique()

array(['800', '300', '600', '700', '550', '500', '450', '650', '400',
       '900', '200', '750', '150', '850', '100', '1,200', '350', '250',
       '950', '1,000', '1,500', '1,300', '199', '80', '1,100', '160',
       '1,600', '230', '130', '50', '190', '1,700', nan, '1,400', '180',
       '1,350', '2,200', '2,000', '1,800', '1,900', '330', '2,500',
       '2,100', '3,000', '2,800', '3,400', '40', '1,250', '3,500',
       '4,000', '2,400', '2,600', '120', '1,450', '469', '70', '3,200',
       '60', '560', '240', '360', '6,000', '1,050', '2,300', '4,100',
       '5,000', '3,700', '1,650', '2,700', '4,500', '140'], dtype=object)

In [15]:
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(str).apply(lambda x: x.replace(',', ''))

In [16]:
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(float)

In [17]:
df['approx_cost(for two people)'].unique()

array([ 800.,  300.,  600.,  700.,  550.,  500.,  450.,  650.,  400.,
        900.,  200.,  750.,  150.,  850.,  100., 1200.,  350.,  250.,
        950., 1000., 1500., 1300.,  199.,   80., 1100.,  160., 1600.,
        230.,  130.,   50.,  190., 1700.,   nan, 1400.,  180., 1350.,
       2200., 2000., 1800., 1900.,  330., 2500., 2100., 3000., 2800.,
       3400.,   40., 1250., 3500., 4000., 2400., 2600.,  120., 1450.,
        469.,   70., 3200.,   60.,  560.,  240.,  360., 6000., 1050.,
       2300., 4100., 5000., 3700., 1650., 2700., 4500.,  140.])

### Preparing Rate Numerical Feature

In [18]:
# before tranformation
df['rate'].unique()

array(['4.1/5', '3.8/5', '3.7/5', '3.6/5', '4.6/5', '4.0/5', '4.2/5',
       '3.9/5', '3.1/5', '3.0/5', '3.2/5', '3.3/5', '2.8/5', '4.4/5',
       '4.3/5', 'NEW', '2.9/5', '3.5/5', nan, '2.6/5', '3.8 /5', '3.4/5',
       '4.5/5', '2.5/5', '2.7/5', '4.7/5', '2.4/5', '2.2/5', '2.3/5',
       '3.4 /5', '-', '3.6 /5', '4.8/5', '3.9 /5', '4.2 /5', '4.0 /5',
       '4.1 /5', '3.7 /5', '3.1 /5', '2.9 /5', '3.3 /5', '2.8 /5',
       '3.5 /5', '2.7 /5', '2.5 /5', '3.2 /5', '2.6 /5', '4.5 /5',
       '4.3 /5', '4.4 /5', '4.9/5', '2.1/5', '2.0/5', '1.8/5', '4.6 /5',
       '4.9 /5', '3.0 /5', '4.8 /5', '2.3 /5', '4.7 /5', '2.4 /5',
       '2.1 /5', '2.2 /5', '2.0 /5', '1.8 /5'], dtype=object)

In [19]:
def split(x):
    return x.split('/')[0]

In [20]:
df['rate']=df['rate'].astype(str).apply(split)

In [21]:
df['rate'].replace('NEW',0,inplace=True)
df['rate'].replace('-',0,inplace=True)

In [22]:
df['rate']=df['rate'].astype(str).astype(float)

In [23]:
df['rate'].unique()

array([4.1, 3.8, 3.7, 3.6, 4.6, 4. , 4.2, 3.9, 3.1, 3. , 3.2, 3.3, 2.8,
       4.4, 4.3, 0. , 2.9, 3.5, nan, 2.6, 3.4, 4.5, 2.5, 2.7, 4.7, 2.4,
       2.2, 2.3, 4.8, 4.9, 2.1, 2. , 1.8])

In [24]:
df['rate'].isnull().sum()

7775

In [25]:
# few more tranformations
def mark(x):
    if x in ('Quick Bites', 'Casual Dining'):
        return 'Quick Bites + Casual Dining'
    else:
        return 'other'

In [26]:
df['Top_types']=df['rest_type'].apply(mark)

### Creating New Restaurants and Training data

In [27]:
def assign(x):
    if x>0:
        return 1
    else:
        return 0
df['rated']=df['rate'].apply(assign)

In [29]:
df['rated'].unique() # here 0 is new rest and 1 is for training rest

array([1, 0])

In [30]:
# separating new and training data
new_restaurants = df[df['rated'] == 0]
train_val_restaurants = df.query('rated == 1')

### Threshold for dividing Good and Bad Rest

In [32]:
threshold = 3.75 #any restaurant with rating greater than 3.75 would have target value as 1
train_val_restaurants['target'] = train_val_restaurants['rate'].apply(lambda x: 1 if x >= threshold else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_val_restaurants['target'] = train_val_restaurants['rate'].apply(lambda x: 1 if x >= threshold else 0)


### Feature Extraction

In [33]:
'''After defining the target and splitting the data into train+val and test sets, let's define the features to be used on training. Here we will take a look at the raw data to select valuable features and apply some steps to create another ones.

The initial set of selected features include:

- online_order;
- book_table;
- location;
- rest_type;
- cuisines;
- listed_in(type);
- approx_cost'''

"After defining the target and splitting the data into train+val and test sets, let's define the features to be used on training. Here we will take a look at the raw data to select valuable features and apply some steps to create another ones.\n\nThe initial set of selected features include:\n\n- online_order;\n- book_table;\n- location;\n- rest_type;\n- cuisines;\n- listed_in(type);\n- approx_cost"

In [34]:
train_val_restaurants.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)', 'Top_types', 'rated', 'target'],
      dtype='object')

In [36]:
def count(x):
    return len(x.split(','))

In [37]:
#### as it have some NAN value that why very first I have to convert into str  &  then apply a function
train_val_restaurants['total_cuisines']=train_val_restaurants['cuisines'].astype(str).apply(count)
train_val_restaurants['multiple_types']=train_val_restaurants['rest_type'].astype(str).apply(count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_val_restaurants['total_cuisines']=train_val_restaurants['cuisines'].astype(str).apply(count)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_val_restaurants['multiple_types']=train_val_restaurants['rest_type'].astype(str).apply(count)


In [38]:
train_val_restaurants.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)', 'Top_types', 'rated', 'target',
       'total_cuisines', 'multiple_types'],
      dtype='object')

In [39]:
# some important features we would be intrested in
imp_features=['online_order','book_table','location','rest_type','multiple_types','total_cuisines','listed_in(type)','approx_cost(for two people)','target']

In [40]:
data = train_val_restaurants[imp_features]

In [41]:
data.isnull().sum() # as Null data is less than 1-2% we can drop it 

online_order                     0
book_table                       0
location                         0
rest_type                      149
multiple_types                   0
total_cuisines                   0
listed_in(type)                  0
approx_cost(for two people)    247
target                           0
dtype: int64

In [42]:
data.dropna(how='any',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(how='any',inplace=True)


### Splitting data into Categorical and Numerical based upon type of data

In [44]:
# Splitting features by data type, any data which is object is a categorical data
cat_features= [col for col in data.columns if data[col].dtype == 'O']
num_features= [col for col in data.columns if data[col].dtype != 'O']

In [45]:
cat_features

['online_order', 'book_table', 'location', 'rest_type', 'listed_in(type)']

In [47]:
# unique features in categorical data
for feature in cat_features:
    print('{} has total {} unique features'.format(feature, data[feature].nunique()))

online_order has total 2 unique features
book_table has total 2 unique features
location has total 92 unique features
rest_type has total 87 unique features
listed_in(type) has total 7 unique features


In [50]:
#location and rest_type has too manu unique features, lets try to reduce them
cols=['location','rest_type']
for col in cols:
    print('Total feature in {} are {}'.format(col,data[col].nunique()))
    print(data[col].value_counts()/(len(data))*100)
    print('\n')
#we can see BTM is like 9% and Quickbites is like 33 percent, now we can define a threshold and put rest things as
#Other category of data

Total feature in location are 92
BTM                      9.398624
Koramangala 5th Block    5.565517
HSR                      4.828940
Indiranagar              4.361310
JP Nagar                 4.143245
                           ...   
Yelahanka                0.009692
West Bangalore           0.007269
Rajarajeshwari Nagar     0.004846
Nagarbhavi               0.002423
Peenya                   0.002423
Name: location, Length: 92, dtype: float64


Total feature in rest_type are 87
Quick Bites                   33.642663
Casual Dining                 23.301512
Cafe                           8.162919
Dessert Parlor                 4.482458
Delivery                       4.048750
                                ...    
Food Court, Beverage Shop      0.004846
Dessert Parlor, Food Court     0.004846
Dessert Parlor, Kiosk          0.004846
Bakery, Beverage Shop          0.002423
Quick Bites, Kiosk             0.002423
Name: rest_type, Length: 87, dtype: float64




In [51]:
percent=data['location'].value_counts()/len(data)*100
values=percent.values

In [52]:
len(values[values>0.4])

46

In [56]:
#### lets set Threshold value 0.4 
values=data['location'].value_counts()/len(data)*100
values

BTM                      9.398624
Koramangala 5th Block    5.565517
HSR                      4.828940
Indiranagar              4.361310
JP Nagar                 4.143245
                           ...   
Yelahanka                0.009692
West Bangalore           0.007269
Rajarajeshwari Nagar     0.004846
Nagarbhavi               0.002423
Peenya                   0.002423
Name: location, Length: 92, dtype: float64

In [57]:
threshold=0.4 #for location
imp=values[values>threshold]
imp

BTM                      9.398624
Koramangala 5th Block    5.565517
HSR                      4.828940
Indiranagar              4.361310
JP Nagar                 4.143245
Jayanagar                3.959101
Whitefield               3.808878
Marathahalli             3.416360
Bannerghatta Road        2.970537
Koramangala 7th Block    2.556212
Koramangala 6th Block    2.553789
Brigade Road             2.548944
Bellandur                2.415681
Sarjapur Road            2.069199
Koramangala 1st Block    2.064354
Ulsoor                   2.057085
Koramangala 4th Block    2.037701
Electronic City          2.020740
MG Road                  1.921399
Banashankari             1.805098
Kalyan Nagar             1.681527
Malleshwaram             1.553111
Residency Road           1.463462
Richmond Road            1.463462
Basavanagudi             1.441655
Frazer Town              1.371390
Church Street            1.322931
Brookefield              1.315662
New BEL Road             1.226013
Kammanahalli  

In [58]:
data['location']=np.where(data['location'].isin(imp.index),data['location'],'other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['location']=np.where(data['location'].isin(imp.index),data['location'],'other')


In [60]:
data['location'].nunique() #reduced the location features to 47

47

In [61]:
# for rest_type
values2=data['rest_type'].value_counts()/len(data)*100
values2

Quick Bites                   33.642663
Casual Dining                 23.301512
Cafe                           8.162919
Dessert Parlor                 4.482458
Delivery                       4.048750
                                ...    
Food Court, Beverage Shop      0.004846
Dessert Parlor, Food Court     0.004846
Dessert Parlor, Kiosk          0.004846
Bakery, Beverage Shop          0.002423
Quick Bites, Kiosk             0.002423
Name: rest_type, Length: 87, dtype: float64

In [62]:
#setting threshold as 0.3 to check number of features reduced to
len(values2[values2>0.3])

29

In [63]:
#making it 1.5 to reduce more
threshold=1.5
imp2=values2[values2>1.5]
imp2

Quick Bites           33.642663
Casual Dining         23.301512
Cafe                   8.162919
Dessert Parlor         4.482458
Delivery               4.048750
Takeaway, Delivery     3.098953
Casual Dining, Bar     2.645862
Bakery                 1.705757
Beverage Shop          1.555534
Bar                    1.550688
Name: rest_type, dtype: float64

In [64]:
data['rest_type']=np.where(data['rest_type'].isin(imp2.index),data['rest_type'],'other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rest_type']=np.where(data['rest_type'].isin(imp2.index),data['rest_type'],'other')


In [65]:
# lets see how much features we are left with now
for feature in cat_features:
    print('{} has total {} unique features'.format(feature, data[feature].nunique()))

online_order has total 2 unique features
book_table has total 2 unique features
location has total 47 unique features
rest_type has total 11 unique features
listed_in(type) has total 7 unique features


In [66]:
#feature encoding now
data_cat = data[cat_features]
for col in cat_features:
    col_encoded = pd.get_dummies(data_cat[col],prefix=col,drop_first=False)
    data_cat=pd.concat([data_cat,col_encoded],axis=1)
    data_cat.drop(col, axis=1, inplace=True)

In [67]:
data_cat.head(5)

Unnamed: 0,online_order_No,online_order_Yes,book_table_No,book_table_Yes,location_BTM,location_Banashankari,location_Banaswadi,location_Bannerghatta Road,location_Basavanagudi,location_Bellandur,...,rest_type_Quick Bites,"rest_type_Takeaway, Delivery",rest_type_other,listed_in(type)_Buffet,listed_in(type)_Cafes,listed_in(type)_Delivery,listed_in(type)_Desserts,listed_in(type)_Dine-out,listed_in(type)_Drinks & nightlife,listed_in(type)_Pubs and bars
0,0,1,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,1,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,1,0,1,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [68]:
data_cat.drop(['online_order_No','book_table_No'], axis=1, inplace =True)

In [69]:
data_cat.head()

Unnamed: 0,online_order_Yes,book_table_Yes,location_BTM,location_Banashankari,location_Banaswadi,location_Bannerghatta Road,location_Basavanagudi,location_Bellandur,location_Brigade Road,location_Brookefield,...,rest_type_Quick Bites,"rest_type_Takeaway, Delivery",rest_type_other,listed_in(type)_Buffet,listed_in(type)_Cafes,listed_in(type)_Delivery,listed_in(type)_Desserts,listed_in(type)_Dine-out,listed_in(type)_Drinks & nightlife,listed_in(type)_Pubs and bars
0,1,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [70]:
#contacting with final data
data_final=pd.concat([data.loc[:,['multiple_types','total_cuisines','approx_cost(for two people)','target']],data_cat],axis=1)

In [71]:
data_final.shape

(41272, 71)

In [72]:
# Splitting the data
X = data_final.drop('target', axis=1)
y = data_final['target'].values

In [73]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [74]:
X_train.shape

(33017, 70)

### model for decision tree

In [100]:
model = RandomForestClassifier(n_estimators=100, min_samples_leaf=10, random_state=1)

In [101]:
# Fit the model to the data.
model.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=10, random_state=1)

In [102]:
# Make predictions.
predictions = model.predict(X_test)
confusion_matrix(predictions, y_test)

array([[3568, 1261],
       [ 670, 2756]])

In [103]:
accuracy_score(predictions,y_test)

0.7660811629315566

### Creating and Testing for a Random Data

In [104]:
listOfColumns = X_train.columns.to_list()

In [105]:
# creating for dict for testing
dictForTest = {}
for i in listOfColumns:
    dictForTest[i]=0

In [106]:
dictForTest #lets copy this data for a new data and manipulate accordingly

{'multiple_types': 0,
 'total_cuisines': 0,
 'approx_cost(for two people)': 0,
 'online_order_Yes': 0,
 'book_table_Yes': 0,
 'location_BTM': 0,
 'location_Banashankari': 0,
 'location_Banaswadi': 0,
 'location_Bannerghatta Road': 0,
 'location_Basavanagudi': 0,
 'location_Bellandur': 0,
 'location_Brigade Road': 0,
 'location_Brookefield': 0,
 'location_Church Street': 0,
 'location_Commercial Street': 0,
 'location_Cunningham Road': 0,
 'location_Domlur': 0,
 'location_Ejipura': 0,
 'location_Electronic City': 0,
 'location_Frazer Town': 0,
 'location_HSR': 0,
 'location_Indiranagar': 0,
 'location_JP Nagar': 0,
 'location_Jayanagar': 0,
 'location_Jeevan Bhima Nagar': 0,
 'location_Kalyan Nagar': 0,
 'location_Kammanahalli': 0,
 'location_Koramangala 1st Block': 0,
 'location_Koramangala 3rd Block': 0,
 'location_Koramangala 4th Block': 0,
 'location_Koramangala 5th Block': 0,
 'location_Koramangala 6th Block': 0,
 'location_Koramangala 7th Block': 0,
 'location_Koramangala 8th Bloc

In [112]:
newDictForTest = {'multiple_types': 1,
 'total_cuisines': 3,
 'approx_cost(for two people)': 1500,
 'online_order_Yes': 1,
 'book_table_Yes': 1,
 'location_BTM': 1,
 'location_Banashankari': 0,
 'location_Banaswadi': 0,
 'location_Bannerghatta Road': 0,
 'location_Basavanagudi': 0,
 'location_Bellandur': 0,
 'location_Brigade Road': 0,
 'location_Brookefield': 0,
 'location_Church Street': 0,
 'location_Commercial Street': 0,
 'location_Cunningham Road': 0,
 'location_Domlur': 0,
 'location_Ejipura': 0,
 'location_Electronic City': 0,
 'location_Frazer Town': 0,
 'location_HSR': 0,
 'location_Indiranagar': 0,
 'location_JP Nagar': 0,
 'location_Jayanagar': 0,
 'location_Jeevan Bhima Nagar': 0,
 'location_Kalyan Nagar': 0,
 'location_Kammanahalli': 0,
 'location_Koramangala 1st Block': 0,
 'location_Koramangala 3rd Block': 0,
 'location_Koramangala 4th Block': 0,
 'location_Koramangala 5th Block': 0,
 'location_Koramangala 6th Block': 0,
 'location_Koramangala 7th Block': 0,
 'location_Koramangala 8th Block': 0,
 'location_Lavelle Road': 0,
 'location_MG Road': 0,
 'location_Malleshwaram': 0,
 'location_Marathahalli': 0,
 'location_New BEL Road': 0,
 'location_Old Airport Road': 0,
 'location_Rajajinagar': 0,
 'location_Residency Road': 0,
 'location_Richmond Road': 0,
 'location_Sarjapur Road': 0,
 'location_Shanti Nagar': 0,
 'location_Shivajinagar': 0,
 'location_St. Marks Road': 0,
 'location_Ulsoor': 0,
 'location_Vasanth Nagar': 0,
 'location_Whitefield': 0,
 'location_Wilson Garden': 0,
 'location_other': 0,
 'rest_type_Bakery': 0,
 'rest_type_Bar': 0,
 'rest_type_Beverage Shop': 0,
 'rest_type_Cafe': 0,
 'rest_type_Casual Dining': 0,
 'rest_type_Casual Dining, Bar': 0,
 'rest_type_Delivery': 0,
 'rest_type_Dessert Parlor': 0,
 'rest_type_Quick Bites': 1,
 'rest_type_Takeaway, Delivery': 0,
 'rest_type_other': 0,
 'listed_in(type)_Buffet': 1,
 'listed_in(type)_Cafes': 0,
 'listed_in(type)_Delivery': 0,
 'listed_in(type)_Desserts': 0,
 'listed_in(type)_Dine-out': 0,
 'listed_in(type)_Drinks & nightlife': 0,
 'listed_in(type)_Pubs and bars': 0}

In [113]:
#preparing dataframe for testing 
new = pd.DataFrame(newDictForTest,index=[0])

In [114]:
newPrediction = model.predict(new)

In [115]:
predProb = model.predict_proba(new)

In [116]:
predProb[0][1]

0.8019123833757231

### Pickle to Load out the model

In [118]:
import pickle
pickle.dump(model, open('RandomForestZomatoModel.pkl', 'wb'))