In [1]:
# Core data-handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Silences every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Data loading: restaurant records (CSV) and the country-code lookup (Excel).
# The CSV has to be fetched from raw.githubusercontent.com: a github.com
# ".../blob/..." address returns the HTML viewer page, not the file contents,
# so read_csv cannot parse it as the dataset.
df1_url = 'https://raw.githubusercontent.com/dsrscientist/dataset4/main/zomato.csv'
df2_url = 'https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Z_Restaurant/Country-Code.xlsx'
df1 = pd.read_csv(df1_url, encoding='latin-1')
df2 = pd.read_excel(df2_url)
df1.head()

In [None]:
# Show the column index of each source table (restaurants first, then the
# country lookup) so the shared join key can be identified.
for source_frame in (df1, df2):
    print(source_frame.columns)

In [None]:
# Preview the first five rows of the country-code lookup table.
df2.head(5)

In [None]:
# Left-join the lookup table onto the restaurant records via 'Country Code';
# a left join keeps every row of df1 even when no lookup entry matches.
df_zomoto = df1.merge(df2, how='left', on='Country Code')
df_zomoto.head(5)

# Exploratory Data Analysis (EDA):

In [None]:
# (rows, columns) of the merged frame.
df_zomoto.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df_zomoto.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the merged frame.
df_zomoto.describe()

In [None]:
# Number of missing values in each column.
df_zomoto.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_zomoto.duplicated().sum()

In [None]:
# Distribution of 'Average Cost for two' (density-scaled histogram + KDE).
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot
# with stat='density' and kde=True draws the equivalent figure.
plt.figure(figsize=(10,6))
sns.histplot(df_zomoto['Average Cost for two'], bins=50, kde=True, stat='density')
plt.title('Average Cost for two (density)')
plt.show()

In [None]:
# Count histogram (50 bins) of 'Average Cost for two' with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_zomoto['Average Cost for two'], bins=50, kde=True, ax=ax)
ax.set_title('Average Cost for two')
plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr = df_zomoto.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

As per the heatmap above, there is no significant correlation between 'Average Cost for two' and the other features.

The correlations are relatively low, with the highest being a weak positive correlation of 0.075 with 'Price range'.

Price range: 'Price range' has a moderate positive correlation with 'Aggregate rating' and 'Votes' (0.44 and 0.31 respectively). This suggests that higher-priced restaurants tend to have better ratings and more votes.

Aggregate rating vs Votes: 0.31; Aggregate rating vs Price range: 0.44; Price range vs Votes: 0.31.

'Country Code' has a moderate negative correlation with 'Longitude' (-0.70).

This likely reflects geographical clustering of the data, where certain country codes are associated with specific longitude ranges.

In [None]:
# Show both the column index and the per-column dtypes. A bare expression is
# only displayed when it is the last line of a cell, so the column index has
# to be printed explicitly to appear in the output.
print(df_zomoto.columns)
print(df_zomoto.dtypes)

In [None]:
# Skewness of every numeric column before any transformation.
numeric_part = df_zomoto.select_dtypes(include=[np.number])
skewness = numeric_part.skew()
print(skewness)

# Skewness result:

| Column | Skewness |
| --- | --- |
| Restaurant ID | 0.061570 |
| Country Code | 3.043965 |
| Longitude | -2.807328 |
| Latitude | -3.081635 |
| Average Cost for two | 35.477915 |
| Price range | 0.889618 |
| Aggregate rating | -0.954130 |
| Votes | 8.807637 |

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,PowerTransformer

# Every numeric column of the merged frame.
num_cols = ['Restaurant ID','Country Code','Longitude','Latitude','Average Cost for two','Price range','Aggregate rating','Votes']

# Columns that are actually reshaped. 'Restaurant ID' is a row identifier and
# 'Price range' is later used as a prediction target; both already show low
# skewness (about 0.06 and 0.89), and power-transforming / z-scoring them would
# only replace their values with ones that are no longer interpretable.
skewed_cols = [col for col in num_cols if col not in ('Restaurant ID', 'Price range')]

# log1p compresses the very long right tails of the two most skewed columns.
# NOTE: this cell rewrites df_zomoto in place, so it must be run exactly once
# per freshly merged frame; re-running it would transform the data again.
df_zomoto['Average Cost for two'] = np.log1p(df_zomoto['Average Cost for two'])
df_zomoto['Votes'] = np.log1p(df_zomoto['Votes'])

# Yeo-Johnson power transform (handles zero and negative values).
# standardize=False because the StandardScaler step below performs the
# zero-mean / unit-variance scaling; the PowerTransformer default
# (standardize=True) would standardise the same data twice.
power_transformer = PowerTransformer(method='yeo-johnson', standardize=False)
df_zomoto[skewed_cols] = power_transformer.fit_transform(df_zomoto[skewed_cols])

# Scale the transformed columns to zero mean and unit variance.
scaler = StandardScaler()
df_zomoto[skewed_cols] = scaler.fit_transform(df_zomoto[skewed_cols])

In [None]:
# Preview the first five rows after the numeric transformations.
df_zomoto.head(5)

In [None]:
# Second skewness check, taken after the transform cell.
numeric_part = df_zomoto.select_dtypes(include=[np.number])
skewness = numeric_part.skew()
print(skewness)

In [None]:
# Columns given one more power-transform + scaling pass.
high_skew_col = ['Country Code', 'Longitude', 'Latitude']

for skewed_name in high_skew_col:
    # Refit the shared power transformer on this single column.
    single_column = df_zomoto[[skewed_name]]
    df_zomoto[skewed_name] = power_transformer.fit_transform(single_column)

    # Fresh scaler per column; rescale the just-transformed values.
    scaler1 = StandardScaler()
    rescaled = scaler1.fit_transform(df_zomoto[[skewed_name]])
    df_zomoto[skewed_name] = rescaled

# Skewness of all numeric columns after the extra pass.
skewness = df_zomoto.select_dtypes(include=[np.number]).skew()
print(skewness)

Before proceeding with model deployment, we need to encode the categorical features into numerical values.

There are two common approaches:

Label Encoding: Assigns each unique category a different integer. This can be used when there is an ordinal relationship between categories.

One-Hot Encoding: Creates binary columns for each category. This is used when there is no ordinal relationship between categories

We will apply the following encoding strategies:

- Label Encoding for columns where an ordinal relationship might exist or where we have a small number of unique values.
- One-Hot Encoding for columns with no ordinal relationship and a manageable number of unique values.

In [None]:
# Columns that will be integer-coded with a label encoder.
label_cols = [
    'Has Table booking',
    'Has Online delivery',
    'Is delivering now',
    'Switch to order menu',
    'Rating color',
    'Rating text',
]

# Columns that will be expanded into one indicator column per category.
onehot_cols = [
    'Country',
    'City',
    'Address',
    'Locality',
    'Locality Verbose',
    'Cuisines',
    'Currency',
    'Restaurant Name',
]

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Integer-code each column in label_cols.
# A separate encoder is fitted and stored per column so that every mapping
# stays available afterwards (e.g. label_encoders[col].inverse_transform(...));
# refitting one shared encoder would keep only the last column's classes.
le_encode = LabelEncoder()
label_encoders = {}
for col in label_cols:
    le_encode = LabelEncoder()
    df_zomoto[col] = le_encode.fit_transform(df_zomoto[col])
    label_encoders[col] = le_encode

In [None]:
# One-hot encode: every column in onehot_cols is replaced by one indicator
# column per distinct value, named '<column>_<value>'.
df_zomoto = pd.get_dummies(data=df_zomoto, columns=onehot_cols)
df_zomoto.head(5)

In [None]:
# (rows, columns) of the frame after label and one-hot encoding.
df_zomoto.shape

In [None]:
# Column index after encoding (the notebook repr abbreviates long indexes).
df_zomoto.columns

In [None]:
# Convert True/False indicator columns to 1/0 integers.
# The columns are selected by dtype rather than by a hand-copied list of names:
# a fixed list covers only the few columns shown in the abbreviated column
# repr, and it breaks as soon as a listed name is absent or decoded differently.
boolean_col = df_zomoto.select_dtypes(include='bool').columns.tolist()

# One astype call converts all selected columns at once; with an empty
# selection (indicator columns already numeric) it leaves the data unchanged.
df_zomoto = df_zomoto.astype({col: 'int64' for col in boolean_col})

In [None]:
# Preview the encoded frame; a bare last expression uses the notebook's rich
# table display instead of the plain-text dump produced by print().
df_zomoto.head()

In [None]:
# Per-column dtypes after encoding.
print(df_zomoto.dtypes)

# Predicting average cost for two

In [None]:
!pip install xgboost
!pip install nvidia-cublas-cu12==12.1.3.1
!pip install nvidia-cuda-cupti-cu12==12.1.105
!pip install nvidia-cuda-runtime-cu12==12.1.105
!pip install nvidia-cudnn-cu12==8.9.2.26
!pip install nvidia-cufft-cu12==11.0.2.54
!pip install nvidia-curand-cu12==10.3.2.106
!pip install nvidia-cusolver-cu12==11.4.5.107
!pip install nvidia-cusparse-cu12==12.1.0.106
!pip install nvidia-nvtx-cu12==12.1.105
!pip install nvidia-nccl-cu12==2.20.5
!pip install nvidia-nsight-cu12==2.20.5

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score,classification_report

In [None]:
# Features and targets: both target columns are removed from the feature
# matrix so neither model is trained on a target.
features = df_zomoto.drop(['Average Cost for two','Price range'],axis=1)
target_cost = df_zomoto['Average Cost for two']
target_price = df_zomoto['Price range']

# 80/20 train/test split; the same random_state yields the same row partition
# for both targets.
X_train_cost,X_test_cost,y_train_cost,y_test_cost = train_test_split(features,target_cost,test_size=0.2,random_state=42)
X_train_price,X_test_price,y_train_price,y_test_price = train_test_split(features,target_price,test_size=0.2,random_state=42)

# Standardise the features; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_cost = scaler.fit_transform(X_train_cost)
X_test_cost = scaler.transform(X_test_cost)

# Reduce the feature matrix to 10 principal components. random_state is set
# because PCA may select a randomized SVD solver for large inputs, which would
# otherwise make the components vary between runs.
PCA_cost = PCA(n_components=10, random_state=42)
X_train_cost = PCA_cost.fit_transform(X_train_cost)
X_test_cost = PCA_cost.transform(X_test_cost)

# Random forest regressor for 'Average Cost for two'.
rf_model_cost = RandomForestRegressor(n_estimators=100,random_state=42)
rf_model_cost.fit(X_train_cost,y_train_cost)
y_pred_rf_cost = rf_model_cost.predict(X_test_cost)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases, whereas this form works on every version.
rmse_cose = np.sqrt(mean_squared_error(y_test_cost,y_pred_rf_cost))
print("RandomForestRegressor - Root mean squared error(Cost):",rmse_cose)
print("RandomForestRegressor -Mean absolute error(Cost):",mean_absolute_error(y_test_cost,y_pred_rf_cost))
print("RandomForestRegressor -R2 score(Cost):",r2_score(y_test_cost,y_pred_rf_cost))

In [None]:
#RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

# 'Price range' is modelled here with a regressor (not a classifier).
# Refit the shared scaler on the price training rows, then apply it to the
# test rows.
X_train_price = scaler.fit_transform(X_train_price)
X_test_price = scaler.transform(X_test_price)

# Reduce to 10 principal components; random_state keeps a randomized SVD
# solver reproducible between runs.
PCA_price = PCA(n_components=10, random_state=42)
X_train_price = PCA_price.fit_transform(X_train_price)
X_test_price = PCA_price.transform(X_test_price)


rfr_model_price = RandomForestRegressor(n_estimators=100,random_state=42)
rfr_model_price.fit(X_train_price,y_train_price)
y_pred_rfr_price = rfr_model_price.predict(X_test_price)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
rmse_price = np.sqrt(mean_squared_error(y_test_price,y_pred_rfr_price))
print("RandomForestRegressor- Root mean squared error(Price):",rmse_price)
print("RandomForestRegressor-Mean absolute error(Price):",mean_absolute_error(y_test_price,y_pred_rfr_price))
print("RandomForestRegressor-R2 score(Price):",r2_score(y_test_price,y_pred_rfr_price))

In [None]:
# Hyper-parameter grid for XGBoost.
# Only parameters that XGBoost recognises are searched. scikit-learn's
# min_samples_split / min_samples_leaf are not XGBoost parameters: listing them
# multiplies the number of fits without changing any model. min_child_weight
# is XGBoost's own control on the minimum weight required in a child node.
param_grid = {
    'n_estimators': [50, 100,150],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 4],
    'learning_rate' : [0.01,0.1,1]
}

# 5-fold cross-validated grid search, scored by negative MSE, on all cores.
XGB_model_cost = XGBRegressor()
XGB_GridSearch_cost = GridSearchCV(estimator=XGB_model_cost, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

XGB_GridSearch_cost.fit(X_train_cost,y_train_cost)

best_params = XGB_GridSearch_cost.best_params_
print("Best parameters:",best_params)

# Train a fresh model with the winning parameters; a separate estimator is
# used so that refitting best_model later does not alter the search object.
best_model = XGBRegressor(**best_params)
best_model.fit(X_train_cost,y_train_cost)

y_pred_XGB_cost = best_model.predict(X_test_cost)


# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test_cost,y_pred_XGB_cost))
print("XGBRegressor-Root mean squared error(Cost):",rmse)
print("XGBRegressor-Mean absolute error(Cost):",mean_absolute_error(y_test_cost,y_pred_XGB_cost))
print("XGBRegressor-R2 score(Cost):",r2_score(y_test_cost,y_pred_XGB_cost))

In [None]:
#Feature Engineering

from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion (squares and pairwise products, no bias
# column) of the cost feature matrices.
ploy_cost = PolynomialFeatures(degree=2,include_bias=False)
X_train_cost_poly = ploy_cost.fit_transform(X_train_cost)
X_test_cost_poly = ploy_cost.transform(X_test_cost)

# Same expansion for the price feature matrices.
ploy_price = PolynomialFeatures(degree=2,include_bias=False)
X_train_price_poly = ploy_price.fit_transform(X_train_price)
X_test_price_poly = ploy_price.transform(X_test_price)


# Refit the tuned XGBoost model on the expanded cost features. This replaces
# the earlier fit of best_model and overwrites y_pred_XGB_cost.
best_model.fit(X_train_cost_poly,y_train_cost)
y_pred_XGB_cost = best_model.predict(X_test_cost_poly)


# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test_cost,y_pred_XGB_cost))
print("PolynomialFeatures-Root mean squared error(Cost):",rmse)
print("PolynomialFeaturesMean - absolute error(Cost):",mean_absolute_error(y_test_cost,y_pred_XGB_cost))
print("PolynomialFeatures-R2 score(Cost):",r2_score(y_test_cost,y_pred_XGB_cost))