<a href="https://colab.research.google.com/github/nallagondu/datatrained_inter_public/blob/main/Restaurant_Food_Cost_Final_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Project Description**

Who doesn’t love food? All of us crave at least a few favourite food items, and we may also have a few places where we like to get them: a restaurant that serves our favourite food just the way we want it. But there is one factor that can make us reconsider having our favourite food from our favourite restaurant: the cost. In this hackathon, you will predict the cost of the food served by restaurants across different cities in India. You will use your data science skills to investigate the factors that really affect the cost and, who knows, you may even gain some interesting insights that help you choose what to eat and where to get it.


**You are provided with the following 2 files:**
1.     train.csv (published in the dataset repository as `Data_Train.xlsx`): Use this dataset to train the model. This file contains all the details related to restaurant food cost as well as the target variable “COST”.
2.     test.csv (published in the dataset repository as `Data_Test.xlsx`): Use the trained model to predict the cost of a two-person meal for the restaurants in this file.

**Dataset Attributes**

- TITLE: The feature of the restaurant which can help identify what it offers and for whom it is suitable.
- RESTAURANT_ID: A unique ID for each restaurant.
- CUISINES: The variety of cuisines that the restaurant offers.
- TIME: The opening hours of the restaurant.
- CITY: The city in which the restaurant is located.
- LOCALITY: The locality of the restaurant.
- RATING: The average rating of the restaurant by customers.
- VOTES: The overall votes received by the restaurant.
- COST: The average cost of a two-person meal.



**Dataset Links**

- Repository folder: https://github.com/FlipRoboTechnologies/ML-Datasets/tree/main/Restaurant%20Food%20Cost
- Test data: https://github.com/FlipRoboTechnologies/ML-Datasets/blob/main/Restaurant%20Food%20Cost/Data_Test.xlsx
- Training data: https://github.com/FlipRoboTechnologies/ML-Datasets/blob/main/Restaurant%20Food%20Cost/Data_Train.xlsx


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor # This import should now work
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor


In [None]:
# Load the train and test workbooks straight from the dataset repository.

trained_url = 'https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Restaurant%20Food%20Cost/Data_Train.xlsx'
test_url = 'https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Restaurant%20Food%20Cost/Data_Test.xlsx'
# Read in train-then-test order, producing one DataFrame per workbook.
trained_df, test_df = (pd.read_excel(workbook_url) for workbook_url in (trained_url, test_url))


In [None]:
# Preview the first rows of the training data.
trained_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# (rows, columns) of the training data.
trained_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Column dtypes and non-null counts for both frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
trained_df.info()
test_df.info()

In [None]:
# Number of fully duplicated rows in the training data.
trained_df.duplicated().sum()

In [None]:
# Number of fully duplicated rows in the test data.
test_df.duplicated().sum()

In [None]:
# Missing-value count per column, training frame first, then test frame.
print(trained_df.isnull().sum())
print(test_df.isnull().sum())

In [None]:
trained_df["source"] = 'train'
test_df["source"] = 'test'

In [None]:
#merge the datasets
mixed_df = pd.concat([trained_df,test_df], ignore_index=True)
mixed_df.head()

In [None]:
# (rows, columns) of the merged frame.
mixed_df.shape

In [None]:
mixed_df.columns

In [None]:
# value_counts() skips missing values by default, so this sum is the number
# of rows that have a TITLE.
mixed_df.TITLE.value_counts().sum()

In [None]:
# Distinct TITLE values present in the merged data.
mixed_df.TITLE.unique()

In [None]:
# Fully duplicated rows in the merged frame (the 'source' tag is part of the
# comparison).
mixed_df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row.
mixed_df = mixed_df.drop_duplicates()

In [None]:
# Sanity check on the duplicate count after the drop.
mixed_df.duplicated().sum()

In [None]:
mixed_df.info()

In [None]:
mixed_df['VOTES'] = mixed_df['VOTES'].str.replace('votes', ' ').str.strip()


In [None]:
mixed_df['VOTES'] = pd.to_numeric(mixed_df['VOTES'] , errors='coerce')

In [None]:
mixed_df.info()

In [None]:
mixed_df['RATING'] = pd.to_numeric(mixed_df['RATING'] , errors='coerce')

In [None]:

mixed_df['RATING'] = mixed_df['RATING'].fillna(mixed_df['RATING'].mean())

In [None]:
mixed_df.info()

In [None]:
mixed_df['CUISINES'].value_counts()

In [None]:
mixed_df['location'] = mixed_df['CITY']+' ' + mixed_df['LOCALITY']

In [None]:
mixed_df.drop(columns=['CITY','LOCALITY'],inplace=True)


In [None]:
mixed_df.head()

In [None]:
mixed_df['location'].value_counts()

In [None]:
mixed_df.info()

In [None]:
# Integer-encode the categorical columns.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> category mapping stays available (e.g. for inverse_transform).
# Re-fitting a single shared encoder would keep only the last column's classes.
label_encoders = {}
for column in ['location', 'TITLE', 'CUISINES', 'TIME']:
    Le = LabelEncoder()
    mixed_df[column] = Le.fit_transform(mixed_df[column])
    label_encoders[column] = Le
# After the loop `Le` is the encoder fitted on TIME, the same final state the
# original shared encoder ended in.


In [None]:
mixed_df.head()

In [None]:
# Replace the origin tag with a numeric flag: 0 = training row, 1 = test row.
mixed_df['source'] = mixed_df['source'].map({'train':0,'test':1})

In [None]:
# Frequency of each encoded location value.
mixed_df['location'].value_counts()

In [None]:
mixed_df.info()

In [None]:
# Display the whole merged frame.
mixed_df

In [None]:
# Missing values still present per column.
mixed_df.isnull().sum()

In [None]:
!pip install feature-engine
from feature_engine.imputation import RandomSampleImputer

In [None]:
from feature_engine.imputation import RandomSampleImputer
imputer = RandomSampleImputer(variables = ['VOTES'])
mixed_df = imputer.fit_transform(mixed_df)

In [None]:
mixed_df['VOTES'].fillna(mixed_df['VOTES'].mean(), inplace= True)
mixed_df['COST'].fillna(mixed_df['COST'].mean(), inplace= True)

In [None]:
mixed_df.isnull().sum()


In [None]:
print(mixed_df.shape)
print(mixed_df.info())

In [None]:
mixed_df.drop(columns = ['RESTAURANT_ID'], inplace = True)
mixed_df.describe()

In [None]:
correlation = mixed_df.corr()
correlation

In [None]:
fig,ax = plt.subplots(figsize=(20,10))
sns.heatmap(correlation,annot=True)

In [None]:
# Separate the merged frame back into its training (source == 0) and test
# (source == 1) rows.
# .copy() gives each frame its own data, so later in-place edits (such as
# dropping columns) act on an independent frame rather than on a selection of
# mixed_df, which pandas flags with SettingWithCopyWarning.
trained_final_df = mixed_df[mixed_df['source'] == 0].copy()
test_final_df = mixed_df[mixed_df['source'] == 1].copy()

In [None]:
# multivariate Analysis
sample_size = 1000  # Adjust this value as needed
# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when n exceeds the frame length (sampling without replacement).
# random_state keeps the plotted subset the same between runs.
pair_sample = trained_final_df.sample(
    n=min(sample_size, len(trained_final_df)),
    random_state=42,
)
# NOTE(review): COST looks like a continuous target, so using it as `hue`
# creates one colour level per distinct cost. Consider binning it first.
sns.pairplot(pair_sample, hue='COST', palette='Set2')
plt.show()

In [None]:
# Print the originally loaded frames (they also carry the 'source' column
# added before the merge) followed by their shapes.
print(trained_df,test_df)
print("trained Dataframe shape :",trained_df.shape,"test data shape:",test_df.shape)

In [None]:
# Dtypes and non-null counts of the preprocessed training rows.
trained_final_df.info()

In [None]:
# Dtypes and non-null counts of the preprocessed test rows.
test_final_df.info()

In [None]:
trained_final_df.drop(columns=["source"],inplace = True)
test_final_df.drop(columns=["source"],inplace = True)

In [None]:
trained_final_df.head()

In [None]:
test_final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Build the modelling matrices.
# The COST of the test rows is not a real label: missing COST values were
# filled with the mean cost earlier in this notebook, so scoring predictions
# against them says nothing about model quality. Evaluation therefore uses a
# hold-out split of the labelled training rows instead.
features = trained_final_df.drop(columns=["CUISINES", "COST"])
target = trained_final_df["COST"]
# 80 % for fitting, 20 % held out for evaluation; fixed seed for repeatability.
train_X, test_X, train_y, test_y = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Features of the unlabelled test rows, kept for producing final predictions.
submission_X = test_final_df.drop(columns=["COST", "CUISINES"])

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training matrices.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the fitted model as the cell output.
model_LR = LinearRegression().fit(train_X, train_y)
model_LR


In [None]:
LR_pred = model_LR.predict(test_X)

In [None]:
from sklearn.model_selection import cross_val_score
cross_value = cross_val_score(model_LR,train_X,train_y,cv=5, scoring='neg_mean_squared_error')
print(cross_value)
#cross_value

In [None]:
mean_squared_error(test_y.values,LR_pred,squared=False)

In [None]:
models = [LinearRegression(),
          Ridge(alpha = 0.001),
          Lasso(alpha=0.003),
          SVR(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          AdaBoostRegressor(base_estimator=LinearRegression())]

model_names = 'LinearRegression','Ridge','Lasso','SVR','DecisionTreeRegressor','RandomForestRegressor','GradientBoostingRegressor','AdaBoostRegressor'
model_df = pd.DataFrame(columns=['Model','MSE','R2','MeanCV'])
for model,model_names in zip(models,model_names):
  print(model)

  model.fit(train_X,train_y)
  pred = model.predict(test_X)
  mse = mean_squared_error(test_y,pred,squared=False)
  r2 = model.score(test_X,test_y)

  averages = cross_val_score(model,train_X,train_y,cv=5,scoring='neg_mean_squared_error').mean()

  model_df = pd.concat([model_df,pd.DataFrame({'Model': [model_names],'MSE':mse,'R2':r2,'MeanCV': [averages]})],ignore_index=True)

  print(model_df)

In [None]:
print(model_df)

In [None]:
best_model_rf = RandomForestRegressor()
best_model_rf.fit(train_X,train_y)

In [None]:
best_model_rf.score(test_X,test_y)
test_pred = best_model_rf.predict(test_X)
test_pred

In [None]:
best_model_SVR = SVR()
best_model_SVR.fit(train_X,train_y)

In [None]:
best_model_SVR.score(test_X,test_y)
test_pred_SVR = best_model_SVR.predict(test_X)
test_pred_SVR

In [None]:
best_model_GBR = GradientBoostingRegressor()
best_model_GBR.fit(train_X,train_y)

In [None]:
best_model_GBR.score(test_X,test_y)
test_pred_GBR = best_model_GBR.predict(test_X)
test_pred_GBR

In [None]:
import joblib
# Persist the fitted random forest. The file name now matches the estimator
# being saved; previously this same object was written under an SVR file name.
# NOTE(review): if the SVR was the model meant to be saved, dump
# best_model_SVR instead.
joblib.dump(best_model_rf,'best_model_rf.pkl')