# Restaurant Recommendation Model

## Purpose
This Python notebook is dedicated to building and training the machine learning model used as the backend for the Restaurant Recommendation System. The goal is to create a robust recommendation system that suggests restaurants based on user preferences, historical data, and other relevant features.

## Frameworks Used
Built with sci.kit learn and XGBoost. The GradientBoostingRegressor was used to build this system.

In [1]:
#Import Relevant Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Read Pandas Dataframe
complete = pd.read_csv("./data/restaurantdemo.csv") #Directory
complete = complete.drop(columns=['business_id', 'GEO_ID', 'address', 'review_count', 'star_count'])
print(complete.head())

                       name         city state  postal_code   latitude  \
0              Metro Grille     Flanders    NJ         7836  39.949904   
1  Helen's Cafe and Gardens      Alloway    NJ         8001  39.563830   
2       Alloway Village Inn      Alloway    NJ         8001  39.555717   
3                McDonald's  Cherry Hill    NJ         8002  39.936079   
4              Little Tokyo  Cherry Hill    NJ         8002  39.943728   

   longitude  stars                                         categories  \
0 -75.161599    3.0          Restaurants, Asian Fusion, American (New)   
1 -75.363824    4.0  Restaurants, Cafes, Breakfast & Brunch, Venues...   
2 -75.360766    3.5  American (Traditional), Restaurants, Bars, Nig...   
3 -75.044117    2.5  Burgers, Food, Restaurants, Fast Food, Coffee ...   
4 -75.026066    4.0                  Japanese, Sushi Bars, Restaurants   

   B01001_001E  B01001_002E  ...  S1903_C03_031E  S1903_C03_032E  \
0        12034         5893  ...          

  complete = pd.read_csv("./data/restaurantdemo.csv") #Directory


In [3]:
#Sk.Learn Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Apply TF-IDF to each text column separately
tfidf_vectorizer = TfidfVectorizer()

for col in ['name', 'city', 'state', 'categories']:
    tfidf_matrix = tfidf_vectorizer.fit_transform(complete[col])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    complete = pd.concat([complete, tfidf_df], axis=1)

# Drop the original text columns
complete = complete.drop(['name', 'city', 'state', 'categories'], axis=1)

In [5]:
print(complete.head())
new_comp = complete

   postal_code   latitude  longitude  stars  B01001_001E  B01001_002E  \
0         7836  39.949904 -75.161599    3.0        12034         5893   
1         8001  39.563830 -75.363824    4.0          835          420   
2         8001  39.555717 -75.360766    3.5          835          420   
3         8002  39.936079 -75.044117    2.5        23630        11866   
4         8002  39.943728 -75.026066    4.0        23630        11866   

   B01001_003E  B01001_004E  B01001_005E  B01001_006E  ...  wineries  wings  \
0          355          267          586           62  ...       0.0    0.0   
1            0            0           33            9  ...       0.0    0.0   
2            0            0           33            9  ...       0.0    0.0   
3         1068          752          650          422  ...       0.0    0.0   
4         1068          752          650          422  ...       0.0    0.0   

   women  wraps  yelp  yoga  yogurt  your  yourself  zoos  
0    0.0    0.0   0.0   0.

In [6]:
import xgboost as xgb

In [7]:
# Normalize all columns
complete = complete.apply(pd.to_numeric, errors='coerce')

for col in complete.columns:
    col_min = complete[col].min()
    col_range = complete[col].max() - col_min
    complete[col] = (complete[col] - col_min) / col_range

print(complete.head())

   postal_code  latitude  longitude  stars  B01001_001E  B01001_002E  \
0     0.000000  0.764098   0.989054  0.500     0.120581     0.120460   
1     0.001868  0.740280   0.984602  0.750     0.008367     0.008585   
2     0.001868  0.739780   0.984669  0.625     0.008367     0.008585   
3     0.001879  0.763246   0.991641  0.375     0.236774     0.242554   
4     0.001879  0.763717   0.992038  0.750     0.236774     0.242554   

   B01001_003E  B01001_004E  B01001_005E  B01001_006E  ...  wineries  wings  \
0     0.086228     0.067732     0.148204     0.025931  ...       0.0    0.0   
1     0.000000     0.000000     0.008346     0.003764  ...       0.0    0.0   
2     0.000000     0.000000     0.008346     0.003764  ...       0.0    0.0   
3     0.259412     0.190766     0.164390     0.176495  ...       0.0    0.0   
4     0.259412     0.190766     0.164390     0.176495  ...       0.0    0.0   

   women  wraps  yelp  yoga  yogurt  your  yourself  zoos  
0    0.0    0.0   0.0   0.0     

In [8]:
# Removes Duplicate Columns
complete = complete.loc[:, ~complete.columns.duplicated()]
new_comp = new_comp.loc[:, ~new_comp.columns.duplicated()]
print(new_comp.shape)
print(complete.shape)
print(complete.head())

(49545, 17574)
(49545, 17574)
   postal_code  latitude  longitude  stars  B01001_001E  B01001_002E  \
0     0.000000  0.764098   0.989054  0.500     0.120581     0.120460   
1     0.001868  0.740280   0.984602  0.750     0.008367     0.008585   
2     0.001868  0.739780   0.984669  0.625     0.008367     0.008585   
3     0.001879  0.763246   0.991641  0.375     0.236774     0.242554   
4     0.001879  0.763717   0.992038  0.750     0.236774     0.242554   

   B01001_003E  B01001_004E  B01001_005E  B01001_006E  ...  web  weight  \
0     0.086228     0.067732     0.148204     0.025931  ...  0.0     0.0   
1     0.000000     0.000000     0.008346     0.003764  ...  0.0     0.0   
2     0.000000     0.000000     0.008346     0.003764  ...  0.0     0.0   
3     0.259412     0.190766     0.164390     0.176495  ...  0.0     0.0   
4     0.259412     0.190766     0.164390     0.176495  ...  0.0     0.0   

   wholesalers  wigs  wildlife  windshield  wineries  women  yelp  zoos  
0          0

In [9]:
random_row = complete.sample(n=1, random_state=np.random.randint(0, 100)).copy()
complete = complete.drop(index=random_row.index)
#new_comp = new_comp.drop(index=random_row.index)
print(complete.shape)
print(random_row.shape)
print(random_row.index)

(49544, 17574)
(1, 17574)
Index([5124], dtype='int64')


In [10]:
# Drop Star Count
random_row = random_row.drop(columns='stars')
print(random_row.shape)

(1, 17573)


In [11]:
# Assume 'stars' is the column you want to predict
y = complete['stars']

# Drop the target column from the features
X = complete.drop('stars', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model setup
bst = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Model training
bst.fit(X_train, y_train)

# Predictions on the test set
predictions = bst.predict(X_test)

Mean Absolute Error: 0.13058177499125567


In [None]:
# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')

mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

In [12]:
#Test Removed Row
predicted_star_count = bst.predict(random_row)
print(f'Random Row:\n{random_row}')
print(f'Features for Prediction:\n{random_row}')
print(f'Predicted Star_Count: {predicted_star_count[0]}')

Random Row:
      postal_code  latitude  longitude  B01001_001E  B01001_002E  B01001_003E  \
5124     0.126623   0.77462   0.994695     0.568597      0.58664     0.367015   

      B01001_004E  B01001_005E  B01001_006E  B01001_007E  ...  web  weight  \
5124     0.422374     0.405918      0.49059     0.156841  ...  0.0     0.0   

      wholesalers  wigs  wildlife  windshield  wineries  women  yelp  zoos  
5124          0.0   0.0       0.0         0.0       0.0    0.0   0.0   0.0  

[1 rows x 17573 columns]
Features for Prediction:
      postal_code  latitude  longitude  B01001_001E  B01001_002E  B01001_003E  \
5124     0.126623   0.77462   0.994695     0.568597      0.58664     0.367015   

      B01001_004E  B01001_005E  B01001_006E  B01001_007E  ...  web  weight  \
5124     0.422374     0.405918      0.49059     0.156841  ...  0.0     0.0   

      wholesalers  wigs  wildlife  windshield  wineries  women  yelp  zoos  
5124          0.0   0.0       0.0         0.0       0.0    0.0   0

In [13]:
# Reverse Normalization
normalized_star_count = 0.5880435705184937  # Replace this with your actual normalized value
original_min_star_count = new_comp['stars'].min()  # Replace with the original minimum value
original_range_star_count = new_comp['stars'].max() - original_min_star_count  # Replace with the original range

# Unnormalize the 'star_count'
row_index = 5124
unnormalized_star_count = (normalized_star_count * original_range_star_count) + original_min_star_count
actual_star_count = new_comp.loc[row_index, 'stars']
# Display the unnormalized 'star_count'
print("Actual Star Count:", actual_star_count)
print("Unnormalized Star Count:", unnormalized_star_count)

Actual Star Count: 3.0
Unnormalized Star Count: 3.3521742820739746
