In [2]:
import json
import numpy as np
import pandas as pd

In [10]:
photo_datalist = []
with open('../data/photos.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        photo_datalist.append(data)

restaurant_datalist = []
with open('../data/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        restaurant_datalist.append(data)
        

In [11]:
# clean photos
# create dataframe
picture_df = pd.DataFrame(photo_datalist)
restaurant_df = pd.DataFrame(restaurant_datalist)



In [12]:
# clean restaurants
food_related_terms = ['Restaurant', 'Food', 'Bakeries', 'Cafes', 'Dessert', 
                     'Coffee', 'Tea', 'Bars', 'Diner', 'Bistro', 'Pizzeria']

# Create a regex pattern that matches any of the food terms
pattern = '|'.join(food_related_terms)

# Filter businesses with food-related categories
restaurant_df = restaurant_df[restaurant_df['categories'].notna()]
restaurant_df = restaurant_df[restaurant_df['categories'].str.contains(pattern, case=False)]

# # NEW: Exclude additional non-food related businesses
non_food_terms = '|'.join(['Salon', 'Barber', 'Gym', 'Spa', 'Theater', 'Nightlife', 'Beauty', 'Barbershop', "Active Life",
    "Automotive",
    "Beauty & Spas",
    "Home Services",
    "Health & Medical",
    "Hotels & Travel",
    "Local Services",
    "Professional Services",
    "Public Services & Government",
    "Real Estate",
    "Religious Organizations",
    "Shopping & Retail",
    "Transportation",
    "Arts & Entertainment",
    "Event Planning & Services",
    "Education",
    "Financial Services",
    "Nightlife",
    "Pets & Animal Services",
    "Sports & Recreation",
    "Miscellaneous Services"
])
restaurant_df = restaurant_df[~restaurant_df['categories'].str.contains(non_food_terms, case=False, na=False)]
restaurant_df = restaurant_df[restaurant_df['categories'].str.contains('Food')]
# clean photos dataset
picture_df = picture_df[picture_df['label'] != 'inside']
picture_df = picture_df[picture_df['label'] != 'outside']
picture_df = picture_df[picture_df['label'] != 'menu']

# Keep only photos that belong to the filtered restaurants
picture_df = picture_df[picture_df['business_id'].isin(restaurant_df['business_id'])]

# Print statistics
print("Number of food establishments:", len(restaurant_df))
print("Number of photos for food establishments:", len(picture_df))
print("\nSample categories in filtered dataset:")
print(restaurant_df['categories'].sample(5))



Number of food establishments: 26435
Number of photos for food establishments: 43037

Sample categories in filtered dataset:
81179                                         Food, Grocery
83841     Food, Sandwiches, Restaurants, Ice Cream & Fro...
9719                     Fast Food, Sandwiches, Restaurants
137339                          Food, Southern, Restaurants
49820     Fast Food, Coffee & Tea, Food, Restaurants, Bu...
Name: categories, dtype: object


In [13]:
# New cell for validation
# Check if all pictures link to valid restaurants
valid_business_ids = set(restaurant_df['business_id'])
invalid_pictures = picture_df[~picture_df['business_id'].isin(valid_business_ids)]

print("Validation Results:")
print(f"Total pictures: {len(picture_df)}")
print(f"Pictures with valid restaurant links: {len(picture_df) - len(invalid_pictures)}")
print(f"Pictures with invalid restaurant links: {len(invalid_pictures)}")

if len(invalid_pictures) > 0:
    print("\nSample of invalid picture entries:")
    print(invalid_pictures.head())
else:
    print("\nAll pictures are linked to valid restaurants!")

# Optional: Remove any invalid pictures if found
if len(invalid_pictures) > 0:
    picture_df = picture_df[picture_df['business_id'].isin(valid_business_ids)]
    print(f"\nCleaned dataset now contains {len(picture_df)} valid pictures")

restaurant_df = restaurant_df[['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'categories', 'latitude', 'longitude', 'stars']]
print(restaurant_df.columns)



Validation Results:
Total pictures: 43037
Pictures with valid restaurant links: 43037
Pictures with invalid restaurant links: 0

All pictures are linked to valid restaurants!
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'categories', 'latitude', 'longitude', 'stars'],
      dtype='object')


In [14]:
# encode categories column as binary features and update the restaurant dataframe
# keep business_id, latitude, longitude, stars, and category columns for use in the recommender system
categories = restaurant_df['categories'].str.get_dummies(sep=', ')
restaurant_df = pd.concat([restaurant_df, categories], axis=1)
restaurant_df.drop(columns=['categories'], inplace=True)

In [15]:
print(restaurant_df.columns)
print(restaurant_df.head(5))
print(restaurant_df.shape)

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'Acai Bowls',
       ...
       'Vinyl Records', 'Vitamins & Supplements', 'Waffles', 'Water Stores',
       'Wholesale Stores', 'Wigs', 'Wine & Spirits', 'Women's Clothing',
       'Wraps', 'Yelp Events'],
      dtype='object', length=328)
               business_id                      name              address  \
3   MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries          935 Race St   
4   mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery        101 Walnut St   
5   CF33F8-E6oudUQ46HnavjQ            Sonic Drive-In        615 S Main St   
9   bBDDEgkFA1Otx9Lfe7BZUQ            Sonic Drive-In  2312 Dickerson Pike   
11  eEOYSgkmpB90uNA7lDOMRA     Vietnamese Food Truck                        

            city state postal_code   latitude  longitude  stars  Acai Bowls  \
3   Philadelphia    PA       19107  39.955505 -75.155564    4.0           0   
4     Green Lane    PA  

In [16]:
# # save the cleaned datasets as csv's for use in the recommender system
# # save the cleaned datasets as jsons to store in database
restaurant_df_model_training = restaurant_df.copy().drop(columns=['name', 'address', 'city', 'state', 'postal_code'])
picture_df.to_csv('../data/cleaned_photos.csv', index=False)

restaurant_df_database = restaurant_df.copy()
picture_df.to_json('../data/cleaned_photos.json', orient='records', lines=True)

restaurant_df_database.to_json('../data/cleaned_restaurants.json', orient='records', lines=True)
restaurant_df_model_training.to_csv('../data/cleaned_restaurants.csv', index=False)

In [17]:
# parse photos folder and remove any photos that are not in the cleaned dataset
import os

# Create a set of valid photo IDs
valid_photo_ids = set(picture_df['photo_id'])

# Define the path to the photos folder
photos_folder = '../data/photos'

# Iterate over all files in the photos folder
for filename in os.listdir(photos_folder):
    # Extract the photo ID from the filename
    photo_id = filename.split('.')[0]
    
    # Check if the photo ID is not in the valid set
    if photo_id not in valid_photo_ids:
        # Construct the full path to the file
        file_path = os.path.join(photos_folder, filename)
        
        # Remove the file
        os.remove(file_path)
        print(f"Removed invalid photo: {filename}")
        
# Check the number of photos in the folder
print(f"Remaining photos in the folder: {len(os.listdir(photos_folder))}")

Remaining photos in the folder: 39212
