In [107]:
import json
import numpy as np
import pandas as pd

In [108]:
photo_datalist = []
with open('../data/photos.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        photo_datalist.append(data)

restaurant_datalist = []
with open('../data/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        restaurant_datalist.append(data)
        

In [109]:
# clean photos
# create dataframe
picture_df = pd.DataFrame(photo_datalist)
restaurant_df = pd.DataFrame(restaurant_datalist)
print(picture_df.head(5))
print(restaurant_df["attributes"].head(5))


                 photo_id             business_id  \
0  zsvj7vloL4L5jhYyPIuVwg  Nk-SJhPlDBkAZvfsADtccA   
1  HCUdRJHHm_e0OCTlZetGLg  yVZtL5MmrpiivyCIrVkGgA   
2  vkr8T0scuJmGVvN2HJelEA  _ab50qdWOk0DdB6XOrBitw   
3  pve7D6NUrafHW3EAORubyw  SZU9c8V2GuREDN5KgyHFJw   
4  H52Er-uBg6rNrHcReWTD2w  Gzur0f0XMkrVxIwYJvOt2g   

                                             caption    label  
0  Nice rock artwork everywhere and craploads of ...   inside  
1                                                     outside  
2                                     oyster shooter    drink  
3                                      Shrimp scampi     food  
4                                                        food  
0                        {'ByAppointmentOnly': 'True'}
1               {'BusinessAcceptsCreditCards': 'True'}
2    {'BikeParking': 'True', 'BusinessAcceptsCredit...
3    {'RestaurantsDelivery': 'False', 'OutdoorSeati...
4    {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
Name: attributes, dtyp

In [110]:
import ast

def clean_and_encode_restaurant_attributes(df):
    """
    Clean and encode restaurant attributes for a recommendation model.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing business data with 'attributes' column
    
    Returns:
    pandas.DataFrame: DataFrame with cleaned and encoded attributes
    """
    # Make a copy to avoid modifying the original dataframe
    df_clean = df.copy()
    
    # Step 1: Parse the attributes column if it's a string and not None
    def parse_attributes(attr):
        if attr is None:
            return None
        if isinstance(attr, str):
            try:
                # Handle single quotes used in JSON
                attr = attr.replace("'", '"')
                # Handle cases where string starts with u
                attr = attr.replace('u"', '"')
                return json.loads(attr)
            except json.JSONDecodeError:
                try:
                    # Alternative parsing with ast.literal_eval
                    return ast.literal_eval(attr)
                except:
                    return None
        return attr
    
    # Apply parsing to the attributes column
    df_clean['attributes'] = df_clean['attributes'].apply(parse_attributes)
    
    # Step 2: Filter out non-restaurants
    # We'll consider a business a restaurant if it has any of the restaurant-specific attributes
    restaurant_attrs = ['RestaurantsPriceRange2', 'RestaurantsAttire', 'NoiseLevel', 
                       'RestaurantsGoodForGroups', 'Alcohol', 'RestaurantsReservations']
    
    def is_restaurant(attr):
        if attr is None:
            return False
        for rest_attr in restaurant_attrs:
            if rest_attr in attr:
                return True
        return False
    
    # Filter businesses that have at least one restaurant attribute
    df_clean = df_clean[df_clean['attributes'].apply(is_restaurant)]
    
    # Step 3: Extract and encode each specific attribute
    
    # 3.1: RestaurantsPriceRange2 (numeric 1-4)
    def extract_price_range(attr):
        if attr is None or 'RestaurantsPriceRange2' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsPriceRange2']
            if isinstance(value, str) and value.isdigit():
                return int(value)
            return np.nan
        except:
            return np.nan
    
    df_clean['price_range'] = df_clean['attributes'].apply(extract_price_range)
    
    # 3.2: RestaurantsAttire (casual, dressy, formal)
    def extract_attire(attr):
        if attr is None or 'RestaurantsAttire' not in attr:
            return 'unknown'
        try:
            value = attr['RestaurantsAttire']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['attire'] = df_clean['attributes'].apply(extract_attire)
    
    # One-hot encode attire
    attire_dummies = pd.get_dummies(df_clean['attire'], prefix='attire')
    df_clean = pd.concat([df_clean, attire_dummies], axis=1)
    
    # 3.3: NoiseLevel (quiet, average, loud, very_loud)
    def extract_noise_level(attr):
        if attr is None or 'NoiseLevel' not in attr:
            return 'unknown'
        try:
            value = attr['NoiseLevel']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['noise_level'] = df_clean['attributes'].apply(extract_noise_level)
    
    # One-hot encode noise level
    noise_dummies = pd.get_dummies(df_clean['noise_level'], prefix='noise')
    df_clean = pd.concat([df_clean, noise_dummies], axis=1)
    
    # 3.4: RestaurantsGoodForGroups (boolean)
    def extract_good_for_groups(attr):
        if attr is None or 'RestaurantsGoodForGroups' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsGoodForGroups']
            if isinstance(value, bool):
                return 1 if value else 0
            if isinstance(value, str):
                return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    df_clean['good_for_groups'] = df_clean['attributes'].apply(extract_good_for_groups)
    
    # 3.5: Alcohol (none, beer_and_wine, full_bar)
    def extract_alcohol(attr):
        if attr is None or 'Alcohol' not in attr:
            return 'unknown'
        try:
            value = attr['Alcohol']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['alcohol'] = df_clean['attributes'].apply(extract_alcohol)
    
    # One-hot encode alcohol
    alcohol_dummies = pd.get_dummies(df_clean['alcohol'], prefix='alcohol')
    df_clean = pd.concat([df_clean, alcohol_dummies], axis=1)
    
    # 3.6: RestaurantsReservations (boolean)
    def extract_reservations(attr):
        if attr is None or 'RestaurantsReservations' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsReservations']
            if isinstance(value, bool):
                return 1 if value else 0
            if isinstance(value, str):
                return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    df_clean['takes_reservations'] = df_clean['attributes'].apply(extract_reservations)
    
    # 3.7: Ambience (dictionary with multiple boolean values)
    # For Ambience, we'll extract common ambience attributes
    ambience_attributes = ['romantic', 'intimate', 'touristy', 'hipster', 'divey', 
                          'classy', 'trendy', 'upscale', 'casual']
    
    def extract_ambience(attr, amb_attr):
        if attr is None or 'Ambience' not in attr:
            return np.nan
        try:
            ambience = attr['Ambience']
            
            # Handle the case where Ambience is a string representation of a dictionary
            if isinstance(ambience, str):
                if ambience.lower() == 'none':
                    return np.nan
                try:
                    # Try to parse the string as a dictionary
                    ambience = ambience.replace("'", '"')
                    ambience = json.loads(ambience)
                except:
                    try:
                        ambience = ast.literal_eval(ambience)
                    except:
                        return np.nan
            
            # Now extract the specific ambience attribute
            if amb_attr in ambience:
                value = ambience[amb_attr]
                if value is None:
                    return np.nan
                if isinstance(value, bool):
                    return 1 if value else 0
                if isinstance(value, str):
                    return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    # Add columns for each ambience attribute
    for amb_attr in ambience_attributes:
        df_clean[f'ambience_{amb_attr}'] = df_clean['attributes'].apply(
            lambda x: extract_ambience(x, amb_attr))
    
    # Fill NaN values with appropriate defaults
    df_clean['price_range'] = df_clean['price_range'].fillna(df_clean['price_range'].mode()[0])
    df_clean['good_for_groups'] = df_clean['good_for_groups'].fillna(0)
    df_clean['takes_reservations'] = df_clean['takes_reservations'].fillna(0)
    
    # Fill NaN values in ambience columns
    for amb_attr in ambience_attributes:
        df_clean[f'ambience_{amb_attr}'] = df_clean[f'ambience_{amb_attr}'].fillna(0)
    
    # Drop the original attributes column and intermediate columns
    columns_to_drop = ['attributes', 'attire', 'noise_level', 'alcohol']
    df_clean = df_clean.drop(columns=columns_to_drop)
    
    return df_clean

In [111]:
restaurant_df = clean_and_encode_restaurant_attributes(restaurant_df)
print(restaurant_df.columns)


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours', 'price_range', 'attire_casal', 'attire_dressy',
       'attire_formal', 'attire_none', 'attire_unknown', 'noise_average',
       'noise_lod', 'noise_none', 'noise_qiet', 'noise_unknown',
       'noise_very_lod', 'good_for_groups', 'alcohol_beer_and_wine',
       'alcohol_fll_bar', 'alcohol_none', 'alcohol_unknown',
       'takes_reservations', 'ambience_romantic', 'ambience_intimate',
       'ambience_touristy', 'ambience_hipster', 'ambience_divey',
       'ambience_classy', 'ambience_trendy', 'ambience_upscale',
       'ambience_casual'],
      dtype='object')


In [112]:
# # NEW: Exclude additional non-food related businesses
non_food_terms = '|'.join(['Salon', 'Barber', 'Gym', 'Spa', 'Theater', 'Nightlife', 'Beauty', 'Barbershop', "Active Life",
    "Automotive",
    "Beauty & Spas",
    "Home Services",
    "Health & Medical",
    "Hotels & Travel",
    "Local Services",
    "Professional Services",
    "Public Services & Government",
    "Real Estate",
    "Religious Organizations",
    "Shopping & Retail",
    "Transportation",
    "Arts & Entertainment",
    "Event Planning & Services",
    "Education",
    "Financial Services",
    "Nightlife",
    "Pets & Animal Services",
    "Sports & Recreation",
    "Miscellaneous Services"
])
restaurant_df = restaurant_df[~restaurant_df['categories'].str.contains(non_food_terms, case=False, na=False)]
restaurant_df = restaurant_df[restaurant_df['categories'].str.contains('Food')]
# clean photos dataset
picture_df = picture_df[picture_df['label'] != 'inside']
picture_df = picture_df[picture_df['label'] != 'outside']
picture_df = picture_df[picture_df['label'] != 'menu']

# Keep only photos that belong to the filtered restaurants
picture_df = picture_df[picture_df['business_id'].isin(restaurant_df['business_id'])]

# Print statistics
print("Number of food establishments:", len(restaurant_df))
print("Number of photos for food establishments:", len(picture_df))
print("\nSample categories in filtered dataset:")
print(restaurant_df['categories'].sample(5))
print(restaurant_df.columns)


Number of food establishments: 23688
Number of photos for food establishments: 40062

Sample categories in filtered dataset:
55336    Food, Specialty Food, Candy Stores, Souvenir S...
49793                  Food, Restaurants, Kosher, Bakeries
63140     Desserts, Bakeries, Food, Cupcakes, Custom Cakes
87427    Soul Food, Barbeque, Restaurants, Caribbean, S...
44958                                 Food, Farmers Market
Name: categories, dtype: object
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours', 'price_range', 'attire_casal', 'attire_dressy',
       'attire_formal', 'attire_none', 'attire_unknown', 'noise_average',
       'noise_lod', 'noise_none', 'noise_qiet', 'noise_unknown',
       'noise_very_lod', 'good_for_groups', 'alcohol_beer_and_wine',
       'alcohol_fll_bar', 'alcohol_none', 'alcohol_unknown',
       'takes_reservations', 'ambience_romantic', 'ambience_

In [113]:
# New cell for validation
# Check if all pictures link to valid restaurants
valid_business_ids = set(restaurant_df['business_id'])
invalid_pictures = picture_df[~picture_df['business_id'].isin(valid_business_ids)]

print("Validation Results:")
print(f"Total pictures: {len(picture_df)}")
print(f"Pictures with valid restaurant links: {len(picture_df) - len(invalid_pictures)}")
print(f"Pictures with invalid restaurant links: {len(invalid_pictures)}")

if len(invalid_pictures) > 0:
    print("\nSample of invalid picture entries:")
    print(invalid_pictures.head())
else:
    print("\nAll pictures are linked to valid restaurants!")

# Optional: Remove any invalid pictures if found
if len(invalid_pictures) > 0:
    picture_df = picture_df[picture_df['business_id'].isin(valid_business_ids)]
    print(f"\nCleaned dataset now contains {len(picture_df)} valid pictures")




Validation Results:
Total pictures: 40062
Pictures with valid restaurant links: 40062
Pictures with invalid restaurant links: 0

All pictures are linked to valid restaurants!


In [114]:
# encode categories column as binary features and update the restaurant dataframe
# keep business_id, latitude, longitude, stars, and category columns for use in the recommender system
categories = restaurant_df['categories'].str.get_dummies(sep=', ')
restaurant_df = pd.concat([restaurant_df, categories], axis=1)
restaurant_df.drop(columns=['categories', 'hours'], inplace=True)

In [115]:
print(restaurant_df.columns)
print(restaurant_df.head(5))
print(restaurant_df.shape)

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count',
       ...
       'Vinyl Records', 'Vitamins & Supplements', 'Waffles', 'Water Stores',
       'Wholesale Stores', 'Wigs', 'Wine & Spirits', 'Women's Clothing',
       'Wraps', 'Yelp Events'],
      dtype='object', length=345)
               business_id                   name              address  \
3   MTSW4McQd7CbVtyjqoe9mw     St Honore Pastries          935 Race St   
5   CF33F8-E6oudUQ46HnavjQ         Sonic Drive-In        615 S Main St   
9   bBDDEgkFA1Otx9Lfe7BZUQ         Sonic Drive-In  2312 Dickerson Pike   
11  eEOYSgkmpB90uNA7lDOMRA  Vietnamese Food Truck                        
14  0bPLkL0QhhPO5kt1_EXmNQ   Zio's Italian Market        2575 E Bay Dr   

            city state postal_code   latitude  longitude  stars  review_count  \
3   Philadelphia    PA       19107  39.955505 -75.155564    4.0            80   
5   Ashland City    TN       37015  

In [116]:
# save the cleaned datasets as csv's for use in the recommender system
# save the cleaned datasets as jsons to store in database
restaurant_df_model_training = restaurant_df.copy()
restaurant_df_model_training = restaurant_df_model_training.drop(columns=['name', 'address', 'city', 'state', 'postal_code'])
picture_df.to_csv('../data/cleaned_photos.csv', index=False)

restaurant_df_database = restaurant_df.copy()
restaurant_df_database = restaurant_df_database[['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars']]
picture_df.to_json('../data/cleaned_photos.json', orient='records', lines=True)

restaurant_df_database.to_json('../data/cleaned_restaurants.json', orient='records', lines=True)
restaurant_df_model_training.to_csv('../data/cleaned_restaurants.csv', index=False)

In [117]:
# parse photos folder and remove any photos that are not in the cleaned dataset
import os

# Create a set of valid photo IDs
valid_photo_ids = set(picture_df['photo_id'])

# Define the path to the photos folder
photos_folder = '../data/photos/photos'

# Iterate over all files in the photos folder
for filename in os.listdir(photos_folder):
    # Extract the photo ID from the filename
    photo_id = filename.split('.')[0]
    
    # Check if the photo ID is not in the valid set
    if photo_id not in valid_photo_ids:
        # Construct the full path to the file
        file_path = os.path.join(photos_folder, filename)
        
        # Remove the file
        os.remove(file_path)
        print(f"Removed invalid photo: {filename}")
        
# Check the number of photos in the folder
print(f"Remaining photos in the folder: {len(os.listdir(photos_folder))}")

FileNotFoundError: [Errno 2] No such file or directory: '../data/photos/photos'