In [435]:
import json
import numpy as np
import pandas as pd

In [436]:
photo_datalist = []
with open('../data/photos.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        photo_datalist.append(data)

restaurant_datalist = []
with open('../data/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        restaurant_datalist.append(data)

In [437]:
# clean photos
# create dataframe
picture_df = pd.DataFrame(photo_datalist)
restaurant_df = pd.DataFrame(restaurant_datalist)
michelin_df = pd.read_csv('../data/michelin.csv')
print(restaurant_df['attributes'])
print(michelin_df.columns)


0                             {'ByAppointmentOnly': 'True'}
1                    {'BusinessAcceptsCreditCards': 'True'}
2         {'BikeParking': 'True', 'BusinessAcceptsCredit...
3         {'RestaurantsDelivery': 'False', 'OutdoorSeati...
4         {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
                                ...                        
150341    {'ByAppointmentOnly': 'False', 'RestaurantsPri...
150342    {'BusinessAcceptsCreditCards': 'True', 'Restau...
150343    {'RestaurantsPriceRange2': '1', 'BusinessAccep...
150344    {'BusinessParking': '{'garage': False, 'street...
150345    {'WheelchairAccessible': 'True', 'BusinessAcce...
Name: attributes, Length: 150346, dtype: object
Index(['Name', 'Address', 'Location', 'Price', 'Cuisine', 'Longitude',
       'Latitude', 'PhoneNumber', 'Url', 'WebsiteUrl', 'Award', 'GreenStar',
       'FacilitiesAndServices', 'Description'],
      dtype='object')


In [438]:
# clean michelin
michelin_df = michelin_df.drop(columns=['PhoneNumber', 'Url', 'WebsiteUrl', 'Award', 'GreenStar',
       'FacilitiesAndServices', 'Description', 'Location'])
# encode prices based on number of characters
michelin_df["Price"] = michelin_df["Price"].apply(lambda x: str(len(str(x))))
michelin_df = michelin_df.rename(columns={"Price": "RestaurantsPriceRange2", "Name": "name", "City": "city", "Zip": "postal_code", "Latitude": "latitude", "Longitude": "longitude", "Cuisine": "categories", "Address": "address"})
# add price range information
michelin_df["attributes"] = michelin_df["RestaurantsPriceRange2"].apply(lambda x: {"RestaurantsPriceRange2": x})
michelin_df['is_michelin'] = True
restaurant_df['is_michelin'] = False
# generate random high ratings for michelin restaurants
michelin_df['stars'] = np.random.uniform(4, 5, michelin_df.shape[0])
# generate business_ids for michelin
michelin_df['business_id'] = np.arange(len(michelin_df))
# union michelin and restaurant
restaurant_df = restaurant_df.rename(columns={"name": "name", "city": "city", "postal_code": "postal_code", "latitude": "latitude", "longitude": "longitude", "price": "price", "categories": "categories", "address": "address"})
# merge michelin and restaurant
restaurant_df = pd.concat([restaurant_df, michelin_df])
print(restaurant_df.head(5))

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0           7.0      0.0   
1  38.551126  -90.335695    3.0          15.0      1.0   
2  32.223236 -110.880452    3.5          22.0      0.0   
3  39.9555

In [439]:
import ast

def clean_and_encode_restaurant_attributes(df):
    """
    Clean and encode restaurant attributes for a recommendation model.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing business data with 'attributes' column
    
    Returns:
    pandas.DataFrame: DataFrame with cleaned and encoded attributes
    """
    # Make a copy to avoid modifying the original dataframe
    df_clean = df.copy()
    
    # Step 1: Parse the attributes column if it's a string and not None
    def parse_attributes(attr):
        if attr is None:
            return None
        if isinstance(attr, str):
            try:
                # Handle single quotes used in JSON
                attr = attr.replace("'", '"')
                # Handle cases where string starts with u
                attr = attr.replace('u"', '"')
                return json.loads(attr)
            except json.JSONDecodeError:
                try:
                    # Alternative parsing with ast.literal_eval
                    return ast.literal_eval(attr)
                except:
                    return None
        return attr
    
    # Apply parsing to the attributes column
    df_clean['attributes'] = df_clean['attributes'].apply(parse_attributes)
    
    # Step 2: Filter out non-restaurants
    # We'll consider a business a restaurant if it has any of the restaurant-specific attributes
    restaurant_attrs = ['RestaurantsPriceRange2', 'RestaurantsAttire', 'NoiseLevel', 
                       'RestaurantsGoodForGroups', 'Alcohol', 'RestaurantsReservations']
    
    def is_restaurant(attr):
        if attr is None:
            return False
        for rest_attr in restaurant_attrs:
            if rest_attr in attr:
                return True
        return False
    
    # Filter businesses that have at least one restaurant attribute
    df_clean = df_clean[df_clean['attributes'].apply(is_restaurant)]
    
    # Step 3: Extract and encode each specific attribute
    
    # 3.1: RestaurantsPriceRange2 (numeric 1-4)
    def extract_price_range(attr):
        if attr is None or 'RestaurantsPriceRange2' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsPriceRange2']
            if isinstance(value, str) and value.isdigit():
                return int(value)
            return np.nan
        except:
            return np.nan
    
    df_clean['price_range'] = df_clean['attributes'].apply(extract_price_range)
    
    # 3.2: RestaurantsAttire (casual, dressy, formal)
    def extract_attire(attr):
        if attr is None or 'RestaurantsAttire' not in attr:
            return 'unknown'
        try:
            value = attr['RestaurantsAttire']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['attire'] = df_clean['attributes'].apply(extract_attire)
    
    # One-hot encode attire
    attire_dummies = pd.get_dummies(df_clean['attire'], prefix='attire')
    df_clean = pd.concat([df_clean, attire_dummies], axis=1)
    
    # 3.3: NoiseLevel (quiet, average, loud, very_loud)
    def extract_noise_level(attr):
        if attr is None or 'NoiseLevel' not in attr:
            return 'unknown'
        try:
            value = attr['NoiseLevel']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['noise_level'] = df_clean['attributes'].apply(extract_noise_level)
    
    # One-hot encode noise level
    noise_dummies = pd.get_dummies(df_clean['noise_level'], prefix='noise')
    df_clean = pd.concat([df_clean, noise_dummies], axis=1)
    
    # 3.4: RestaurantsGoodForGroups (boolean)
    def extract_good_for_groups(attr):
        if attr is None or 'RestaurantsGoodForGroups' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsGoodForGroups']
            if isinstance(value, bool):
                return 1 if value else 0
            if isinstance(value, str):
                return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    df_clean['good_for_groups'] = df_clean['attributes'].apply(extract_good_for_groups)
    
    # 3.5: Alcohol (none, beer_and_wine, full_bar)
    def extract_alcohol(attr):
        if attr is None or 'Alcohol' not in attr:
            return 'unknown'
        try:
            value = attr['Alcohol']
            # Clean up the string
            value = value.lower().replace("'", "").replace("u", "").replace('"', '')
            return value
        except:
            return 'unknown'
    
    df_clean['alcohol'] = df_clean['attributes'].apply(extract_alcohol)
    
    # One-hot encode alcohol
    alcohol_dummies = pd.get_dummies(df_clean['alcohol'], prefix='alcohol')
    df_clean = pd.concat([df_clean, alcohol_dummies], axis=1)
    
    # 3.6: RestaurantsReservations (boolean)
    def extract_reservations(attr):
        if attr is None or 'RestaurantsReservations' not in attr:
            return np.nan
        try:
            value = attr['RestaurantsReservations']
            if isinstance(value, bool):
                return 1 if value else 0
            if isinstance(value, str):
                return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    df_clean['takes_reservations'] = df_clean['attributes'].apply(extract_reservations)
    
    # 3.7: Ambience (dictionary with multiple boolean values)
    # For Ambience, we'll extract common ambience attributes
    ambience_attributes = ['romantic', 'intimate', 'touristy', 'hipster', 'divey', 
                          'classy', 'trendy', 'upscale', 'casual']
    
    def extract_ambience(attr, amb_attr):
        if attr is None or 'Ambience' not in attr:
            return np.nan
        try:
            ambience = attr['Ambience']
            
            # Handle the case where Ambience is a string representation of a dictionary
            if isinstance(ambience, str):
                if ambience.lower() == 'none':
                    return np.nan
                try:
                    # Try to parse the string as a dictionary
                    ambience = ambience.replace("'", '"')
                    ambience = json.loads(ambience)
                except:
                    try:
                        ambience = ast.literal_eval(ambience)
                    except:
                        return np.nan
            
            # Now extract the specific ambience attribute
            if amb_attr in ambience:
                value = ambience[amb_attr]
                if value is None:
                    return np.nan
                if isinstance(value, bool):
                    return 1 if value else 0
                if isinstance(value, str):
                    return 1 if value.lower() == 'true' else 0
            return np.nan
        except:
            return np.nan
    
    # Add columns for each ambience attribute
    for amb_attr in ambience_attributes:
        df_clean[f'ambience_{amb_attr}'] = df_clean['attributes'].apply(
            lambda x: extract_ambience(x, amb_attr))
    
    # Fill NaN values with appropriate defaults
    df_clean['price_range'] = df_clean['price_range'].fillna(df_clean['price_range'].mode()[0])
    df_clean['good_for_groups'] = df_clean['good_for_groups'].fillna(0)
    df_clean['takes_reservations'] = df_clean['takes_reservations'].fillna(0)
    
    # Fill NaN values in ambience columns
    for amb_attr in ambience_attributes:
        df_clean[f'ambience_{amb_attr}'] = df_clean[f'ambience_{amb_attr}'].fillna(0)
    
    # Drop the original attributes column and intermediate columns
    columns_to_drop = ['attributes', 'attire', 'noise_level', 'alcohol']
    df_clean = df_clean.drop(columns=columns_to_drop)
    
    return df_clean

In [440]:
restaurant_df = clean_and_encode_restaurant_attributes(restaurant_df)
print(restaurant_df.columns)


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours', 'is_michelin', 'RestaurantsPriceRange2',
       'price_range', 'attire_casal', 'attire_dressy', 'attire_formal',
       'attire_none', 'attire_unknown', 'noise_average', 'noise_lod',
       'noise_none', 'noise_qiet', 'noise_unknown', 'noise_very_lod',
       'good_for_groups', 'alcohol_beer_and_wine', 'alcohol_fll_bar',
       'alcohol_none', 'alcohol_unknown', 'takes_reservations',
       'ambience_romantic', 'ambience_intimate', 'ambience_touristy',
       'ambience_hipster', 'ambience_divey', 'ambience_classy',
       'ambience_trendy', 'ambience_upscale', 'ambience_casual'],
      dtype='object')


In [441]:
# # NEW: Exclude additional non-food related businesses
non_food_terms = '|'.join(['Salon', 'Barber', 'Gym', 'Spa', 'Theater', 'Nightlife', 'Beauty', 'Barbershop', "Active Life",
    "Automotive",
    "Beauty & Spas",
    "Home Services",
    "Health & Medical",
    "Hotels & Travel",
    "Local Services",
    "Professional Services",
    "Public Services & Government",
    "Real Estate",
    "Religious Organizations",
    "Shopping & Retail",
    "Transportation",
    "Arts & Entertainment",
    "Event Planning & Services",
    "Education",
    "Financial Services",
    "Nightlife",
    "Pets & Animal Services",
    "Sports & Recreation",
    "Miscellaneous Services"
])
# Apply filter only to non-Michelin restaurants
non_michelin_mask = ~restaurant_df['is_michelin']
food_mask = restaurant_df['categories'].str.contains('Food', case=False, na=False)
non_food_mask = restaurant_df['categories'].str.contains(non_food_terms, case=False, na=False)

# Keep all Michelin restaurants and apply filter to non-Michelin ones
restaurant_df = restaurant_df[
    restaurant_df['is_michelin'] | 
    (non_michelin_mask & food_mask & ~non_food_mask)
]
picture_df = picture_df[picture_df['label'] != 'inside']
picture_df = picture_df[picture_df['label'] != 'outside']
picture_df = picture_df[picture_df['label'] != 'menu']

# Keep only photos that belong to the filtered restaurants
picture_df = picture_df[picture_df['business_id'].isin(restaurant_df['business_id'])]

# Print statistics
print("Number of food establishments:", len(restaurant_df))
print("Number of photos for food establishments:", len(picture_df))
print("\nSample categories in filtered dataset:")
print(restaurant_df['categories'].sample(5))
print(restaurant_df.columns)


Number of food establishments: 42949
Number of photos for food establishments: 45207

Sample categories in filtered dataset:
2766                      Modern Cuisine, Seasonal Cuisine
14027                                   Smørrebrød, Danish
17588                             Creative, Modern Cuisine
6694     Food, Do-It-Yourself Food, Breakfast & Brunch,...
13093                                              Italian
Name: categories, dtype: object
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours', 'is_michelin', 'RestaurantsPriceRange2',
       'price_range', 'attire_casal', 'attire_dressy', 'attire_formal',
       'attire_none', 'attire_unknown', 'noise_average', 'noise_lod',
       'noise_none', 'noise_qiet', 'noise_unknown', 'noise_very_lod',
       'good_for_groups', 'alcohol_beer_and_wine', 'alcohol_fll_bar',
       'alcohol_none', 'alcohol_unknown', 'takes_reser

In [442]:
# New cell for validation
# Check if all pictures link to valid restaurants
# get business_ids of non-michelin restaurants
valid_business_ids = set(restaurant_df[~restaurant_df['is_michelin']]['business_id'])
invalid_pictures = picture_df[~picture_df['business_id'].isin(valid_business_ids)]

print("Validation Results:")
print(f"Total pictures: {len(picture_df)}")
print(f"Pictures with valid restaurant links: {len(picture_df) - len(invalid_pictures)}")
print(f"Pictures with invalid restaurant links: {len(invalid_pictures)}")

if len(invalid_pictures) > 0:
    print("\nSample of invalid picture entries:")
    print(invalid_pictures.head())
else:
    print("\nAll pictures are linked to valid restaurants!")

# Optional: Remove any invalid pictures if found
if len(invalid_pictures) > 0:
    picture_df = picture_df[picture_df['business_id'].isin(valid_business_ids)]
    print(f"\nCleaned dataset now contains {len(picture_df)} valid pictures")




Validation Results:
Total pictures: 45207
Pictures with valid restaurant links: 45207
Pictures with invalid restaurant links: 0

All pictures are linked to valid restaurants!


In [443]:
# encode categories column as binary features and update the restaurant dataframe
# keep business_id, latitude, longitude, stars, and category columns for use in the recommender system
categories = restaurant_df['categories'].str.get_dummies(sep=', ').replace({np.nan: 0})
restaurant_df = pd.concat([restaurant_df, categories], axis=1)
restaurant_df.drop(columns=['categories', 'hours'], inplace=True)


In [444]:
print(restaurant_df.columns)
print(restaurant_df.head(5))
print(restaurant_df.shape)

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count',
       ...
       'World Cuisine', 'Wraps', 'Xibei', 'Xinjiang', 'Yakitori',
       'Yelp Events', 'Yoshoku', 'Yukhoe', 'Yunnanese', 'Zhejiang'],
      dtype='object', length=562)
               business_id                   name              address  \
3   MTSW4McQd7CbVtyjqoe9mw     St Honore Pastries          935 Race St   
5   CF33F8-E6oudUQ46HnavjQ         Sonic Drive-In        615 S Main St   
9   bBDDEgkFA1Otx9Lfe7BZUQ         Sonic Drive-In  2312 Dickerson Pike   
11  eEOYSgkmpB90uNA7lDOMRA  Vietnamese Food Truck                        
14  0bPLkL0QhhPO5kt1_EXmNQ   Zio's Italian Market        2575 E Bay Dr   

            city state postal_code   latitude  longitude  stars  review_count  \
3   Philadelphia    PA       19107  39.955505 -75.155564    4.0          80.0   
5   Ashland City    TN       37015  36.269593 -87.058943    2.0           6.0   
9 

In [445]:
# New cell to handle NaN values before saving
print("Checking for NaN values before saving the dataset:")
print(restaurant_df.isna().sum().sort_values(ascending=False).head(10))

# Identify numeric columns for appropriate filling
numeric_cols = restaurant_df.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_cols = restaurant_df.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nHandling {len(numeric_cols)} numeric columns and {len(non_numeric_cols)} non-numeric columns")

# Fill numeric columns with 0 (could also use mean/median if appropriate)
restaurant_df[numeric_cols] = restaurant_df[numeric_cols].fillna(0)

# Fill non-numeric columns with empty string
restaurant_df[non_numeric_cols] = restaurant_df[non_numeric_cols].fillna('')

# Final check to ensure no NaN values remain
remaining_nans = restaurant_df.isna().sum().sum()
print(f"\nRemaining NaN values after cleaning: {remaining_nans}")
if remaining_nans > 0:
    print("Columns still containing NaNs:")
    print(restaurant_df.columns[restaurant_df.isna().any()].tolist())

Checking for NaN values before saving the dataset:
RestaurantsPriceRange2    25203
city                      17746
state                     17746
postal_code               17746
review_count              17746
is_open                   17746
business_id                   0
Noodles and Congee            0
Naengmyeon                    0
Nakagyo-ku                    0
dtype: int64

Handling 539 numeric columns and 23 non-numeric columns

Remaining NaN values after cleaning: 0


In [446]:
# parse photos folder and remove any photos that are not in the cleaned dataset
import os

# Create a set of valid photo IDs
valid_photo_ids = set(picture_df['photo_id'])

# Define the path to the photos folder
photos_folder = '../data/photos'

# Iterate over all files in the photos folder
for filename in os.listdir(photos_folder):
    # Extract the photo ID from the filename
    photo_id = filename.split('.')[0]
    
    # Check if the photo ID is not in the valid set
    if photo_id not in valid_photo_ids:
        # Construct the full path to the file
        file_path = os.path.join(photos_folder, filename)
        
        # Remove the file
        os.remove(file_path)
        print(f"Removed invalid photo: {filename}")
        
# Check the number of photos in the folder
print(f"Remaining photos in the folder: {len(os.listdir(photos_folder))}")

Remaining photos in the folder: 40062


In [447]:
print("Implementing dimensionality reduction by removing sparse columns...")

binary_cols = restaurant_df.columns[(restaurant_df.isin([0, 1]).sum() == len(restaurant_df))]
sparsity = 1.0 - (restaurant_df[binary_cols].astype(bool).sum() / len(restaurant_df))

sparsity_threshold = 0.995  # 99.5% zeros (0.5% non-zeros)

sparse_columns = sparsity[sparsity > sparsity_threshold].index.tolist()
print(f"Found {len(sparse_columns)} sparse columns with more than {sparsity_threshold*100:.1f}% zeros")


restaurant_df_before = restaurant_df.shape[1]
restaurant_df = restaurant_df.drop(columns=sparse_columns)
restaurant_df_after = restaurant_df.shape[1]

print(f"\nDimensionality reduction complete:")
print(f"- Before: {restaurant_df_before} columns")
print(f"- After: {restaurant_df_after} columns")
print(f"- Removed: {restaurant_df_before - restaurant_df_after} sparse columns ({(restaurant_df_before - restaurant_df_after) / restaurant_df_before * 100:.1f}% reduction)")

Implementing dimensionality reduction by removing sparse columns...
Found 432 sparse columns with more than 99.5% zeros

Dimensionality reduction complete:
- Before: 562 columns
- After: 130 columns
- Removed: 432 sparse columns (76.9% reduction)


In [448]:
# save restaurants to json for db import and csv for model training
restaurant_df_model_training = restaurant_df.copy().drop(columns=['name', 'address', 'city', 'state', 'postal_code'])
picture_df.to_csv('../data/cleaned_photos.csv', index=False)

restaurant_df_database = restaurant_df.copy()
picture_df.to_json('../data/cleaned_photos.json', orient='records', lines=True)

restaurant_df_database.to_json('../data/cleaned_restaurants.json', orient='records', lines=True)
restaurant_df_model_training.to_csv('../data/cleaned_restaurants.csv', index=False)