In [257]:
import json
import numpy as np
import pandas as pd

In [258]:
photo_datalist = []
with open('../data/photos.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        photo_datalist.append(data)

restaurant_datalist = []
with open('../data/yelp_academic_dataset_business.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        restaurant_datalist.append(data)
        


In [259]:
# clean photos
# create dataframe
picture_df = pd.DataFrame(photo_datalist)
restaurant_df = pd.DataFrame(restaurant_datalist)
michelin_df = pd.read_csv('../data/michelin.csv')
print(restaurant_df['attributes'])

0                             {'ByAppointmentOnly': 'True'}
1                    {'BusinessAcceptsCreditCards': 'True'}
2         {'BikeParking': 'True', 'BusinessAcceptsCredit...
3         {'RestaurantsDelivery': 'False', 'OutdoorSeati...
4         {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
                                ...                        
150341    {'ByAppointmentOnly': 'False', 'RestaurantsPri...
150342    {'BusinessAcceptsCreditCards': 'True', 'Restau...
150343    {'RestaurantsPriceRange2': '1', 'BusinessAccep...
150344    {'BusinessParking': '{'garage': False, 'street...
150345    {'WheelchairAccessible': 'True', 'BusinessAcce...
Name: attributes, Length: 150346, dtype: object


In [260]:
# clean michelin
michelin_df = michelin_df.drop(columns=['PhoneNumber', 'Url', 'WebsiteUrl', 'Award', 'GreenStar', 'Description', 'Location'])
# encode prices based on number of characters
michelin_df["Price"] = michelin_df["Price"].apply(lambda x: str(len(str(x))))
facilities_and_services = michelin_df["FacilitiesAndServices"].apply(lambda x: x.split(",") if isinstance(x, str) else [])
michelin_df["attributes"] = facilities_and_services.apply(lambda x: {item.strip(): True for item in x})
# Process cuisine properly - use apply with a function that has access to row index
for idx, row in michelin_df.iterrows():
    cuisine_list = row["Cuisine"].split(",") if isinstance(row["Cuisine"], str) else []
    cuisine_clean = [c.strip() for c in cuisine_list]
    price = row["Price"]
    # Update attributes for this specific row
    michelin_df.at[idx, "attributes"] = {**michelin_df.at[idx, "attributes"], 
                                         "Cuisine": cuisine_clean,
                                         "RestaurantsPriceRange2": price.strip()}
michelin_df['is_michelin'] = True
restaurant_df['is_michelin'] = False
# generate random high ratings for michelin restaurants
michelin_df['stars'] = np.random.uniform(4, 5, michelin_df.shape[0])
# generate business_ids for michelin
michelin_df['business_id'] = np.arange(len(michelin_df))

In [261]:
# align michelin_df and restaurant_df
restaurant_df = restaurant_df.drop(columns=['hours', 'review_count', 'is_open'])
michelin_df = michelin_df.drop(columns=['FacilitiesAndServices', 'Cuisine', 'Price'])
michelin_df = michelin_df.rename(columns={
    "Name": "name",
    "Address": "address",
    "Longitude": "longitude",
    "Latitude": "latitude",
})
# combine address, city, state, postal_code fields into one column
restaurant_df["address"] = restaurant_df["address"] + ", " + restaurant_df["city"] + ", " + restaurant_df["state"] + " " + restaurant_df["postal_code"]
restaurant_df = restaurant_df.drop(columns=['city', 'state', 'postal_code'])

In [262]:
non_food_terms = '|'.join(['Salon', 'Barber', 'Gym', 'Spa', 'Theater', 'Nightlife', 'Beauty', 'Barbershop', "Active Life",
    "Automotive",
    "Beauty & Spas",
    "Home Services",
    "Health & Medical",
    "Hotels & Travel",
    "Local Services",
    "Professional Services",
    "Public Services & Government",
    "Real Estate",
    "Religious Organizations",
    "Shopping & Retail",
    "Transportation",
    "Arts & Entertainment",
    "Event Planning & Services",
    "Education",
    "Financial Services",
    "Nightlife",
    "Pets & Animal Services",
    "Sports & Recreation",
    "Miscellaneous Services",
    "Shopping", "Women's Clothing", "Fashion"
])

# Filter out non-food related businesses
restaurant_df = restaurant_df[~restaurant_df['categories'].str.contains(non_food_terms, case=False, na=False)]
picture_df = picture_df[picture_df['label'] != 'inside']
picture_df = picture_df[picture_df['label'] != 'outside']
picture_df = picture_df[picture_df['label'] != 'menu']
# Keep only photos that belong to the filtered restaurants
picture_df = picture_df[picture_df['business_id'].isin(restaurant_df['business_id'])]

# Print statistics
print("Number of food establishments:", len(restaurant_df))
print("Number of photos for food establishments:", len(picture_df))
print("\nSample categories in filtered dataset:")
print(restaurant_df['categories'].unique())
print(restaurant_df.columns)

restaurant_df = restaurant_df.drop(columns=["categories"])


Number of food establishments: 51036
Number of photos for food establishments: 81941

Sample categories in filtered dataset:
['Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries'
 'Brewpubs, Breweries, Food'
 'Burgers, Fast Food, Sandwiches, Food, Ice Cream & Frozen Yogurt, Restaurants'
 ... 'Restaurants, Sandwiches, Convenience Stores, Coffee & Tea, Food'
 'Cafes, Juice Bars & Smoothies, Coffee & Tea, Restaurants, Food'
 'Specialty Food, Food, Coffee & Tea, Coffee Roasteries']
Index(['business_id', 'name', 'address', 'latitude', 'longitude', 'stars',
       'attributes', 'categories', 'is_michelin'],
      dtype='object')


In [263]:
# union michelin_df and restaurant_df based on business_id as index
michelin_df = michelin_df.set_index('business_id')
print(michelin_df["attributes"].sample(1).to_list())
restaurant_df = restaurant_df.set_index('business_id')
combined_df = pd.concat([michelin_df, restaurant_df], axis=0)
combined_df = combined_df.reset_index()
print(combined_df.columns.to_list())
print(combined_df["attributes"].sample(10).to_list())
combined_df.to_csv('../data/combined_restaurants.csv', index=False)

[{'Air conditioning': True, 'Car park': True, 'Interesting wine list': True, 'Wheelchair access': True, 'Cuisine': ['Traditional Cuisine'], 'RestaurantsPriceRange2': '3'}]
['business_id', 'name', 'address', 'longitude', 'latitude', 'attributes', 'is_michelin', 'stars']
[{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'RestaurantsTakeOut': 'True', 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'RestaurantsPriceRange2': '1'}, {'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'BusinessAcceptsCreditCards': 'True', 'RestaurantsPriceRange2': '3'}, {'RestaurantsGoodForGroups': 'True', 'OutdoorSeating': 'True', 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': F

In [264]:
import json
import pandas as pd
import numpy as np
import re

def flatten_json(y, parent_key='', sep='_'):
    items = {}
    if isinstance(y, dict):
        for k, v in y.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            # Clean key names: remove special characters, replace spaces
            new_key = re.sub(r'[^a-zA-Z0-9_]', '', new_key.replace(' ', '_'))
            if isinstance(v, str):
                try:
                    # Handle Yelp's specific string format e.g. u'string_value'
                    if v.startswith("u'") and v.endswith("'"):
                        v = v[2:-1]
                    elif v.startswith("'") and v.endswith("'"):
                         v = v[1:-1]
                    
                    # Attempt to parse if it looks like a dict or list string
                    # Use corrected string replacement logic
                    if isinstance(v, str) and v.strip().startswith('{') and v.strip().endswith('}'):
                        # More robust replacement for JSON parsing
                        v_corrected = v.replace("\"", "'") # Temp replace escaped quotes
                        v_corrected = v_corrected.replace("'", "\"") # Replace single with double
                        v_corrected = v_corrected.replace("None", "null").replace("True", "true").replace("False", "false")
                        nested_data = json.loads(v_corrected)
                        items.update(flatten_json(nested_data, new_key, sep=sep))
                    # elif isinstance(v, str) and v.strip().startswith('[') and v.strip().endswith(']'):
                        # Handle lists if necessary - e.g., create dummy vars or join
                        # items[new_key] = v # Placeholder: keep string for now
                    else:
                        items[new_key] = v # Keep as string if not JSON-like or list
                except (json.JSONDecodeError, TypeError):
                    items[new_key] = v # Keep original string if parsing fails
            elif isinstance(v, dict):
                items.update(flatten_json(v, new_key, sep=sep))
            elif isinstance(v, list):
                # Handle lists: Create dummy variables for each item
                for item in v:
                    item_key = re.sub(r'[^a-zA-Z0-9_]', '', str(item).replace(' ', '_'))
                    items[f"{new_key}_{item_key}"] = 1 # Create a flag for each item
            else:
                items[new_key] = v
    return items

def parse_and_flatten(attr_input):
    if isinstance(attr_input, dict): # Already a dict (from michelin data)
        return flatten_json(attr_input)
    if not isinstance(attr_input, str):
        return {}
    try:
        # More robust replacement for initial JSON parsing
        attr_str_corrected = attr_input.replace("\"", "'") # Temp replace escaped quotes
        attr_str_corrected = attr_str_corrected.replace("'", "\"") # Replace single with double
        attr_str_corrected = attr_str_corrected.replace("None", "null").replace("True", "true").replace("False", "false")
        data = json.loads(attr_str_corrected)
        return flatten_json(data)
    except (json.JSONDecodeError, TypeError):
        # If it fails, return empty dict
        # print(f"Failed to parse: {attr_input}") # Optional: for debugging
        return {}

def extract_and_encode_attributes(df):
    """
    Extracts nested JSON attributes, flattens them, encodes them numerically,
    and merges them back into the DataFrame.
    """
    if 'attributes' not in df.columns:
        print("Column 'attributes' not found.")
        return df
        
    # Apply parsing and flattening row-wise
    flattened_attrs = df['attributes'].apply(parse_and_flatten)

    # Create a DataFrame from the flattened attributes
    attrs_df = pd.DataFrame(flattened_attrs.tolist(), index=df.index)
    print(f"Extracted {attrs_df.shape[1]} attribute columns.")

    # Combine with the original DataFrame (excluding the original 'attributes' column)
    df_processed = pd.concat([df.drop(columns=['attributes']), attrs_df], axis=1)

    # --- Encoding ---
    map_dict = {
        'True': 1, 'true': 1, '1': 1, True: 1,
        'False': 0, 'false': 0, '0': 0, False: 0,
        'None': 0, 'none': 0, None: 0, 'null': 0, '': 0,
        'yes': 1, 'no': 0, # Add common boolean words
        # Handle Yelp specific strings like 'full_bar', 'beer_and_wine'
        'full_bar': 1, 'beer_and_wine': 1, # For Alcohol (treat 'none' as 0 above)
        'casual': 0, 'dressy': 1, 'formal': 2, # For Attire (example ordinal)
        'quiet': 0, 'average': 1, 'loud': 2, 'very_loud': 3 # For NoiseLevel (example ordinal)
        # Price range ('RestaurantsPriceRange2') will be handled separately
    }

    new_cols = attrs_df.columns
    object_cols_to_process = df_processed[new_cols].select_dtypes(include=['object']).columns
    
    # Identify the likely price column name (handle potential variations from flattening)
    price_col_name = None
    potential_price_cols = [col for col in new_cols if 'RestaurantsPriceRange' in col]
    if potential_price_cols:
        price_col_name = potential_price_cols[0] # Assume the first match is the one
        print(f"Identified price column as: {price_col_name}")
    else:
        print("Warning: Price range column ('RestaurantsPriceRange2') not found in extracted attributes.")

    for col in object_cols_to_process:
        # *** Skip the price column from general mapping *** 
        if col == price_col_name:
            continue
            
        # Attempt direct mapping first for common boolean/categorical strings
        # Create a temporary mapped series to check effectiveness
        mapped_series = df_processed[col].map(map_dict)
        
        # If a significant portion was mapped, apply it
        if mapped_series.notna().sum() > 0.1 * len(df_processed): # Heuristic: if >10% mapped
             df_processed[col] = mapped_series
        else:
             # If mapping didn't work well, try numeric conversion
             numeric_series = pd.to_numeric(df_processed[col], errors='coerce')
             if numeric_series.notna().sum() > 0.1 * len(df_processed):
                  df_processed[col] = numeric_series
             # else: # Optional: Consider one-hot encoding for low-cardinality categoricals
                 # unique_count = df_processed[col].nunique()
                 # if 1 < unique_count < 20: # Heuristic for one-hot encoding
                 #     print(f"One-hot encoding column: {col}")
                 #     dummies = pd.get_dummies(df_processed[col], prefix=col, dummy_na=False)
                 #     df_processed = pd.concat([df_processed.drop(columns=[col]), dummies], axis=1)
                 # else:
                 #     print(f"Dropping object column with high cardinality or unmappable: {col}")
                 #     df_processed = df_processed.drop(columns=[col], errors='ignore')
    
    # *** Specifically handle the price column *** 
    if price_col_name and price_col_name in df_processed.columns:
        # Convert to numeric, coercing errors (e.g., empty strings) to NaN
        df_processed[price_col_name] = pd.to_numeric(df_processed[price_col_name], errors='coerce')
        # Fill any resulting NaNs (e.g., from non-numeric entries or missing values) with 0
        # Consider if 0 is the best fill value, maybe median/mean if appropriate
        df_processed[price_col_name] = df_processed[price_col_name].fillna(0)
        print(f"Processed price column '{price_col_name}' numerically.")
        
    # Convert original boolean columns (like is_michelin) to int if they exist and are bool type
    for col in df_processed.select_dtypes(include=['bool']).columns:
        df_processed[col] = df_processed[col].astype(int)

    # Fill NaN values - strategy: fill with 0 for numeric/mapped cols
    # This will also cover NaNs in numeric columns not handled above
    numeric_cols = df_processed.select_dtypes(include=np.number).columns
    df_processed[numeric_cols] = df_processed[numeric_cols].fillna(0)

    # Convert float columns that should be int (like mapped booleans, counts, price ranges)
    for col in numeric_cols:
        # Check if column exists and is numeric
        if col in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed[col]):
            # Avoid converting genuine floats like lat/lon/stars by checking if all values are integers
            try:
                # Check after filling NaNs
                is_integer_like = (df_processed[col] % 1 == 0).all()
            except TypeError:
                is_integer_like = False # Handle potential non-numeric types if any issue
                
            # Also skip the price column here if it was float and meant to stay float (unlikely for price range)
            if is_integer_like and col not in ['latitude', 'longitude', 'stars']:
                # Convert to standard int (already filled NaNs)
                df_processed[col] = df_processed[col].astype(int)

    # Drop remaining object columns that were not processed (likely high cardinality text)
    final_object_cols = df_processed.select_dtypes(include=['object']).columns
    # Keep essential identifiers
    keep_objects = ['business_id', 'name', 'address'] 
    cols_to_drop = [col for col in final_object_cols if col not in keep_objects]
    if cols_to_drop:
        print(f"Dropping remaining unprocessed object columns: {cols_to_drop}")
        df_processed = df_processed.drop(columns=cols_to_drop, errors='ignore')

    print(f"DataFrame shape after processing attributes: {df_processed.shape}")
    return df_processed

In [265]:
# extract attributes and create recommendation features
combined_df = extract_and_encode_attributes(combined_df)
# combined_df = combined_df.drop(columns=['attributes']) # Drop is now handled inside the function

Extracted 377 attribute columns.
Identified price column as: RestaurantsPriceRange2
Identified price column as: RestaurantsPriceRange2
Processed price column 'RestaurantsPriceRange2' numerically.
Processed price column 'RestaurantsPriceRange2' numerically.
Dropping remaining unprocessed object columns: ['Interesting_wine_list', 'Valet_parking', 'Wheelchair_access', 'Garden_or_park', 'Restaurant_offering_vegetarian_menus', 'Car_park', 'Great_view', 'Counter_dining', 'Notable_sake_list', 'Shoes_must_be_removed', 'Cash_only', 'Brunch', 'Credit_cards_not_accepted', 'Bring_your_own_bottle', 'Cash_only__lunch', 'Foreign_credit_cards_not_accepted', 'Booking_essential', 'Booking_essential__dinner', 'ByAppointmentOnly', 'BusinessParking', 'Ambience', 'CoatCheck', 'DriveThru', 'BusinessAcceptsBitcoin', 'BYOB', 'GoodForMeal', 'Corkage', 'BYOBCorkage', 'Smoking', 'RestaurantsCounterService', 'BestNights_monday', 'BestNights_tuesday', 'BestNights_friday', 'BestNights_wednesday', 'BestNights_thursda

In [266]:
# check output based on column datatypes
print(combined_df.dtypes)
print(combined_df.shape)
print(combined_df.columns.to_list())

# Identify boolean columns (including those potentially created from attributes)
bool_cols = combined_df.select_dtypes(include=['bool']).columns
print(f"Boolean columns found: {list(bool_cols)}")
for col in bool_cols:
    combined_df[col] = combined_df[col].astype(int)

# Convert is_michelin specifically if it wasn't bool type already
if 'is_michelin' in combined_df.columns and combined_df['is_michelin'].dtype != 'int':
    # Ensure it's treated as boolean before converting to int
    combined_df['is_michelin'] = combined_df['is_michelin'].astype(bool).astype(int)

obj_bool_cols = [
    "Air conditioning",
    "Interesting wine list",
    "Valet parking",
    "Wheelchair access",
    "Garden or park",
    "Restaurant offering vegetarian menus",
    "Car park",
    "Great view",
    "Terrace",
    "Counter dining",
    "Notable sake list",
    "Shoes must be removed",
    "Cash only",
    "Brunch",
    "Credit cards not accepted",
    "Bring your own bottle",
    "Cash only - lunch",
    "Foreign credit cards not accepted"
]

# Clean column names in obj_bool_cols list to match flattened names
obj_bool_cols = [re.sub(r'[^a-zA-Z0-9_]', '', col.replace(' ', '_')) for col in obj_bool_cols]

# Filter list to columns that actually exist in the dataframe after flattening
obj_bool_cols = [col for col in obj_bool_cols if col in combined_df.columns]

# Check which of these are still object type (meaning initial mapping might have missed some cases)
obj_cols_needing_map = [col for col in obj_bool_cols if combined_df[col].dtype == 'object']

print(f"Object columns to check/map for boolean-like values: {obj_cols_needing_map}")

map_dict = {'True': 1, 'true': 1, '1': 1, True: 1,
            'False': 0, 'false': 0, '0': 0, False: 0,
            'None': 0, 'none': 0, None: 0, 'null': 0, '': 0,
            'yes': 1, 'no': 0}

for col in obj_cols_needing_map:
    # Apply mapping, coerce others to NaN
    combined_df[col] = combined_df[col].map(map_dict)
    # Convert to numeric, errors='coerce' handles original non-mappable values -> NaN
    combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

# fill missing values with 0 (redundant if done in function, but safe)
numeric_cols = combined_df.select_dtypes(include=np.number).columns
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(0)

# Ensure final integer type for all boolean-like columns
# Combine original bool cols, obj_bool_cols, and is_michelin
final_bool_cols = list(bool_cols) + obj_bool_cols + ['is_michelin']
# Ensure unique and existing columns
final_bool_cols = list(set([col for col in final_bool_cols if col in combined_df.columns]))

for col in final_bool_cols:
     # Check if column exists and is numeric but not integer
     if col in combined_df.columns and pd.api.types.is_numeric_dtype(combined_df[col]) and not pd.api.types.is_integer_dtype(combined_df[col]):
         # Check if conversion to int is safe (no NaNs, all are whole numbers)
         if combined_df[col].isnull().sum() == 0 and (combined_df[col] % 1 == 0).all():
             combined_df[col] = combined_df[col].astype(int)
         else:
             # Handle cases with NaNs or non-integers if necessary, e.g., fillna then convert
             print(f"Warning: Column {col} could not be safely converted to int. Contains NaNs or non-integers.")
             # Apply fillna(0) and convert, accepting potential data change
             combined_df[col] = combined_df[col].fillna(0).astype(int)

# check output based on column datatypes
print(combined_df.shape)
print("Checking dtypes of boolean-like columns after conversion:")
print(combined_df[final_bool_cols].head())
print(combined_df[final_bool_cols].dtypes)

business_id               object
name                      object
address                   object
longitude                float64
latitude                 float64
                          ...   
GoodForMeal_lunch          int64
GoodForMeal_dinner         int64
GoodForMeal_brunch         int64
GoodForMeal_breakfast      int64
NoiseLevel                 int64
Length: 328, dtype: object
(68782, 328)
['business_id', 'name', 'address', 'longitude', 'latitude', 'is_michelin', 'stars', 'Air_conditioning', 'Cuisine_Korean', 'Cuisine_Korean_Contemporary', 'RestaurantsPriceRange2', 'Cuisine_Creative_British', 'Cuisine_French', 'Cuisine_Modern_Cuisine', 'Cuisine_Creative', 'Cuisine_Classic_French', 'Cuisine_Modern_French', 'Cuisine_Modern_British', 'Terrace', 'Cuisine_Contemporary', 'Cuisine_Seafood', 'Cuisine_Vegan', 'Cuisine_Innovative', 'Cuisine_Japanese', 'Cuisine_Sushi', 'Cuisine_American', 'Cuisine_Noodles', 'Cuisine_Naengmyeon', 'Cuisine_Gomtang', 'Cuisine_Dwaejigukbap', 'Cuisine_Southe

In [267]:
# New cell for validation
# Check if all pictures link to valid restaurants
# get business_ids of all restaurants in combined_df
all_valid_business_ids = set(combined_df['business_id'])
invalid_pictures = picture_df[~picture_df['business_id'].isin(all_valid_business_ids)]

print("Validation Results:")
print(f"Total pictures: {len(picture_df)}")
print(f"Pictures with valid restaurant links: {len(picture_df) - len(invalid_pictures)}")
print(f"Pictures with invalid restaurant links: {len(invalid_pictures)}")

if len(invalid_pictures) > 0:
    print("\nSample of invalid picture entries:")
    print(invalid_pictures.head())
else:
    print("\nAll pictures are linked to valid restaurants!")

# Optional: Remove any invalid pictures if found
if len(invalid_pictures) > 0:
    picture_df = picture_df[picture_df['business_id'].isin(all_valid_business_ids)]
    print(f"\nCleaned dataset now contains {len(picture_df)} valid pictures")




Validation Results:
Total pictures: 81941
Pictures with valid restaurant links: 81941
Pictures with invalid restaurant links: 0

All pictures are linked to valid restaurants!


In [268]:
# parse photos folder and remove any photos that are not in the cleaned dataset
import os

# Create a set of valid photo IDs
valid_photo_ids = set(picture_df['photo_id'])

# Define the path to the photos folder
photos_folder = '../data/photos'

# Iterate over all files in the photos folder
if os.path.exists(photos_folder):
    for filename in os.listdir(photos_folder):
        # Extract the photo ID from the filename
        photo_id = filename.split('.')[0]
        
        # Check if the photo ID is not in the valid set
        if photo_id not in valid_photo_ids:
            # Construct the full path to the file
            file_path = os.path.join(photos_folder, filename)
            
            # Remove the file
            try:
                os.remove(file_path)
                print(f"Removed invalid photo: {filename}")
            except OSError as e:
                print(f"Error removing file {file_path}: {e}")
                
    # Check the number of photos in the folder
    print(f"Remaining photos in the folder: {len(os.listdir(photos_folder))}")
else:
    print(f"Photos folder not found at: {photos_folder}")

Photos folder not found at: ../data/photos


In [269]:
# save restaurants to json for db import and csv for model training
restaurant_df_model_training = combined_df.copy().drop(columns=['name', 'address', 'is_michelin'])
picture_df.to_csv('../data/cleaned_photos.csv', index=False)

# keep name, address, stars, latitude, longitude, is_michelin, business_id, RestaurantsPriceRange2
# Select important columns for database
columns_to_keep = ['name', 'address', 'stars', 'latitude', 'longitude', 'is_michelin', 'business_id']
print(combined_df["RestaurantsPriceRange2"].unique())
# Find the RestaurantsPriceRange2 column (it might be extracted from attributes)
# Ensure the price column name is cleaned if necessary
price_col_name = 'RestaurantsPriceRange2'
if price_col_name not in combined_df.columns:
    # Attempt to find a similarly named column if cleaning changed it
    potential_price_cols = [col for col in combined_df.columns if 'RestaurantsPriceRange' in col]
    if potential_price_cols:
        price_col_name = potential_price_cols[0]
    else:
        print("Warning: RestaurantsPriceRange2 column not found. Price will not be included in DB export.")
        price_col_name = None

if price_col_name and price_col_name in combined_df.columns:
    # convert price_col to string for JSON export if it's numeric
    if pd.api.types.is_numeric_dtype(combined_df[price_col_name]):
         combined_df[price_col_name] = combined_df[price_col_name].astype(str)
    columns_to_keep.append(price_col_name)
else:
    price_col_name = None # Ensure it's None if not added

# Create database version with selected columns
restaurant_df_database = combined_df[columns_to_keep].copy()
picture_df.to_json('../data/cleaned_photos.json', orient='records', lines=True)

print("Columns for model training:", restaurant_df_model_training.columns.to_list())
print("Columns for database:", restaurant_df_database.columns.to_list())
print(restaurant_df_model_training.head())
restaurant_df_database.to_json('../data/cleaned_restaurants.json', orient='records', lines=True)
restaurant_df_model_training.to_pickle('../data/cleaned_restaurants.pkl')

[4 1 3 2 0]
Columns for model training: ['business_id', 'longitude', 'latitude', 'stars', 'Air_conditioning', 'Cuisine_Korean', 'Cuisine_Korean_Contemporary', 'RestaurantsPriceRange2', 'Cuisine_Creative_British', 'Cuisine_French', 'Cuisine_Modern_Cuisine', 'Cuisine_Creative', 'Cuisine_Classic_French', 'Cuisine_Modern_French', 'Cuisine_Modern_British', 'Terrace', 'Cuisine_Contemporary', 'Cuisine_Seafood', 'Cuisine_Vegan', 'Cuisine_Innovative', 'Cuisine_Japanese', 'Cuisine_Sushi', 'Cuisine_American', 'Cuisine_Noodles', 'Cuisine_Naengmyeon', 'Cuisine_Gomtang', 'Cuisine_Dwaejigukbap', 'Cuisine_Southern_Thai', 'Cuisine_Asturian', 'Cuisine_Traditional_Cuisine', 'Cuisine_Italian_Contemporary', 'Cuisine_Alpine', 'Cuisine_Mediterranean_Cuisine', 'Cuisine_Seasonal_Cuisine', 'Cuisine_Country_cooking', 'Cuisine_Farm_to_table', 'Cuisine_French_Contemporary', 'Cuisine_Chinese', 'Cuisine_Taizhou', 'Cuisine_Chao_Zhou', 'Cuisine_Taiwanese_contemporary', 'Cuisine_Singaporean', 'Cuisine_Cantonese', 'Cuis

In [270]:
print(restaurant_df_model_training['RestaurantsPriceRange2'].unique())

[4 1 3 2 0]
