<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Preprocessing/preprocessing_colombo_restaurants.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Colab-only helper: `google.colab` is imported here, so this cell is expected
# to be run inside a Google Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive. Per the captured output of this cell,
# re-running it while the drive is already mounted is a no-op unless
# force_remount=True is passed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import pandas as pd
import re

In [11]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
input_path = '/content/drive/MyDrive/DSGP_ME/colomboRestaurants.xlsx'
output_path = "/content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx"

# Numbered source columns (named after the '/' -> '_' rename) that are merged
# into a single comma-separated column each.
cuisine_columns = [f'cuisines_{i}' for i in range(7)]              # cuisines_0 .. cuisines_6
dietary_columns = [f'dietaryRestrictions_{i}' for i in range(5)]   # dietaryRestrictions_0 .. _4
features_columns = [f'features_{i}' for i in range(24)]            # features_0 .. features_23
meal_columns = [f'mealTypes_{i}' for i in range(6)]                # mealTypes_0 .. mealTypes_5

# Columns written to the processed file, in output order.
columns_to_keep = [
    'address', 'cuisines', 'category', 'dietaryRestrictions', 'features',
    'latitude', 'longitude', 'mealTypes', 'name', 'numberOfReviews',
    'priceLevel', 'rankingDenominator', 'rankingPosition', 'rankingString',
    'rating', 'rawRanking', 'webUrl', 'website'
]


def combine_columns(frame, source_columns, target, empty_value=None):
    """Merge several numbered columns into one comma-separated text column.

    For every row the non-null values of ``source_columns`` are converted to
    ``str`` and joined with ``', '``. Rows whose combined text is blank are
    dropped, and the source columns are removed from the result.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.
    source_columns : list of str
        Columns to merge, in the order their values should appear.
    target : str
        Name of the combined column in the returned frame.
    empty_value : str, optional
        When given, a combined value that is exactly ``''`` is replaced by
        this placeholder (so the row is kept) before blank rows are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``source_columns`` and with ``target`` holding
        the combined text.
    """
    joined = [
        ', '.join(row.dropna().astype(str))
        for _, row in frame[source_columns].iterrows()
    ]
    # Built with an explicit object dtype so the `.str` filter below also works
    # when `frame` has no rows left (a row-wise `apply` on an empty frame does
    # not produce a string column).
    combined = pd.Series(joined, index=frame.index, dtype=object)

    if empty_value is not None:
        combined = combined.replace('', empty_value)

    # Whitespace-only text counts as blank, matching the original filter.
    keep = combined.str.strip() != ''

    result = frame.loc[keep].drop(columns=source_columns)
    # Assign positionally: `result` and `combined[keep]` hold the same rows in
    # the same order, so no index alignment is needed.
    result[target] = combined[keep].to_numpy()
    return result


def preprocess_restaurants(raw):
    """Clean the raw restaurant export and return the processed frame.

    Steps, in order:

    1. Replace ``'/'`` with ``'_'`` in the column names.
    2. Merge the numbered cuisines, dietaryRestrictions, features and
       mealTypes columns into one column each. Rows without any cuisine,
       feature or meal type are dropped; rows without dietary restrictions
       are kept and marked ``'unknown'``.
    3. Fill missing ``priceLevel`` values with ``'$$'``.
    4. Keep only ``columns_to_keep``, in that order.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as read from the source workbook; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Raises
    ------
    KeyError
        If an expected column is missing after the rename.
    """
    # (source columns, combined column name, placeholder for "nothing listed")
    merge_plan = [
        (cuisine_columns, 'cuisines', None),
        (dietary_columns, 'dietaryRestrictions', 'unknown'),
        (features_columns, 'features', None),
        (meal_columns, 'mealTypes', None),
    ]

    frame = raw.copy()
    frame.columns = frame.columns.str.replace('/', '_', regex=False)

    # Fail early with one message naming every missing column instead of a
    # KeyError part-way through the pipeline.
    combined_names = {target for _, target, _ in merge_plan}
    required = {column for columns, _, _ in merge_plan for column in columns}
    required |= set(columns_to_keep) - combined_names
    missing = sorted(required - set(frame.columns))
    if missing:
        raise KeyError(f"Input is missing expected columns: {missing}")

    for source_columns, target, empty_value in merge_plan:
        frame = combine_columns(frame, source_columns, target, empty_value=empty_value)

    # '$$' is the default picked for restaurants without a price level
    # (described by the original author as the middle value).
    frame['priceLevel'] = frame['priceLevel'].fillna('$$')

    return frame[columns_to_keep]


# `__name__` is '__main__' inside a notebook kernel, so this runs as usual when
# the cell is executed; the guard only keeps the file I/O from firing if the
# helpers above are imported from an exported module.
if __name__ == "__main__":
    colombo_restaurants_raw = pd.read_excel(input_path, engine='openpyxl')
    colombo_restaurants = preprocess_restaurants(colombo_restaurants_raw)

    colombo_restaurants.to_excel(output_path, index=False)

    print(f"Processed file saved at: {output_path}")

Processed file saved at: /content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx
