<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Preprocessing/KandyRestaurantsPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the Excel file
file_path = '/content/drive/My Drive/Kandy/KandyRestaurants.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the data
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,address,addressObj/city,addressObj/country,addressObj/postalcode,addressObj/state,addressObj/street1,addressObj/street2,ancestorLocations/0/abbreviation,ancestorLocations/0/id,ancestorLocations/0/name,...,reviewTags/25/text,reviewTags/26/reviews,reviewTags/26/text,reviewTags/27/reviews,reviewTags/27/text,subcategories/0,travelerChoiceAward,type,webUrl,website
0,"12 Mahamaya Mawatha, Kandy 20000 Sri Lanka",Kandy,Sri Lanka,20000.0,,12 Mahamaya Mawatha,,,304138,Kandy,...,,,,,,Sit down,,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,https://www.grandserendibhotel.com/index.php
1,"839 Peradeniya Road Kandyan Arts Residency, Ka...",Kandy,Sri Lanka,20000.0,,839 Peradeniya Road,Kandyan Arts Residency,,304138,Kandy,...,,,,,,Sit down,,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,http://hotelkandyanarts.com/
2,"9 Sangaraja Mawatha, Kandy 8 Sri Lanka",Kandy,Sri Lanka,8.0,,9 Sangaraja Mawatha,,,304138,Kandy,...,,,,,,Sit down,,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,
3,"155 S W R D Bandaranayake Road, Kandy 20000 Sr...",Kandy,Sri Lanka,20000.0,,155 S W R D Bandaranayake Road,,,304138,Kandy,...,,,,,,Sit down,,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,http://www.kandycitystay.com
4,"No. 1C Devi Road Galkaduwa Junction, Kandy 200...",Kandy,Sri Lanka,20000.0,,No. 1C Devi Road,Galkaduwa Junction,,304138,Kandy,...,,,,,,Sit down,,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,https://www.facebook.com/profile.php?id=615576...


In [13]:
# Renaming Columns for Consistency: Replace '/' with '_'
# This step standardizes column names by replacing '/' with '_' to avoid issues in processing
df.columns = [col.strip().lower().replace(' ', '_').replace('/', '_') for col in df.columns]

# Append 'addressobj_postalcode' to 'address', separated by a comma, if the column exists
# This step checks if 'addressobj_postalcode' is in the dataframe
if 'addressobj_postalcode' in df.columns:
    # Combine 'address' and 'addressobj_postalcode', separating them with a comma if both are non-empty
    df['address'] = df.apply(
        lambda row: f"{row['address']}, {row['addressobj_postalcode']}" if pd.notna(row['addressobj_postalcode']) else row['address'], axis=1)
    # Drop the 'addressobj_postalcode' column after appending its data
    df = df.drop(columns=['addressobj_postalcode'])

# Combine 'addressobj_street2' into 'addressobj_street1' and rename it to 'addressobj_street'
# This step checks if 'addressobj_street2' is in the dataframe
if 'addressobj_street2' in df.columns and 'addressobj_street1' in df.columns:
    # Combine 'addressobj_street1' and 'addressobj_street2', separated by a comma if both are non-empty
    df['addressobj_street1'] = df.apply(
        lambda row: f"{row['addressobj_street1']}, {row['addressobj_street2']}" if pd.notna(row['addressobj_street2']) else row['addressobj_street1'], axis=1)
    # Rename 'addressobj_street1' to 'addressobj_street'
    df.rename(columns={'addressobj_street1': 'addressobj_street'}, inplace=True)
    # Drop the 'addressobj_street2' column after appending its data
    df = df.drop(columns=['addressobj_street2'])

# Drop unnecessary columns
drop_columns = [col for col in df.columns if col.startswith(('photos_', 'hours_', 'orderonline_', 'ownerstopreasons', 'price_', 'ancestorlocations_', 'nearestmetrostations_'))]
df = df.drop(columns=drop_columns, errors='ignore')

# These columns are no longer needed in the dataset
columns_to_drop = ['hours', 'input', 'isclaimedicon', 'isclaimedtext', 'isclosed', 'islongclosed', 'image', 'isnearbyresult', 'addressobj_state',
                   'isnearbyresult', 'localaddress', 'localname', 'opennowtext', 'photocount', 'pricerange', 'locallangcode'
]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Function to combine multiple columns into one
def combine_columns(row, column_list):
    # Check if the column_list exists and has valid columns
    valid_columns = [col for col in column_list if col in row.index]

    if valid_columns:
        combined_value = ', '.join(row[valid_columns].dropna().astype(str)).strip()
        if combined_value:
            return combined_value

    # If no valid data, construct a proper fallback message
    available_links = list(filter(pd.notna, [row.get('weburl', ''), row.get('website', '')]))

    return f"Please visit the following links for more details: {', '.join(available_links)}"

# Identify and merge grouped columns
categories = ['cuisines', 'dietaryrestrictions', 'dishes', 'features', 'mealtypes', 'establishmenttypes']
for category in categories:
    category_columns = [col for col in df.columns if col.startswith(f'{category}_')]

    if category_columns:
        df.loc[:, category] = df.apply(lambda row: combine_columns(row, category_columns), axis=1)
        df = df.drop(columns=category_columns, errors='ignore')  # `errors='ignore'` ensures safe column dropping

# Define the columns to extract and keep
openhours_columns = [col for col in df.columns if "openhours" in col]
closehours_columns = [col for col in df.columns if "closehours" in col]

# Ensure there are columns to extract
if openhours_columns and closehours_columns:
    # Extract one value from openhours and closehours fields
    # Take the first available non-NaN value from openhours and closehours
    df['open_hour'] = df[openhours_columns].bfill(axis=1).iloc[:, 0]
    df['close_hour'] = df[closehours_columns].bfill(axis=1).iloc[:, 0]

    # Drop all original openhours and closehours fields
    df = df.drop(columns=openhours_columns + closehours_columns)

# Replace missing values with the appropriate message
def replace_missing_values(row, col):
    if pd.isna(row[col]):
        return f"Please visit the following links for more details: {', '.join(filter(pd.notna, [row.get('weburl', ''), row.get('website', '')]))}"
    else:
        return row[col]

# Apply default message logic for missing values
default_message_columns = ['description', 'email', 'menuweburl', 'phone', 'pricelevel', 'subcategories_0', 'website', 'open_hour', 'close_hour', 'addressobj_street']
for col in default_message_columns:
    if col in df.columns:
        df[col] = df.apply(lambda row: replace_missing_values(row, col), axis=1)

# Combine review and text columns for each review tag into a single column
# Iterate through pairs of 'reviewtags_*_reviews' and 'reviewtags_*_text'
review_columns = [col for col in df.columns if col.startswith('reviewtags_') and ('_reviews' in col or '_text' in col)]
review_pairs = {}

# Group review and text columns into pairs (e.g., 'reviewtags_0_reviews', 'reviewtags_0_text')
for col in review_columns:
    key = col.split('_')[1]  # Extract the common index for pairing
    review_pairs.setdefault(key, []).append(col)

# Ensure each pair contains both review and text columns
review_pairs_corrected = {key: pair for key, pair in review_pairs.items() if len(pair) == 2}

# Function to merge reviews and texts
def combine_reviews_text(row, pairs):
    combined_reviews = []
    for review_col, text_col in pairs.values():
        if pd.notna(row[review_col]) and pd.notna(row[text_col]):
            combined_reviews.append(f"{row[review_col]}: {row[text_col]}")
    return ', '.join(combined_reviews) if combined_reviews else None

# Apply function to create 'reviews_and_text' column
df['reviews_and_text'] = df.apply(lambda row: combine_reviews_text(row, review_pairs_corrected), axis=1)

# Replace missing values in 'reviews_and_text'
df['reviews_and_text'] = df.apply(
    lambda row: (
        f"Please visit the following links for more details: {', '.join(filter(pd.notna, [row.get('weburl', ''), row.get('website', '')]))}"
        if pd.isna(row['reviews_and_text']) else row['reviews_and_text']
    ),
    axis=1
)

# Drop original review and text columns to clean the dataset
df = df.drop(columns=review_columns)

columns_to_replace = [
    'latitude', 'longitude', 'travelerchoiceaward', 'rankingdenominator',
    'rankingposition', 'rankingstring', 'rating', 'rawranking', 'reviews_and_text'
]

# Replace missing values: text columns with 'not provided', numeric score columns with 0
for col in columns_to_replace:
    if col in df.columns:
      df[col] = df[col].fillna('not provided')

# Print final columns to verify
print(df.columns)
data = df.head()
display(data)

# Save the modified data to a new Excel file
# The preprocessed file contains standardized column names, a combined 'amenities' column,
# an updated 'address' column, and an updated 'addressobj_street' column
output_file = '/content/drive/My Drive/Kandy/PreprocessedRestaurantsKandy.csv'
df.to_csv(output_file, index=False)

print(f"Preprocessing complete. Cleaned data saved to '{output_file}'")

Index(['address', 'addressobj_city', 'addressobj_country', 'addressobj_street',
       'category', 'description', 'email', 'id', 'latitude', 'locationstring',
       'longitude', 'menuweburl', 'name', 'numberofreviews', 'phone',
       'pricelevel', 'rankingdenominator', 'rankingposition', 'rankingstring',
       'rating', 'ratinghistogram_count1', 'ratinghistogram_count2',
       'ratinghistogram_count3', 'ratinghistogram_count4',
       'ratinghistogram_count5', 'rawranking', 'subcategories_0',
       'travelerchoiceaward', 'type', 'weburl', 'website', 'cuisines',
       'dietaryrestrictions', 'dishes', 'features', 'mealtypes',
       'establishmenttypes', 'reviews_and_text'],
      dtype='object')


Unnamed: 0,address,addressobj_city,addressobj_country,addressobj_street,category,description,email,id,latitude,locationstring,...,type,weburl,website,cuisines,dietaryrestrictions,dishes,features,mealtypes,establishmenttypes,reviews_and_text
0,"12 Mahamaya Mawatha, Kandy 20000 Sri Lanka, 20...",Kandy,Sri Lanka,12 Mahamaya Mawatha,restaurant,Grand Sky Lounge is Located on the rooftop of ...,info@grandserendibhotel.com,27740176,7.285323,"Kandy, Kandy District, Central Province",...,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,https://www.grandserendibhotel.com/index.php,"Bar, International, Pub, Dining bars",Please visit the following links for more deta...,Please visit the following links for more deta...,"Reservations, Outdoor Seating, Seating, Parkin...","Breakfast, Lunch, Dinner, Brunch, Drinks",Restaurants,Please visit the following links for more deta...
1,"839 Peradeniya Road Kandyan Arts Residency, Ka...",Kandy,Sri Lanka,"839 Peradeniya Road, Kandyan Arts Residency",restaurant,"Enjoy casual, contemporary cuisine all day din...",info@hotelkandyanarts.com,3438350,7.276945,"Kandy, Kandy District, Central Province",...,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,http://hotelkandyanarts.com/,"Italian, International, Asian, Indonesian, Mon...","Vegetarian friendly, Vegan options","Curry, Fish, Soup, Chicken Curry, Biryani","Reservations, Outdoor Seating, Buffet, Seating...","Lunch, Dinner, Drinks",Restaurants,Please visit the following links for more deta...
2,"9 Sangaraja Mawatha, Kandy 8 Sri Lanka, 8.0",Kandy,Sri Lanka,9 Sangaraja Mawatha,restaurant,Please visit the following links for more deta...,tkgcbistro@gmail.com,2057786,7.286927,"Kandy, Kandy District, Central Province",...,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,Please visit the following links for more deta...,"Asian, Sri Lankan","Vegetarian friendly, Vegan options","Juice & Smoothies, Noodle, Salad, Curry, Fried...","Takeout, Seating, Table Service","Breakfast, Lunch, Dinner, Late Night",Restaurants,Please visit the following links for more deta...
3,"155 S W R D Bandaranayake Road, Kandy 20000 Sr...",Kandy,Sri Lanka,155 S W R D Bandaranayake Road,restaurant,This restaurant is for the people who love to ...,info@kandycitystay.com,15150352,7.286809,"Kandy, Kandy District, Central Province",...,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,http://www.kandycitystay.com,Cafe,"Vegetarian friendly, Vegan options","Juice & Smoothies, Burger, Fried","Reservations, Outdoor Seating, Seating, Parkin...","Breakfast, Lunch, Dinner, Late Night, Drinks",Restaurants,"12.0: foods, 16.0: view"
4,"No. 1C Devi Road Galkaduwa Junction, Kandy 200...",Kandy,Sri Lanka,"No. 1C Devi Road, Galkaduwa Junction",restaurant,The Chef Corner Restaurant & BYOB located at a...,chefcorner.viplounge@gmail.com,27456453,7.2955,"Kandy, Kandy District, Central Province",...,RESTAURANT,https://www.tripadvisor.com/Restaurant_Review-...,https://www.facebook.com/profile.php?id=615576...,"Italian, Chinese, Asian, Sri Lankan",Please visit the following links for more deta...,Please visit the following links for more deta...,"Delivery, Takeout, Seating, Parking Available,...","Breakfast, Lunch, Dinner, Brunch, Drinks",Restaurants,Please visit the following links for more deta...


Preprocessing complete. Cleaned data saved to '/content/drive/My Drive/Kandy/PreprocessedRestaurantsKandy.csv'
