<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Preprocessing/preprocessing_colombo_attractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import re

In [9]:
colombo_attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/AttractionsColombo.xlsx', engine='openpyxl')

# Rename columns by replacing '/' with '_'
colombo_attractions.columns = colombo_attractions.columns.str.replace('/', '_', regex=False)

# Clean the 'offerGroup_lowestPrice' column
colombo_attractions['lowestPrice_LKR'] = colombo_attractions['offerGroup_lowestPrice'].astype(str).apply(lambda x: re.sub(r'[^\d.]', '', x) if pd.notna(x) else '')

# Convert to numeric type
colombo_attractions['lowestPrice_LKR'] = pd.to_numeric(colombo_attractions['lowestPrice_LKR'], errors='coerce')

# Fill missing values with the median
median_price = colombo_attractions['lowestPrice_LKR'].median()
colombo_attractions['lowestPrice_LKR'].fillna(median_price, inplace=True)

# Drop the original column
colombo_attractions.drop(columns=['offerGroup_lowestPrice'], inplace=True)

# Combine 'subcategories_0' to 'subcategories_9' into one column
subcategories_columns = [f'subcategories_{i}' for i in range(10)]
colombo_attractions['subcategories'] = colombo_attractions[subcategories_columns].apply(lambda row: ', '.join(row.dropna().astype(str)), axis=1)

# Drop rows where 'subcategories' column is empty
colombo_attractions = colombo_attractions[colombo_attractions['subcategories'].str.strip() != '']

# Drop the original 'subcategories_0' to 'subcategories_9' columns
colombo_attractions.drop(columns=subcategories_columns, inplace=True)

# List of columns to keep
columns_to_keep = [
    'address', 'category', 'latitude', 'longitude', 'name', 'numberOfReviews',
    'rankingDenominator', 'rankingPosition', 'rankingString', 'rating',
    'webUrl', 'lowestPrice_LKR', 'subcategories'
]

# Keep only the specified columns
colombo_attractions = colombo_attractions[columns_to_keep]

# Save the processed dataset
output_path = "/content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx"
colombo_attractions.to_excel(output_path, index=False)

print(f"Processed file saved at: {output_path}")

Processed file saved at: /content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  colombo_attractions['lowestPrice_LKR'].fillna(median_price, inplace=True)
