<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Preprocessing/preprocessing_colombo_hotels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Mount Google Drive at /content/drive; the Excel files read and written later
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import pandas as pd
import re

In [18]:
# Read the raw Colombo hotel spreadsheet from the mounted Drive folder.
colombo_hotels = pd.read_excel(
    '/content/drive/MyDrive/DSGP_ME/colomboHotels.xlsx',
    engine='openpyxl',
)

# Replace every '/' in the column headers with '_' (literal match, not a regex).
renamed_columns = colombo_hotels.columns.str.replace('/', '_', regex=False)
colombo_hotels.columns = renamed_columns

# Discard sparse columns: a column is kept only when its number of non-missing
# values reaches (1 - threshold) of the row count, so columns that are more
# than 80% missing are removed.
threshold = 0.8
min_non_missing = len(colombo_hotels) * (1 - threshold)
colombo_hotels = colombo_hotels.dropna(axis=1, thresh=min_non_missing)

# Every column whose name starts with 'amenities_' holds one amenity value.
amenity_columns = [col for col in colombo_hotels.columns if col.startswith('amenities_')]

# Merge the non-missing amenity values of each row into a single
# comma-separated string.
colombo_hotels['all_amenities'] = colombo_hotels[amenity_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)), axis=1
)

# Drop rows that have no amenities at all. The join above always produces a
# string (an empty one when every amenity cell is missing), so the test must be
# for an empty string; a notna() check would never remove any row.
has_amenities = colombo_hotels['all_amenities'].str.strip() != ''
colombo_hotels = colombo_hotels[has_amenities]

# Drop the original separate columns now that they are merged.
colombo_hotels = colombo_hotels.drop(columns=amenity_columns)

# Default label for each review-category name column that may contain gaps.
default_category_names = {
    'categoryReviewScores_3_categoryName': 'Cleanliness',
    'categoryReviewScores_4_categoryName': 'Service',
    'categoryReviewScores_5_categoryName': 'Sleep Quality',
}
for name_column, default_label in default_category_names.items():
    colombo_hotels[name_column] = colombo_hotels[name_column].fillna(default_label)

# Fill gaps in the score columns (slots 0 to 5) with a fixed placeholder of 2.5.
# The value is a constant, not a statistic computed from the data
# (presumably the midpoint of the rating scale; confirm).
# Score columns that are absent from the frame are skipped.
score_columns = [f'categoryReviewScores_{i}_score' for i in range(6)]
for column_name in score_columns:
    if column_name in colombo_hotels.columns:
        colombo_hotels[column_name] = colombo_hotels[column_name].fillna(2.5)

# Missing price levels get the fixed label '$$' (a constant, not derived from the data).
price_level = colombo_hotels['priceLevel']
colombo_hotels['priceLevel'] = price_level.fillna('$$')

# Parse a textual price range into the numeric midpoint of its two bounds.
def extract_price_range(price_range):
    """Return the midpoint of a textual price range, or None if it cannot be parsed.

    Everything except digits and '-' is stripped from the text (currency
    label, thousands separators, spaces), the remainder is split on '-', and
    the average of the two bounds is returned.

    Args:
        price_range: Raw cell value, e.g. 'LKR 10,000 - LKR 20,000'. Non-string
            values (such as NaN for a missing cell) are treated as invalid.

    Returns:
        float: (min + max) / 2 when exactly two numeric bounds are present.
        None: for non-strings, single prices, half-open ranges such as
            'LKR 10,000 - ', or any other shape.
    """
    if not isinstance(price_range, str):
        return None

    # Accept en/em dashes as the range separator too; otherwise the cleanup
    # below would delete them and fuse both bounds into one number.
    normalized = re.sub(r'[\u2013\u2014]', '-', price_range)

    # Keep only digits and dashes, then split into the candidate bounds.
    digits_and_dashes = re.sub(r'[^0-9-]', '', normalized)
    bounds = digits_and_dashes.split('-')

    # Require exactly two non-empty numeric parts. An empty part (from a
    # leading, trailing or lone dash) would make int() raise ValueError and
    # abort the whole column conversion.
    if len(bounds) != 2 or not all(part.isdigit() for part in bounds):
        return None

    min_price, max_price = map(int, bounds)
    return (min_price + max_price) / 2  # average of the range


# Convert each textual price range into its numeric midpoint; rows that cannot
# be parsed come back as missing.
parsed_prices = colombo_hotels['priceRange'].apply(extract_price_range)
colombo_hotels['priceRange_LKR'] = parsed_prices

# Impute the missing midpoints with the mean of the ones that were parsed.
average_price = colombo_hotels['priceRange_LKR'].mean()
colombo_hotels['priceRange_LKR'] = colombo_hotels['priceRange_LKR'].fillna(average_price)

# The raw text column is no longer needed once 'priceRange_LKR' exists.
colombo_hotels = colombo_hotels.drop(columns=['priceRange'])

# Columns that make up the processed dataset, in output order.
columns_to_keep = [
    # Basic descriptors
    'address',
    'all_amenities',
    'category',
    # Review category labels
    'categoryReviewScores_0_categoryName',
    'categoryReviewScores_1_categoryName',
    'categoryReviewScores_2_categoryName',
    'categoryReviewScores_3_categoryName',
    'categoryReviewScores_4_categoryName',
    'categoryReviewScores_5_categoryName',
    # Review category scores
    'categoryReviewScores_0_score',
    'categoryReviewScores_1_score',
    'categoryReviewScores_2_score',
    'categoryReviewScores_3_score',
    'categoryReviewScores_4_score',
    'categoryReviewScores_5_score',
    # Class, location and pricing
    'hotelClass',
    'latitude',
    'longitude',
    'priceLevel',
    'priceRange_LKR',
    # Ranking, rating and links
    'rankingDenominator',
    'name',
    'rankingPosition',
    'rankingString',
    'rating',
    'webUrl',
    'website',
]

# Keep only the listed columns; a KeyError is raised if any of them is absent
# from the frame.
colombo_hotels = colombo_hotels.loc[:, columns_to_keep]

# Write the processed frame to Drive as an Excel workbook, without the index column.
output_path = "/content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx"
colombo_hotels.to_excel(excel_writer=output_path, index=False)

# Confirm where the file was written.
print(f"Processed file saved at: {output_path}")


Processed file saved at: /content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx
