In [None]:
import pandas as pd

# Read the hotel export into a DataFrame
file_path = '/content/colomboHotels.xlsx'
df = pd.read_excel(file_path)

# Preview the raw data before any transformation
print("First few rows of the dataset:")
print(df.head())


def _clean_header(header):
    """Trim and lowercase a column name, turning spaces and '/' into '_'."""
    trimmed = header.strip().lower()
    return trimmed.replace(' ', '_').replace('/', '_')


# Normalise every header so later steps can match on predictable names
# (e.g. 'addressObj/city' becomes 'addressobj_city')
df.columns = [_clean_header(header) for header in df.columns]

# Collect every column whose (normalised) name begins with 'amenities_'
amenities_columns = [name for name in df.columns if name.startswith('amenities_')]


def _join_present(cells):
    """Return the non-NaN cells of one row as a single ', '-separated string."""
    return ', '.join(cells.dropna().astype(str))


# One combined 'amenities' string per row; rows without any value get ''
df['amenities'] = df[amenities_columns].apply(_join_present, axis=1)

# The individual 'amenities_*' columns are redundant once merged
df = df.drop(columns=amenities_columns)

def _join_address_parts(*parts):
    """Join the non-missing address parts with ', '.

    NaN/None parts and blank strings are skipped, so a missing leading part
    no longer yields text such as 'nan, 00800'. Whole-number floats (for
    example a postal code that Excel delivered as 700.0) are rendered
    without the trailing '.0'. Returns NaN when every part is missing.
    """
    present = []
    for part in parts:
        if pd.isna(part):
            continue
        if isinstance(part, float) and part.is_integer():
            part = int(part)
        text = str(part)
        if text.strip():
            present.append(text)
    return ', '.join(present) if present else float('nan')


# Append the postal code to 'address' (comma-separated) when the column exists
if 'addressobj_postalcode' in df.columns:
    df['address'] = [
        _join_address_parts(address, postal_code)
        for address, postal_code in zip(df['address'], df['addressobj_postalcode'])
    ]
    # The postal code now lives inside 'address'
    df = df.drop(columns=['addressobj_postalcode'])

# Fold 'addressobj_street2' into 'addressobj_street1' and expose the result
# as 'addressobj_street'; only done when both source columns exist
if 'addressobj_street2' in df.columns and 'addressobj_street1' in df.columns:
    df['addressobj_street1'] = [
        _join_address_parts(street1, street2)
        for street1, street2 in zip(df['addressobj_street1'], df['addressobj_street2'])
    ]
    df = df.rename(columns={'addressobj_street1': 'addressobj_street'})
    df = df.drop(columns=['addressobj_street2'])

# Columns removed from the hotel dataset: the ancestor-location fields for
# levels 0-3, a handful of bookkeeping fields, the room-tip '_user' fields
# for tips 0-4, and the WhatsApp redirect URL
columns_to_drop = [
    f'ancestorlocations_{level}_{field}'
    for level in range(4)
    for field in ('id', 'name', 'subcategory')
]
columns_to_drop += ['checkindate', 'checkoutdate', 'input', 'isnearbyresult', 'photocount']
columns_to_drop += [f'roomtips_{tip}_user' for tip in range(5)]
columns_to_drop.append('whatsappredirecturl')

# errors='ignore' skips any listed column that this export does not contain
df = df.drop(columns=columns_to_drop, errors='ignore')


def _fallback_message(row):
    """Build the 'visit these links' placeholder for one row.

    Only non-missing, non-blank values of 'weburl' and 'website' are listed;
    a link column that does not exist contributes nothing (previously it
    contributed an empty entry and a stray ', ').
    """
    candidates = (row.get('weburl'), row.get('website'))
    links = [str(link) for link in candidates if pd.notna(link) and str(link).strip()]
    return "Please visit the following links for more details: " + ', '.join(links)


# Build the placeholder once, from the untouched link columns, so that
# filling 'website' below cannot leak a placeholder into another message
fallback_messages = df.apply(_fallback_message, axis=1)

# Replace missing (NaN) values in these columns with the placeholder text;
# columns absent from the export are skipped
for column in ('description', 'email', 'phone', 'pricelevel', 'pricerange', 'website'):
    if column in df.columns:
        df[column] = df[column].where(df[column].notna(), fallback_messages)

# Locate the review-tag columns: names starting with 'reviewtags_' that
# contain '_reviews' or '_text'
review_columns = [
    name for name in df.columns
    if name.startswith('reviewtags_') and ('_reviews' in name or '_text' in name)
]

# Bucket the columns by the second '_'-separated token of their name (the
# tag index), keeping the order in which they appear in the frame
review_pairs = {}
for name in review_columns:
    tag_index = name.split('_')[1]
    if tag_index not in review_pairs:
        review_pairs[tag_index] = []
    review_pairs[tag_index].append(name)


def _format_review_tags(row):
    """Render 'first: second' for every two-column bucket whose values are both present."""
    fragments = []
    for pair in review_pairs.values():
        if len(pair) != 2:
            continue
        first_value = row[pair[0]]
        second_value = row[pair[1]]
        if pd.notna(first_value) and pd.notna(second_value):
            fragments.append(f"{first_value}: {second_value}")
    return ', '.join(fragments)


# One combined string per row; rows with no complete pair get ''
df['reviews_and_text'] = df.apply(_format_review_tags, axis=1)

# The source columns are no longer needed
df = df.drop(columns=review_columns)

# Merge all remaining 'roomtips_*' columns into a single 'roomtips' column
roomtips_columns = [name for name in df.columns if name.startswith('roomtips_')]

if roomtips_columns:
    # Group the columns by the second '_'-separated token (the tip index)
    roomtips_groups = {}
    for name in roomtips_columns:
        roomtips_groups.setdefault(name.split('_')[1], []).append(name)

    # Fixed output order: groups in first-seen order, columns sorted by
    # name inside each group
    ordered_columns = [
        name for group in roomtips_groups.values() for name in sorted(group)
    ]

    def _join_roomtips(row):
        """Join the row's non-NaN room-tip values with ', '.

        Values are joined in one flat pass so an empty group contributes
        nothing; joining per group first left a dangling ', ' for every
        empty group (a row without tips came out as ', , , , ').
        """
        return ', '.join(
            str(row[name]) for name in ordered_columns if pd.notna(row[name])
        )

    df['roomtips'] = df.apply(_join_roomtips, axis=1)

    # The source columns are redundant once merged
    df = df.drop(columns=roomtips_columns)
else:
    # No room-tip columns in this export: provide an empty 'roomtips' column
    df['roomtips'] = ''

# Columns whose missing values are replaced: '*score' columns get 0,
# every other listed column gets the text 'not provided'
columns_to_replace = [
    'categoryreviewscores_0_categoryname', 'categoryreviewscores_0_score',
    'categoryreviewscores_1_categoryname', 'categoryreviewscores_1_score',
    'categoryreviewscores_2_categoryname', 'categoryreviewscores_2_score',
    'categoryreviewscores_3_categoryname', 'categoryreviewscores_3_score',
    'categoryreviewscores_4_categoryname', 'categoryreviewscores_4_score',
    'categoryreviewscores_5_categoryname', 'categoryreviewscores_5_score',
    'hotelclassattribution', 'latitude', 'longitude', 'numberofrooms', 'travelerchoiceaward'
]

for col in columns_to_replace:
    if col in df.columns:
        fill_value = 0 if col.endswith('score') else 'not provided'
        df[col] = df[col].fillna(fill_value)

# The merged text columns hold '' (not NaN) when a row had nothing to merge
if 'amenities' in df.columns:
    df['amenities'] = df['amenities'].replace('', 'not provided')

if 'reviews_and_text' in df.columns:
    df['reviews_and_text'] = df['reviews_and_text'].replace('', 'not provided')

if 'roomtips' in df.columns:
    # An empty roomtips value is either '' or only separators such as
    # ', , , , '; the previous literal ',,,,' matched neither, so match any
    # value made up solely of commas and whitespace
    df['roomtips'] = df['roomtips'].replace(r'^[\s,]*$', 'not provided', regex=True)

# Write the cleaned hotel data to a new workbook, leaving out the index
output_file = '/content/preprocessed_hotel_data.xlsx'
df.to_excel(excel_writer=output_file, index=False)

# Confirm where the result was written
print("Preprocessing complete. Cleaned data saved to '{}'".format(output_file))



First few rows of the dataset:
                                             address addressObj/city  \
0  14 Borella Cross Road off Ward Place, Colombo ...         Colombo   
1  538 Galle Road Colombo 03, Colombo 00300 Sri L...         Colombo   
2        No 7/1 Elias Place, Colombo 01000 Sri Lanka         Colombo   
3  29 Milagiriya Avenue Marine Drive, Colombo 004...         Colombo   
4        28, Borella Cross Road, Colombo 8 Sri Lanka         Colombo   

  addressObj/country addressObj/postalcode      addressObj/street1  \
0          Sri Lanka                 00800   14 Borella Cross Road   
1          Sri Lanka                 00300          538 Galle Road   
2          Sri Lanka                 01000      No 7/1 Elias Place   
3          Sri Lanka                 00400    29 Milagiriya Avenue   
4          Sri Lanka                     8  28, Borella Cross Road   

  addressObj/street2 amenities/0    amenities/1    amenities/2  \
0     off Ward Place    Internet  Free Internet  

In [None]:
import pandas as pd

# Read the vacation-rental export into a DataFrame
file_path = '/content/vacationRentalsColombo.xlsx'
df = pd.read_excel(file_path)

# Preview the raw data before any transformation
print("First few rows of the dataset:")
print(df.head())


def _clean_header(header):
    """Trim and lowercase a column name, turning spaces and '/' into '_'."""
    trimmed = header.strip().lower()
    return trimmed.replace(' ', '_').replace('/', '_')


# Normalise every header so later steps can match on predictable names
# (e.g. 'amenities/0' becomes 'amenities_0')
df.columns = [_clean_header(header) for header in df.columns]

# Collect every column whose (normalised) name begins with 'amenities_'
amenities_columns = [name for name in df.columns if name.startswith('amenities_')]


def _join_present(cells):
    """Return the non-NaN cells of one row as a single ', '-separated string."""
    return ', '.join(cells.dropna().astype(str))


# One combined 'amenities' string per row; rows without any value get ''
df['amenities'] = df[amenities_columns].apply(_join_present, axis=1)

# The individual 'amenities_*' columns are redundant once merged
df = df.drop(columns=amenities_columns)

# Columns removed from the vacation-rental dataset
columns_to_drop = [
   'rentaldescriptions_0_machinetranslated', 'input', 'photocount'
]
# errors='ignore' skips any listed column that this export does not contain
df = df.drop(columns=columns_to_drop, errors='ignore')

# Columns whose missing values are replaced
columns_to_replace = [
    'basedailyrate_amount', 'rating', 'rentaldescriptions_0_type'
]

# 'rating' gets 0; every other listed column gets the text 'not provided'
for col in columns_to_replace:
    if col not in df.columns:
        continue
    placeholder = 0 if col == 'rating' else 'not provided'
    df[col] = df[col].fillna(placeholder)


# When the rental description is NaN or '', substitute a message that
# points at the listing's 'weburl' (omitted from the message if it is NaN)
if 'rentaldescriptions_0_text' in df.columns:

    def _description_or_link(row):
        """Return the existing description, or the link placeholder if it is empty."""
        current = row['rentaldescriptions_0_text']
        if pd.notna(current) and current != '':
            return current
        link = row.get('weburl', '')
        shown = ', '.join([link] if pd.notna(link) else [])
        return f"Please visit the following link for more details: {shown}"

    df['rentaldescriptions_0_text'] = df.apply(_description_or_link, axis=1)

# Write the cleaned vacation-rental data to a new workbook, leaving out the index
output_file = '/content/preprocessed_vacation_rental.xlsx'
df.to_excel(excel_writer=output_file, index=False)

# Confirm where the result was written
print("Preprocessing complete. Cleaned data saved to '{}'".format(output_file))



First few rows of the dataset:
         amenities/0        amenities/1            amenities/2  \
0   Kid friendly: NO   Elder access: NO  Wheelchair access: NO   
1  Kid friendly: YES   Elder access: NO  Wheelchair access: NO   
2   Kid friendly: NO   Elder access: NO  Wheelchair access: NO   
3  Kid friendly: YES   Elder access: NO  Wheelchair access: NO   
4   Kid friendly: NO  Elder access: YES  Wheelchair access: NO   

         amenities/3          amenities/4     amenities/5  amenities/6  \
0   Pet friendly: NO  Smoking allowed: NO  2 full bath(s)   Fits 8 pax   
1  Pet friendly: YES  Smoking allowed: NO  2 full bath(s)   Fits 4 pax   
2   Pet friendly: NO  Smoking allowed: NO  1 full bath(s)   Fits 2 pax   
3   Pet friendly: NO  Smoking allowed: NO  5 full bath(s)  Fits 14 pax   
4   Pet friendly: NO  Smoking allowed: NO  1 full bath(s)   Fits 2 pax   

    amenities/7    amenities/8  baseDailyRate/amount  ...  \
0  2 bedroom(s)  2 bathroom(s)                 145.0  ...   
1  2 

In [None]:
import pandas as pd

# Read the restaurant export into a DataFrame
file_path = '/content/colomboRestaurants.xlsx'
df = pd.read_excel(file_path)

# Preview the raw data before any transformation
print("First few rows of the dataset:")
print(df.head())


def _clean_header(header):
    """Trim and lowercase a column name, turning spaces and '/' into '_'."""
    trimmed = header.strip().lower()
    return trimmed.replace(' ', '_').replace('/', '_')


# Normalise every header so later steps can match on predictable names
# (e.g. 'addressObj/city' becomes 'addressobj_city')
df.columns = [_clean_header(header) for header in df.columns]

def _join_address_parts(*parts):
    """Join the non-missing address parts with ', '.

    NaN/None parts and blank strings are skipped, so a missing leading part
    no longer yields text such as 'nan, 300.0'. Whole-number floats (the
    postal code column is read from this workbook as e.g. 700.0) are
    rendered without the trailing '.0'. Returns NaN when every part is
    missing.
    """
    present = []
    for part in parts:
        if pd.isna(part):
            continue
        if isinstance(part, float) and part.is_integer():
            part = int(part)
        text = str(part)
        if text.strip():
            present.append(text)
    return ', '.join(present) if present else float('nan')


# Append the postal code to 'address' (comma-separated) when the column exists
# NOTE(review): any leading zeros of the postal code were already lost when
# the column was read as a number - confirm whether they need restoring
if 'addressobj_postalcode' in df.columns:
    df['address'] = [
        _join_address_parts(address, postal_code)
        for address, postal_code in zip(df['address'], df['addressobj_postalcode'])
    ]
    # The postal code now lives inside 'address'
    df = df.drop(columns=['addressobj_postalcode'])

# Fold 'addressobj_street2' into 'addressobj_street1' and expose the result
# as 'addressobj_street'; only done when both source columns exist
if 'addressobj_street2' in df.columns and 'addressobj_street1' in df.columns:
    df['addressobj_street1'] = [
        _join_address_parts(street1, street2)
        for street1, street2 in zip(df['addressobj_street1'], df['addressobj_street2'])
    ]
    df = df.rename(columns={'addressobj_street1': 'addressobj_street'})
    df = df.drop(columns=['addressobj_street2'])

# Ancestor-location fields (levels 0-3) are not used in the restaurant dataset
columns_to_drop = [
    f'ancestorlocations_{level}_{field}'
    for level in range(4)
    for field in ('id', 'name', 'subcategory')
]
# errors='ignore' skips any listed column that this export does not contain
df = df.drop(columns=columns_to_drop, errors='ignore')

def _join_present(cells):
    """Return the non-NaN cells of one row as a single ', '-separated string."""
    return ', '.join(cells.dropna().astype(str))


# For each name below, merge every column called '<name>_*' into a single
# '<name>' column (comma-separated, NaN cells skipped, '' when a row has no
# value) and drop the source columns. The six groups were previously handled
# by six copy-pasted stanzas; the processing order is unchanged.
for merged_name in ('cuisines', 'dietaryrestrictions', 'dishes',
                    'features', 'mealtypes', 'establishmenttypes'):
    source_columns = [
        name for name in df.columns if name.startswith(f'{merged_name}_')
    ]
    df[merged_name] = df[source_columns].apply(_join_present, axis=1)
    df = df.drop(columns=source_columns)

# Columns whose name contains 'openhours' / 'closehours'
openhours_columns = [name for name in df.columns if "openhours" in name]
closehours_columns = [name for name in df.columns if "closehours" in name]

# Only reduce the fields when both kinds of column are present
if openhours_columns and closehours_columns:
    # Back-filling across the columns moves each row's first non-NaN value
    # into the leftmost column, which is then taken as the single value
    first_open = df[openhours_columns].bfill(axis=1).iloc[:, 0]
    first_close = df[closehours_columns].bfill(axis=1).iloc[:, 0]
    df['open_hour'] = first_open
    df['close_hour'] = first_close

    # The per-slot source columns are no longer needed
    columns_to_drop = openhours_columns + closehours_columns
    df = df.drop(columns=columns_to_drop)

# Show the resulting column set for verification
print(df.columns)

def _fallback_message(row):
    """Build the 'visit these links' placeholder for one row.

    Only non-missing, non-blank values of 'weburl' and 'website' are listed;
    a link column that does not exist contributes nothing.
    """
    candidates = (row.get('weburl'), row.get('website'))
    links = [str(link) for link in candidates if pd.notna(link) and str(link).strip()]
    return "Please visit the following links for more details: " + ', '.join(links)


def _is_blank(value):
    """Return True for strings that are empty or whitespace-only."""
    return isinstance(value, str) and not value.strip()


# Build the placeholder once, BEFORE any column is filled. Previously
# 'website' was filled part-way through, so the messages written afterwards
# ('open_hour', 'close_hour', ...) embedded the website placeholder itself.
fallback_messages = df.apply(_fallback_message, axis=1)

# Columns whose empty values are replaced by the placeholder. The merged
# list columns ('dietaryrestrictions', 'dishes', 'features', 'mealtypes')
# hold '' rather than NaN when empty, so blank strings count as missing
# too; with a NaN-only check those fills never triggered.
for column in ('description', 'email', 'dietaryrestrictions', 'dishes',
               'menuweburl', 'phone', 'pricelevel', 'subcategories_0',
               'website', 'open_hour', 'close_hour', 'features', 'mealtypes'):
    if column not in df.columns:
        continue
    values = df[column]
    is_missing = values.isna() | values.map(_is_blank).astype(bool)
    df[column] = values.where(~is_missing, fallback_messages)

# Columns that are not needed in the restaurant dataset.
# The list is generated from its repeating patterns. The hand-written
# version had a missing comma that fused 'ownerstopreasons' with
# 'ownerstopreasons_sectionheader', and one literal that contained two
# column names, both patched by repeating the names further down.
columns_to_drop = [
    'hours', 'input', 'isclaimedicon', 'isclaimedtext', 'isclosed', 'islongclosed',
    'isnearbyresult', 'localaddress', 'localname', 'opennowtext',
    'ownerstopreasons', 'ownerstopreasons_sectionheader', 'ownerstopreasons_sponsoredby',
    'photocount', 'pricerange', 'hours_timezone',
]

# orderonline_0_* fields
columns_to_drop += [
    f'orderonline_0_{field}'
    for field in ('buttontext', 'canprovidetimeslots', 'headertext', 'logourl',
                  'offerurl', 'provider', 'providerdisplayname', 'providerid',
                  'providertype')
]

# ownerstopreasons_topreasons_<0-2>_* fields
columns_to_drop += [
    f'ownerstopreasons_topreasons_{reason_index}_{field}'
    for reason_index in range(3)
    for field in ('header', 'image_url', 'issearchterm', 'keyword', 'linktext',
                  'rank', 'review_rating', 'review_reviewid', 'review_screenname',
                  'text')
]

# hours_weekranges_<0-6>_<0-2>_close / _open fields
columns_to_drop += [
    f'hours_weekranges_{range_index}_{slot_index}_{edge}'
    for range_index in range(7)
    for slot_index in range(3)
    for edge in ('close', 'open')
]

# Drop only the listed columns that this export actually contains
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Locate the review-tag columns: names starting with 'reviewtags_' that
# contain '_reviews' or '_text'
review_columns = [
    name for name in df.columns
    if name.startswith('reviewtags_') and ('_reviews' in name or '_text' in name)
]

# Bucket the columns by the second '_'-separated token of their name (the
# tag index), keeping the order in which they appear in the frame
review_pairs = {}
for name in review_columns:
    tag_index = name.split('_')[1]
    if tag_index not in review_pairs:
        review_pairs[tag_index] = []
    review_pairs[tag_index].append(name)


def _format_review_tags(row):
    """Render 'first: second' for every two-column bucket whose values are both present."""
    fragments = []
    for pair in review_pairs.values():
        if len(pair) != 2:
            continue
        first_value = row[pair[0]]
        second_value = row[pair[1]]
        if pd.notna(first_value) and pd.notna(second_value):
            fragments.append(f"{first_value}: {second_value}")
    return ', '.join(fragments)


# One combined string per row; rows with no complete pair get ''
df['reviews_and_text'] = df.apply(_format_review_tags, axis=1)

# The source columns are no longer needed
df = df.drop(columns=review_columns)

# Columns whose missing values are replaced with the text 'not provided'
columns_to_replace = [
    'latitude', 'longitude', 'travelerchoiceaward'
]

for col in columns_to_replace:
    if col in df.columns:
        df[col] = df[col].fillna('not provided')

# Give rows without any review tags the link placeholder instead
if 'reviews_and_text' in df.columns:
    _PLACEHOLDER_PREFIX = "Please visit the following links for more details: "

    def _reviews_or_links(row):
        """Return the row's review text, or the link placeholder when it is empty.

        'reviews_and_text' is built with str.join, so an empty row holds ''
        rather than NaN; blank strings are therefore treated as missing
        (a NaN-only check never fired). A 'website' value that is itself a
        placeholder from an earlier fill is not repeated inside the message.
        """
        current = row['reviews_and_text']
        if pd.notna(current) and str(current).strip():
            return current
        links = [
            str(link)
            for link in (row.get('weburl'), row.get('website'))
            if pd.notna(link)
            and str(link).strip()
            and not str(link).startswith(_PLACEHOLDER_PREFIX)
        ]
        return _PLACEHOLDER_PREFIX + ', '.join(links)

    df['reviews_and_text'] = df.apply(_reviews_or_links, axis=1)


# Write the cleaned restaurant data to a new workbook, leaving out the index
output_file = '/content/preprocessed_colombo_restaurant.xlsx'
df.to_excel(excel_writer=output_file, index=False)

# Confirm where the result was written
print("Preprocessing complete. Cleaned data saved to '{}'".format(output_file))




First few rows of the dataset:
                                             address addressObj/city  \
0  57 Ward Place Roof Top, Jetwing Colombo Seven,...         Colombo   
1              2 Galle Road, Colombo 00300 Sri Lanka         Colombo   
2  106 Thimbirigasyaya Road, Colombo 00500 Sri Lanka         Colombo   
3       10 Galle Face Drive, Colombo 00300 Sri Lanka         Colombo   
4  590 Colombo - Galle Main Road Marino Mall, Col...         Colombo   

  addressObj/country  addressObj/postalcode             addressObj/street1  \
0          Sri Lanka                  700.0                  57 Ward Place   
1          Sri Lanka                  300.0                   2 Galle Road   
2          Sri Lanka                  500.0       106 Thimbirigasyaya Road   
3          Sri Lanka                  300.0            10 Galle Face Drive   
4          Sri Lanka                    NaN  590 Colombo - Galle Main Road   

                addressObj/street2  ancestorLocations/0/id  \
0  Ro

In [None]:
import pandas as pd

# Read the user-input workbook into a DataFrame
file_path = '/content/user_inputs.xlsx'
df = pd.read_excel(file_path)

# Preview the raw data before any cleaning
print("First few rows of the dataset:")
print(df.head())


def _normalize_header(header):
    """Trim a column header, lower-case it, and turn spaces and '/' into '_'."""
    return header.strip().lower().replace(' ', '_').replace('/', '_')


# Standardize every column name so later steps can refer to columns by a predictable key
df.columns = list(map(_normalize_header, df.columns))

# Remove columns that are not needed downstream; names missing from the frame are ignored
columns_to_drop = ['timestamp']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Free-text columns whose empty cells should carry an explicit placeholder
columns_to_replace = [
    'is_there_anything_else_you_would_like_us_to_know_about_your_travel_preferences?'
]

# Fill gaps with the 'not provided' placeholder; names absent from the frame are skipped
for col in columns_to_replace:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna('not provided')

# Persist the cleaned user inputs as a new Excel workbook.
# The row index is left out so the sheet contains only the data columns.
output_file = '/content/preprocessed_user_inputs.xlsx'
df.to_excel(excel_writer=output_file, index=False)

print(f"Preprocessing complete. Cleaned data saved to '{output_file}'")



First few rows of the dataset:
                Timestamp      Name   \
0 2024-11-04 10:43:35.738  Mathusha    
1 2024-11-04 11:13:51.504     nielia   
2 2024-11-04 11:14:01.859     Ehansa   
3 2024-11-04 11:32:33.207     Oshini   
4 2024-11-04 11:33:55.682       Umar   

  Do you wish to have a trip itinerary generator website for your future trips?   \
0                                                Yes                               
1                                                Yes                               
2                                                Yes                               
3                                                Yes                               
4                                                Yes                               

  How often do you travel per year?  How many people are traveling?   \
0               Rarely (1 - 2 times)                          Couple   
1         Occasionally (3 - 5 times)                            Solo   
2      

In [None]:
!pip install -q tensorflow-ranking
!pip install -q --upgrade tensorflow-datasets

from typing import Dict, Tuple

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
# tfr supplies the ranking loss, the ranking metrics and sort_by_scores used later in this cell.
# NOTE(review): this import was previously commented out -- if it fails, check that the
# installed tensorflow-ranking release is compatible with the runtime's TensorFlow/Keras.
import tensorflow_ranking as tfr

# Load datasets
def load_excel(file_path):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_path: location of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with its default options.
    """
    frame = pd.read_excel(io=file_path)
    return frame

# Columns kept from each destination sheet and from the user sheet
destination_fields = ['id', 'name', 'type', 'rating']
user_fields = ['user_id', 'preferred_destination_id', 'rating']

# Read the four preprocessed workbooks
hotels_df = load_excel('/content/preprocessed_hotel_data.xlsx')
restaurants_df = load_excel('/content/preprocessed_colombo_restaurant.xlsx')
users_df = load_excel('/content/preprocessed_user_inputs.xlsx')
vacation_rentals_df = load_excel('/content/preprocessed_vacation_rental.xlsx')

# Narrow every frame to the fields the ranking pipeline reads.
# A KeyError is raised here when a sheet lacks one of the requested columns.
# NOTE(review): the user-input sheet previewed earlier in this notebook holds survey answers;
# confirm it really carries 'user_id', 'preferred_destination_id' and 'rating'.
hotels_df = hotels_df.loc[:, destination_fields]
restaurants_df = restaurants_df.loc[:, destination_fields]
vacation_rentals_df = vacation_rentals_df.loc[:, destination_fields]
users_df = users_df.loc[:, user_fields]

# Stack hotels, restaurants and vacation rentals into one destination table
destinations_df = pd.concat([hotels_df, restaurants_df, vacation_rentals_df], axis=0)

# Prepare TensorFlow Datasets
# One example per user/destination interaction. Ids are stringified so they can be fed to
# StringLookup; ratings are cast to float32 so the labels match the model's float32 scores.
ratings = tf.data.Dataset.from_tensor_slices({
    "user_id": users_df['user_id'].astype(str).to_numpy(),
    "destination_id": users_df['preferred_destination_id'].astype(str).to_numpy(),
    "user_rating": users_df['rating'].astype('float32').to_numpy()
})

# Candidate destinations, keyed by id rather than by name: the ratings refer to destinations
# through 'preferred_destination_id', so the vocabulary must be built from the same key space,
# otherwise every training destination falls into the out-of-vocabulary bucket.
# Duplicate ids (the table is a concatenation of three sheets) are collapsed so a destination
# appears only once among the candidates.
# NOTE(review): str() of a float-typed id column yields e.g. '12.0' -- confirm both id columns
# share the same dtype so the strings actually match.
destination_keys = destinations_df['id'].astype(str).drop_duplicates().to_numpy()
destinations = tf.data.Dataset.from_tensor_slices(destination_keys)
users = ratings.map(lambda x: x["user_id"])

# Vocabulary setup for users and destinations
# mask_token=None: no index is reserved for padding, index 0 is the out-of-vocabulary bucket
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(users.batch(1000))

destination_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
destination_ids_vocabulary.adapt(destinations.batch(1000))

# Group dataset for training
def key_func(x):
    """Return the vocabulary index of the example's user, used as the grouping key."""
    return user_ids_vocabulary(x["user_id"])


def reduce_func(key, dataset):
    """Turn one user's window of examples into a single batch of at most 100 items."""
    return dataset.batch(100)


# Feature-label split function
def _features_and_labels(x):
    """Split a grouped example into (features, labels).

    Removes "user_rating" from the feature dict and returns it as the label;
    the remaining entries of ``x`` are the features.
    """
    user_rating = x.pop("user_rating")
    return x, user_rating


# Per-user lists -> (features, labels) pairs -> ragged batches of 32 lists,
# ragged because the lists can differ in length
ds_train = (
    ratings
    .group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=100)
    .map(_features_and_labels)
    .apply(tf.data.experimental.dense_to_ragged_batch(batch_size=32))
)

# Define Travel Itinerary Recommendation Model
class ItineraryRankingModel(tf.keras.Model):
    """Score user/destination pairs with the dot product of two learned embeddings.

    Args:
        user_vocab: lookup layer mapping raw user ids to integer indices; its
            ``vocabulary_size()`` sets the number of user embedding rows.
        destination_vocab: lookup layer mapping raw destination ids to integer
            indices; its ``vocabulary_size()`` sets the number of destination
            embedding rows.
    """

    def __init__(self, user_vocab, destination_vocab):
        super().__init__()
        embedding_dim = 64

        # Keep the lookup layers so call() can translate raw ids itself
        self.user_vocab = user_vocab
        self.destination_vocab = destination_vocab

        # One embedding row per vocabulary entry
        self.user_embed = tf.keras.layers.Embedding(
            user_vocab.vocabulary_size(), embedding_dim
        )
        self.destination_embed = tf.keras.layers.Embedding(
            destination_vocab.vocabulary_size(), embedding_dim
        )

    def call(self, features):
        """Return one score per (user, destination) pair.

        Args:
            features: mapping with "user_id" and "destination_id" entries.
                The reduction over axis 2 implies the ids arrive as rank-2
                inputs (presumably [batch, list]) -- confirm against callers.

        Returns:
            The embeddings' element-wise product summed over the embedding axis.
        """
        user_indices = self.user_vocab(features["user_id"])
        user_vectors = self.user_embed(user_indices)

        destination_indices = self.destination_vocab(features["destination_id"])
        destination_vectors = self.destination_embed(destination_indices)

        # Dot product along the embedding dimension
        return tf.reduce_sum(user_vectors * destination_vectors, axis=2)

# Instantiate and compile the model
model = ItineraryRankingModel(user_ids_vocabulary, destination_ids_vocabulary)
optimizer = tf.keras.optimizers.Adagrad(0.5)
# ragged=True: the training batches are ragged (lists of different lengths per user)
loss = tfr.keras.losses.get(
    loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True
)
eval_metrics = [
    tfr.keras.metrics.get(key="ndcg", name="metric/ndcg", ragged=True),
    tfr.keras.metrics.get(key="mrr", name="metric/mrr", ragged=True),
]
model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

# Train the model
model.fit(ds_train, epochs=3)

# Prepare candidate destinations
# Take every candidate in one batch; a fixed batch size would silently drop
# candidates beyond that size.
candidate_batch_size = max(len(destinations_df), 1)
destination_names = next(iter(destinations.batch(candidate_batch_size)), None)
if destination_names is None:
    raise ValueError("No candidate destinations available to rank.")

# Generate personalized recommendations for a specific user
user_id = "42"  # Replace with desired user ID
# The model expects [batch, list] inputs: one list holding every candidate,
# with the user id repeated once per candidate
inputs = {
    "user_id": tf.expand_dims(tf.repeat(user_id, repeats=destination_names.shape[0]), axis=0),
    "destination_id": tf.expand_dims(destination_names, axis=0)
}

# Get recommendations
scores = model(inputs)
recommended_destinations = tfr.utils.sort_by_scores(scores, [tf.expand_dims(destination_names, axis=0)])[0]

# Print the top 5 recommendations
# Candidate values are byte strings; decode them and, when a value is a destination id,
# show the matching name instead. Values that are not known ids are printed unchanged.
display_names = dict(zip(destinations_df['id'].astype(str), destinations_df['name'].astype(str)))
top_candidates = [raw.decode('utf-8') for raw in recommended_destinations[0, :5].numpy()]
top_recommendations = [display_names.get(candidate, candidate) for candidate in top_candidates]
print(f"Top 5 recommendations for user {user_id}: {top_recommendations}")


KeyError: "None of [Index(['actual_user_id_column', 'actual_preferred_destination_column',\n       'actual_rating_column'],\n      dtype='object')] are in the [columns]"