<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/E_Personalized_Itinerary_Generator/model_training_All_cities7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re



drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx')
Hotels = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv')
Restaurants = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv')
Attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx')

# ---------------------------- #
# Step 2: Clean Column Names (Remove Spaces)
# ---------------------------- #
def clean_column_names(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

user_inputs = clean_column_names(user_inputs)
Hotels = clean_column_names(Hotels)
Restaurants = clean_column_names(Restaurants)
Attractions = clean_column_names(Attractions)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Allowed destinations
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Encode destinations
all_destinations = set()
for dest in user_inputs['destination'].dropna():
    all_destinations.update(dest.split(', '))
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(all_destinations)) if dest in allowed_destinations}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip()) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in Restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in Hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    if pd.isna(dietary_str):
        return set()
    restrictions = set(dietary_str.split(', '))
    if restrictions & veg_friendly:
        return {1}  # Veg
    elif restrictions & non_veg_friendly:
        return {3}  # Non-Veg
    return set()

Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(lambda x: {cuisine_encoder[c] for c in x.split(', ')} if pd.notna(x) else set())
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(lambda x: {amenity_encoder[a] for a in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
def extract_city(address):
    match = re.search(r'([A-Za-z ]+),? Sri Lanka', str(address))
    return match.group(1).strip() if match else None

Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: Matching Functions
# ---------------------------- #
def match_best_hotels(user_budget, user_destination, hotels_df):
    hotels_in_city = hotels_df[hotels_df['extracted_city'] == user_destination]
    return hotels_in_city.sort_values(by=['rating', 'rankingposition'], ascending=[False, True])

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()
    user_encoded_cuisines = {cuisine_encoder.get(c, 0) for c in user_cuisine_pref.split(', ') if c in cuisine_encoder}
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)

    def calculate_match_score(row):
        cuisine_score = len(user_encoded_cuisines.intersection(row['encoded_cuisines']))
        dietary_score = int(user_encoded_food_pref in row['encoded_dietaryrestrictions'])
        return cuisine_score + dietary_score

    restaurants_in_city['match_score'] = restaurants_in_city.apply(calculate_match_score, axis=1)
    return restaurants_in_city.sort_values(by='match_score', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) & (attractions_df['Lowest_Price'] <= user_budget)]
    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])

# ---------------------------- #
# Step 6: Generate Itinerary
# ---------------------------- #

for index, user in user_inputs.iterrows():
    destinations = user['destination'].split(', ')
    num_days = user['number_of_days']
    selected_destinations = destinations[:2] if num_days >= 4 else [destinations[0]]

    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {', '.join(selected_destinations)}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    for day in range(num_days):
        current_destination = selected_destinations[min(day // 2, len(selected_destinations) - 1)]
        recommended_hotels = match_best_hotels(user['budget_per_day'], current_destination, Hotels)
        recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], current_destination, Restaurants)
        recommended_attractions = match_best_attractions(user['budget_per_day'], current_destination, Attractions)

        if recommended_hotels.empty:
            print("No hotels found for the selected destination and budget.")
            continue  # Skip to the next user

        # Change the hotel every 3 days within a destination
        hotel = recommended_hotels.iloc[(day // 3) % len(recommended_hotels)]


        breakfast_restaurant = recommended_restaurants.iloc[day % len(recommended_restaurants)]
        attraction_1 = recommended_attractions.iloc[day % len(recommended_attractions)]
        lunch_restaurant = recommended_restaurants.iloc[(day + 1) % len(recommended_restaurants)]
        attraction_2 = recommended_attractions.iloc[(day + 1) % len(recommended_attractions)]
        dinner_restaurant = recommended_restaurants.iloc[(day + 2) % len(recommended_restaurants)]

        print(f"\nDay {day + 1} - {current_destination}")
        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']})")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']})")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']})")
        print("Returning to Hotel\n")
        print("-" * 40)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Attraction 2: Honey Bee Garden (Lowest Price: No price mentioned)
Dinner: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Returning to Hotel

----------------------------------------

Day 4 - Ella
Hotel: La Ella Breeze (Rating: 4.0)
Breakfast: The Pastry House (Cuisines: Italian, Chinese, Indian, Asian, Sri Lankan)
Attraction 1: Honey Bee Garden (Lowest Price: No price mentioned)
Lunch: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Attraction 2: Priya Cookery Class (Lowest Price: No price mentioned)
Dinner: The True Food Restaurant Ella (Cuisines: Chinese, Indian, Asian, Sri Lankan)
Returning to Hotel

----------------------------------------

Day 5 - Ella
Hotel: La Ella Breeze (Rating: 4.0)
Breakfast: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Attraction 1: Priya Cookery Class (Lowest Price: No price mentioned)
Lunch: The True Food Restaurant Ella (Cui

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.neighbors import NearestNeighbors

# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx')
Hotels = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv')
Restaurants = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv')
Attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx')

# ---------------------------- #
# Step 2: Clean Column Names (Remove Spaces)
# ---------------------------- #
def clean_column_names(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

user_inputs = clean_column_names(user_inputs)
Hotels = clean_column_names(Hotels)
Restaurants = clean_column_names(Restaurants)
Attractions = clean_column_names(Attractions)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Allowed destinations
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Encode destinations
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(allowed_destinations))}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip(), 0) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in Restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in Hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    if pd.isna(dietary_str):
        return set()
    restrictions = set(dietary_str.split(', '))
    if restrictions & veg_friendly:
        return {1}  # Veg
    elif restrictions & non_veg_friendly:
        return {3}  # Non-Veg
    return set()

Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(lambda x: {cuisine_encoder.get(c, 0) for c in x.split(', ')} if pd.notna(x) else set())
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(lambda x: {amenity_encoder.get(a, 0) for a in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
def extract_city(address):
    match = re.search(r'([A-Za-z ]+),? Sri Lanka', str(address))
    return match.group(1).strip() if match else None

Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: ML-Based Matching Functions
# ---------------------------- #
def match_best_hotels(user_budget, user_destination, hotels_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame(columns=['name', 'rating'])
    hotels_in_city = hotels_df[hotels_df['extracted_city'] == user_destination]
    return hotels_in_city.sort_values(by=['rating', 'rankingposition'], ascending=[False, True])

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame(columns=['name', 'cuisines'])
    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()
    return restaurants_in_city.sort_values(by='rating', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame(columns=['Name', 'Lowest_Price'])
    attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) & (attractions_df['Lowest_Price'] <= user_budget)]
    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])

# ---------------------------- #
# Step 6: Generate Itinerary
# ---------------------------- #

for index, user in user_inputs.iterrows():
    destinations = user['destination'].split(', ')
    num_days = user['number_of_days']

    # Handle multiple destinations: Assign 2 days per destination if num_days > 4
    if num_days >= 4:
        selected_destinations = destinations[:2]  # First two destinations
    else:
        selected_destinations = [destinations[0]]  # First destination

    # Print user details
    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destinations: {', '.join(selected_destinations)}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    for day in range(num_days):
        # Determine the current destination based on day and number of destinations
        current_destination = selected_destinations[min(day // 2, len(selected_destinations) - 1)]

        # Hotel recommendation: Change every 3 days if staying in one destination
        recommended_hotels = match_best_hotels(user['budget_per_day'], current_destination, Hotels)
        hotel = recommended_hotels.iloc[(day // 3) % len(recommended_hotels)] if not recommended_hotels.empty else None

        # Restaurant recommendations for breakfast, lunch, and dinner
        recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], current_destination, Restaurants)
        breakfast_restaurant = recommended_restaurants.iloc[day % len(recommended_restaurants)] if not recommended_restaurants.empty else None
        lunch_restaurant = recommended_restaurants.iloc[(day + 1) % len(recommended_restaurants)] if not recommended_restaurants.empty else None
        dinner_restaurant = recommended_restaurants.iloc[(day + 2) % len(recommended_restaurants)] if not recommended_restaurants.empty else None

        # Attraction recommendations
        recommended_attractions = match_best_attractions(user['budget_per_day'], current_destination, Attractions)
        attraction_1 = recommended_attractions.iloc[0] if not recommended_attractions.empty else None
        attraction_2 = recommended_attractions.iloc[1] if len(recommended_attractions) > 1 else None

        # Output the itinerary for the day
        print(f"\nDay {day + 1} - {current_destination}")
        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})" if hotel is not None else "Hotel: Information Not Available")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']}, Price: {breakfast_restaurant['pricelevel_lkr']})" if breakfast_restaurant is not None else "Breakfast: Information Not Available")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})" if attraction_1 is not None else "Attraction 1: Information Not Available")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']}, Price: {lunch_restaurant['pricelevel_lkr']})" if lunch_restaurant is not None else "Lunch: Information Not Available")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})" if attraction_2 is not None else "Attraction 2: Information Not Available")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']}, Price: {dinner_restaurant['pricelevel_lkr']})" if dinner_restaurant is not None else "Dinner: Information Not Available")
        print("Returning to Hotel\n")
        print("-" * 40)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Attraction 2: Yoga Reiki Meditation (Lowest Price: No price mentioned)
Dinner: The Colonial Bistro & Lounge (Cuisines: Steakhouse, Bar, Barbecue, Asian, Pub, Sri Lankan, Beer restaurants, Price: 600)
Returning to Hotel

----------------------------------------

Day 2 - Kandy
Hotel: Railway Retiring Rooms Kandy (Rating: 3.5)
Breakfast: Pepe's Rustic Italian (Cuisines: Italian, Bar, Pub, Price: 750)
Attraction 1: Aaliya Tours (Lowest Price: LKR 6,406.52)
Lunch: The Colonial Bistro & Lounge (Cuisines: Steakhouse, Bar, Barbecue, Asian, Pub, Sri Lankan, Beer restaurants, Price: 600)
Attraction 2: Yoga Reiki Meditation (Lowest Price: No price mentioned)
Dinner: The Blizz Cafe (Cuisines: Italian, Chinese, Cafe, Asian, Soups, Price: 750)
Returning to Hotel

----------------------------------------

Day 3 - Kandy
Hotel: Railway Retiring Rooms Kandy (Rating: 3.5)
Breakfast: The Colonial Bistro & Lounge (Cuisines: Steakhouse, Bar, B

In [None]:
# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx')
Hotels = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv')
Restaurants = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv')
Attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx')

# ---------------------------- #
# Step 2: Clean Column Names (Remove Spaces)
# ---------------------------- #
def clean_column_names(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

user_inputs = clean_column_names(user_inputs)
Hotels = clean_column_names(Hotels)
Restaurants = clean_column_names(Restaurants)
Attractions = clean_column_names(Attractions)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Allowed destinations
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Encode destinations
all_destinations = set()
for dest in user_inputs['destination'].dropna():
    all_destinations.update(dest.split(', '))
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(all_destinations)) if dest in allowed_destinations}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip()) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in Restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in Hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    if pd.isna(dietary_str):
        return set()
    restrictions = set(dietary_str.split(', '))
    if restrictions & veg_friendly:
        return {1}  # Veg
    elif restrictions & non_veg_friendly:
        return {3}  # Non-Veg
    return set()

Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(lambda x: {cuisine_encoder[c] for c in x.split(', ')} if pd.notna(x) else set())
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(lambda x: {amenity_encoder[a] for a in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
def extract_city(address):
    match = re.search(r'([A-Za-z ]+),? Sri Lanka', str(address))
    return match.group(1).strip() if match else None

Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: Clean Price Data (Attractions, Restaurants, Hotels, and User Budget)
# ---------------------------- #

# Function to clean the prices
def clean_price(price_str):
    if pd.isna(price_str):
        return None
    if 'No price mentioned' in price_str:
        return None
    price_str = price_str.replace('LKR', '').replace(',', '').strip()
    try:
        return float(price_str)
    except ValueError:
        return None

# Clean 'Lowest_Price' in Attractions
Attractions['Lowest_Price'] = Attractions['Lowest_Price'].apply(clean_price)

# Clean 'priceRange_LKR' in Hotels
Hotels['pricerange'] = Hotels['pricerange'].apply(clean_price)

# Clean 'budget_per_day' in user inputs
def clean_budget(budget_str):
    if pd.isna(budget_str):
        return None
    if 'Rs.' not in budget_str:
        return None
    budget_str = budget_str.replace('Rs.', '').replace(',', '').strip()
    if '-' in budget_str:
        lower, upper = budget_str.split(' - ')
        return (float(lower.strip()), float(upper.strip()))
    elif '+' in budget_str:
        return float(budget_str.replace('+', '').strip())
    return float(budget_str.strip())

user_inputs['cleaned_budget_per_day'] = user_inputs['budget_per_day'].apply(clean_budget)

# ---------------------------- #
# Step 6: Matching Functions (With Budget Constraints)
# ---------------------------- #

def match_best_hotels(user_budget, user_destination, hotels_df):
    hotels_in_city = hotels_df[hotels_df['extracted_city'] == user_destination]
    hotels_in_city = hotels_in_city[hotels_in_city['pricerange'] <= user_budget] if isinstance(user_budget, float) else hotels_in_city
    return hotels_in_city.sort_values(by=['rating', 'rankingposition'], ascending=[False, True])

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df, user_budget):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()
    user_encoded_cuisines = {cuisine_encoder.get(c, 0) for c in user_cuisine_pref.split(', ') if c in cuisine_encoder}
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)

    def calculate_match_score(row):
        cuisine_score = len(user_encoded_cuisines.intersection(row['encoded_cuisines']))
        dietary_score = int(user_encoded_food_pref in row['encoded_dietaryrestrictions'])
        return cuisine_score + dietary_score

    restaurants_in_city['match_score'] = restaurants_in_city.apply(calculate_match_score, axis=1)
    # Filter by budget
    restaurants_in_city = restaurants_in_city[restaurants_in_city['pricelevel_lkr'] <= user_budget] if isinstance(user_budget, float) else restaurants_in_city
    return restaurants_in_city.sort_values(by='match_score', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()

    # Handle the case where the budget is a range (tuple)
    if isinstance(user_budget, tuple):
        min_budget, max_budget = user_budget
        attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) &
                                             (attractions_df['Lowest_Price'] >= min_budget) &
                                             (attractions_df['Lowest_Price'] <= max_budget)]
    else:
        attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) &
                                             (attractions_df['Lowest_Price'] <= user_budget)]

    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])


# ---------------------------- #
# Step 7: Generate Itinerary (With Budget Constraints)
# ---------------------------- #

for index, user in user_inputs.iterrows():
    destinations = user['destination'].split(', ')
    num_days = user['number_of_days']
    selected_destinations = destinations[:2] if num_days >= 4 else [destinations[0]]

    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {', '.join(selected_destinations)}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    for day in range(num_days):
        current_destination = selected_destinations[min(day // 2, len(selected_destinations) - 1)]
        recommended_hotels = match_best_hotels(user['cleaned_budget_per_day'], current_destination, Hotels)
        recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], current_destination, Restaurants, user['cleaned_budget_per_day'])
        recommended_attractions = match_best_attractions(user['cleaned_budget_per_day'], current_destination, Attractions)

        if recommended_hotels.empty:
          print("No hotels found for the selected destination and budget.")
          continue  # Skip to the next user

        if recommended_restaurants.empty:
          print(f"No restaurants found for the selected destination on Day {day + 1}.")
          continue  # Skip to the next day

        if recommended_attractions.empty:
          print(f"No attractions found for the selected destination on Day {day + 1}.")
          continue  # Skip to the next day

        # Now you can safely proceed to the original logic
        hotel = recommended_hotels.iloc[(day // 3) % len(recommended_hotels)]

        breakfast_restaurant = recommended_restaurants.iloc[day % len(recommended_restaurants)]
        attraction_1 = recommended_attractions.iloc[day % len(recommended_attractions)]
        lunch_restaurant = recommended_restaurants.iloc[(day + 1) % len(recommended_restaurants)]
        attraction_2 = recommended_attractions.iloc[(day + 1) % len(recommended_attractions)]
        dinner_restaurant = recommended_restaurants.iloc[(day + 2) % len(recommended_restaurants)]

        print(f"\nDay {day + 1} - {current_destination}")
        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']})")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']})")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']})")
        print("Returning to Hotel\n")
        print("-" * 40)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Attraction 1: Lover's Leap Falls (Lowest Price: 7611.24)
Lunch: Fusion Grill (Cuisines: Italian, Indian, Asian, Sri Lankan, Arabic)
Attraction 2: Pedro Tea Factory (Lowest Price: 7611.24)
Dinner: La Cottage Boutique Hotel & Restuarant (Cuisines: Chinese, Indian, European, Asian, Sri Lankan, Arabic)
Returning to Hotel

----------------------------------------

===== User 150: Daham  =====
Destination: Nuwara Eliya
Cuisine Preference: Sri Lankan, Indian, Italian, Western
Food Preference: Non-Veg
Budget Per Day: Rs. 5100 - Rs. 10000

Day 1 - Nuwara Eliya
Hotel: Woodnest Sithula Garden Resort (Rating: 5)
Breakfast: Pub Nuwara Eliya (Cuisines: Italian, French, Indian, Irish, Brew Pub, Bar, Cafe, Seafood, Fast Food, European, Asian, Pub, Wine Bar, Street Food, Swedish, Sri Lankan, Arabic, Fruit parlours, Dining bars, Beer restaurants)
Attraction 1: Lover's Leap Falls (Lowest Price: 7611.24)
Lunch: Midky Restaurant (Cuisines: It

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx')
Hotels = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv')
Restaurants = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv')
Attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx')

# ---------------------------- #
# Step 2: Clean Column Names (Remove Spaces)
# ---------------------------- #
def clean_column_names(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

user_inputs = clean_column_names(user_inputs)
Hotels = clean_column_names(Hotels)
Restaurants = clean_column_names(Restaurants)
Attractions = clean_column_names(Attractions)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Allowed destinations
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Encode destinations
all_destinations = set()
for dest in user_inputs['destination'].dropna():
    all_destinations.update(dest.split(', '))
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(all_destinations)) if dest in allowed_destinations}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip()) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in Restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in Hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    if pd.isna(dietary_str):
        return set()
    restrictions = set(dietary_str.split(', '))
    if restrictions & veg_friendly:
        return {1}  # Veg
    elif restrictions & non_veg_friendly:
        return {3}  # Non-Veg
    return set()

Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(lambda x: {cuisine_encoder[c] for c in x.split(', ')} if pd.notna(x) else set())
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(lambda x: {amenity_encoder[a] for a in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
def extract_city(address):
    match = re.search(r'([A-Za-z ]+),? Sri Lanka', str(address))
    return match.group(1).strip() if match else None

Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: Clean Price Data (Attractions, Restaurants, Hotels, and User Budget)
# ---------------------------- #

# Function to clean the prices
def clean_price(price_str):
    if pd.isna(price_str):
        return None
    if 'No price mentioned' in price_str:
        return None
    price_str = price_str.replace('LKR', '').replace(',', '').strip()
    try:
        return float(price_str)
    except ValueError:
        return None

# Clean 'Lowest_Price' in Attractions
Attractions['Lowest_Price'] = Attractions['Lowest_Price'].apply(clean_price)

# Clean 'priceRange_LKR' in Hotels
Hotels['pricerange'] = Hotels['pricerange'].apply(clean_price)

# Clean 'budget_per_day' in user inputs
def clean_budget(budget_str):
    if pd.isna(budget_str):
        return None
    if 'Rs.' not in budget_str:
        return None
    budget_str = budget_str.replace('Rs.', '').replace(',', '').strip()
    if '-' in budget_str:
        lower, upper = budget_str.split(' - ')
        return (float(lower.strip()), float(upper.strip()))
    elif '+' in budget_str:
        return float(budget_str.replace('+', '').strip())
    return float(budget_str.strip())

user_inputs['cleaned_budget_per_day'] = user_inputs['budget_per_day'].apply(clean_budget)

# ---------------------------- #
# Step 6: Matching Functions (With Budget Constraints)
# ---------------------------- #

def match_best_hotels(user_budget, user_destination, hotels_df):
    hotels_in_city = hotels_df[hotels_df['extracted_city'] == user_destination]
    hotels_in_city = hotels_in_city[hotels_in_city['pricerange'] <= user_budget] if isinstance(user_budget, float) else hotels_in_city
    return hotels_in_city.sort_values(by=['rating', 'rankingposition'], ascending=[False, True])

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df, user_budget):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()
    user_encoded_cuisines = {cuisine_encoder.get(c, 0) for c in user_cuisine_pref.split(', ') if c in cuisine_encoder}
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)

    def calculate_match_score(row):
        cuisine_score = len(user_encoded_cuisines.intersection(row['encoded_cuisines']))
        dietary_score = int(user_encoded_food_pref in row['encoded_dietaryrestrictions'])
        return cuisine_score + dietary_score

    restaurants_in_city['match_score'] = restaurants_in_city.apply(calculate_match_score, axis=1)
    # Filter by budget
    restaurants_in_city = restaurants_in_city[restaurants_in_city['pricelevel_lkr'] <= user_budget] if isinstance(user_budget, float) else restaurants_in_city
    return restaurants_in_city.sort_values(by='match_score', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()

    # Handle the case where the budget is a range (tuple)
    if isinstance(user_budget, tuple):
        min_budget, max_budget = user_budget
        attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) &
                                             (attractions_df['Lowest_Price'] >= min_budget) &
                                             (attractions_df['Lowest_Price'] <= max_budget)]
    else:
        attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) &
                                             (attractions_df['Lowest_Price'] <= user_budget)]

    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])

# ---------------------------- #
# Step 8: Feature Engineering for Content-Based Filtering
# ---------------------------- #

def replace_not_rated(value, median_value):
    return median_value if value == 'not rated' else value

def replace_no_ranking(value, median_value):
    return median_value if value == 'no ranking' else value

# Convert 'not rated' and 'no ranking' values to NaN, compute median, and replace
Attractions['Rating'] = pd.to_numeric(Attractions['Rating'], errors='coerce')
median_attraction_rating = Attractions['Rating'].median()
Attractions['Rating'].fillna(median_attraction_rating, inplace=True)

Hotels['rating'] = pd.to_numeric(Hotels['rating'], errors='coerce')
median_hotel_rating = Hotels['rating'].median()
Hotels['rating'].fillna(median_hotel_rating, inplace=True)

Attractions['Ranking_Position'] = pd.to_numeric(Attractions['Ranking_Position'], errors='coerce')
median_attraction_ranking = Attractions['Ranking_Position'].median()
Attractions['Ranking_Position'].fillna(median_attraction_ranking, inplace=True)

Hotels['rankingposition'] = pd.to_numeric(Hotels['rankingposition'], errors='coerce')
median_hotel_ranking = Hotels['rankingposition'].median()
Hotels['rankingposition'].fillna(median_hotel_ranking, inplace=True)

# Normalize and transform data for similarity calculations
def create_feature_matrix(df, feature_columns):
    scaler = StandardScaler()
    feature_matrix = scaler.fit_transform(df[feature_columns].fillna(0))
    return feature_matrix

# Feature columns for hotels and attractions
hotel_features = ['rating', 'rankingposition', 'pricerange']
attraction_features = ['Rating', 'Ranking_Position', 'Lowest_Price']

Hotels['feature_matrix'] = list(create_feature_matrix(Hotels, hotel_features))
Attractions['feature_matrix'] = list(create_feature_matrix(Attractions, attraction_features))

# ---------------------------- #
# Step 9: Content-Based Filtering for Hotels
# ---------------------------- #

def recommend_similar_hotels(hotel_df, user_budget, user_destination, top_n=5):
    hotels_in_city = hotel_df[hotel_df['extracted_city'] == user_destination].copy()
    if isinstance(user_budget, float):
        hotels_in_city = hotels_in_city[hotels_in_city['pricerange'] <= user_budget]

    if hotels_in_city.empty:
        return pd.DataFrame()

    feature_matrix = np.stack(hotels_in_city['feature_matrix'].values)
    similarity_matrix = cosine_similarity(feature_matrix)

    # Rank hotels based on similarity to all other hotels
    avg_similarity = similarity_matrix.mean(axis=1)
    hotels_in_city.loc[:, 'similarity_score'] = avg_similarity
    return hotels_in_city.sort_values(by=['similarity_score', 'rating', 'rankingposition'], ascending=[False, False, True]).head(top_n)

# ---------------------------- #
# Step 10: Content-Based Filtering for Attractions
# ---------------------------- #

def recommend_similar_attractions(attraction_df, user_budget, user_destination, top_n=5):
    attractions_in_city = attraction_df[attraction_df['City'] == user_destination].copy()

    if isinstance(user_budget, tuple):
        min_budget, max_budget = user_budget
        attractions_in_city = attractions_in_city[(attractions_in_city['Lowest_Price'] >= min_budget) & (attractions_in_city['Lowest_Price'] <= max_budget)]
    elif isinstance(user_budget, float):
        attractions_in_city = attractions_in_city[attractions_in_city['Lowest_Price'] <= user_budget]

    if attractions_in_city.empty:
        return pd.DataFrame()

    feature_matrix = np.stack(attractions_in_city['feature_matrix'].values)
    similarity_matrix = cosine_similarity(feature_matrix)

    # Rank attractions based on similarity
    avg_similarity = similarity_matrix.mean(axis=1)
    attractions_in_city.loc[:, 'similarity_score'] = avg_similarity
    return attractions_in_city.sort_values(by=['similarity_score', 'Rating', 'Ranking_Position'], ascending=[False, False, True]).head(top_n)

# ---------------------------- #
# Step 11: Integrating Enhanced Recommendations into Itinerary
# ---------------------------- #

for index, user in user_inputs.iterrows():
    destinations = user['destination'].split(', ')
    num_days = user['number_of_days']
    selected_destinations = destinations[:2] if num_days >= 4 else [destinations[0]]

    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {', '.join(selected_destinations)}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    hotel_stay_counter = 0  # Track how long user stays in a hotel
    current_hotel = None  # Store the current hotel

    for day in range(num_days):
        current_destination = selected_destinations[min(day // 2, len(selected_destinations) - 1)]

        if hotel_stay_counter == 0 or hotel_stay_counter >= 3:
            recommended_hotels = recommend_similar_hotels(Hotels, user['cleaned_budget_per_day'], current_destination)
            if not recommended_hotels.empty:
                current_hotel = recommended_hotels.iloc[0]
                hotel_stay_counter = 1  # Reset the counter after changing hotel
            else:
                print("No hotels found for the selected destination and budget.")
                continue
        else:
            hotel_stay_counter += 1  # Increment stay counter if continuing in the same hotel

        recommended_attractions = recommend_similar_attractions(Attractions, user['cleaned_budget_per_day'], current_destination)
        recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], current_destination, Restaurants, user['cleaned_budget_per_day'])

        if recommended_restaurants.empty:
            print(f"No restaurants found for {current_destination} on Day {day + 1}.")
            continue

        if recommended_attractions.empty:
            print(f"No attractions found for {current_destination} on Day {day + 1}.")
            continue

        breakfast_restaurant = recommended_restaurants.iloc[day % len(recommended_restaurants)]
        attraction_1 = recommended_attractions.iloc[day % len(recommended_attractions)]
        lunch_restaurant = recommended_restaurants.iloc[(day + 1) % len(recommended_restaurants)]
        attraction_2 = recommended_attractions.iloc[(day + 1) % len(recommended_attractions)]
        dinner_restaurant = recommended_restaurants.iloc[(day + 2) % len(recommended_restaurants)]

        print(f"\nDay {day + 1} - {current_destination}")
        print(f"Hotel: {current_hotel['name']} (Rating: {current_hotel['rating']})")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']})")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']})")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']})")
        print("Returning to Hotel\n")
        print("-" * 40)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Attractions['Rating'].fillna(median_attraction_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Hotels['rating'].fillna(median_hotel_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Attraction 1: Pedro Tea Factory (Lowest Price: 7611.24)
Lunch: Fusion Grill (Cuisines: Italian, Indian, Asian, Sri Lankan, Arabic)
Attraction 2: Lover's Leap Falls (Lowest Price: 7611.24)
Dinner: La Cottage Boutique Hotel & Restuarant (Cuisines: Chinese, Indian, European, Asian, Sri Lankan, Arabic)
Returning to Hotel

----------------------------------------

===== User 150: Daham  =====
Destination: Nuwara Eliya
Cuisine Preference: Sri Lankan, Indian, Italian, Western
Food Preference: Non-Veg
Budget Per Day: Rs. 5100 - Rs. 10000

Day 1 - Nuwara Eliya
Hotel: Serenus Boutique Villa (Rating: 4.5)
Breakfast: Pub Nuwara Eliya (Cuisines: Italian, French, Indian, Irish, Brew Pub, Bar, Cafe, Seafood, Fast Food, European, Asian, Pub, Wine Bar, Street Food, Swedish, Sri Lankan, Arabic, Fruit parlours, Dining bars, Beer restaurants)
Attraction 1: Pedro Tea Factory (Lowest Price: 7611.24)
Lunch: Midky Restaurant (Cuisines: Italian, 