<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/E_Personalized_Itinerary_Generator/model_training_All_cities4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re



# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

# Excel workbooks: the user inputs and the attractions table.
user_inputs, Attractions = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        '/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx',
        '/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx',
    )
)

# CSV files: the hotels and restaurants tables.
Hotels, Restaurants = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv',
        '/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv',
    )
)

# ---------------------------- #
# Step 2: Clean Column Names (Replace Spaces with Underscores)
# ---------------------------- #
def clean_column_names(df):
    """Replace spaces in the column labels of ``df`` with underscores.

    The frame is modified in place and is also returned, so the call can be
    used on the right-hand side of an assignment.

    Labels that are not strings (for example integer labels) are left
    untouched instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is rewritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.columns = [
        label.replace(' ', '_') if isinstance(label, str) else label
        for label in df.columns
    ]
    return df

# Normalise the column labels of every loaded table, in the same order as before.
user_inputs, Hotels, Restaurants, Attractions = map(
    clean_column_names,
    (user_inputs, Hotels, Restaurants, Attractions),
)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Destinations the itinerary generator supports
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Collect every destination token that appears in the user inputs
all_destinations = {
    token
    for entry in user_inputs['destination'].dropna()
    for token in entry.split(', ')
}

# Number the alphabetically sorted tokens from 1, then keep only the allowed
# ones; a code therefore reflects the token's position among *all* tokens seen.
destination_encoder = {}
for code, place in enumerate(sorted(all_destinations), start=1):
    if place in allowed_destinations:
        destination_encoder[place] = code

def encode_destinations(destination_str):
    """Translate a comma-separated destination string into a list of codes.

    Each token is stripped of surrounding whitespace and looked up in the
    module-level ``destination_encoder``; tokens without a code are dropped.
    The order of the returned codes follows the order of the tokens.

    Parameters
    ----------
    destination_str : str or missing value
        Destinations separated by commas (with or without a following space).
        Anything that is not a string, such as NaN or None from an empty
        spreadsheet cell, is treated as "no destinations".

    Returns
    -------
    list of int
        Codes of the recognised destinations; empty when none are recognised.
    """
    if not isinstance(destination_str, str):
        # Empty cells arrive as NaN/None, which have no .split().
        return []

    codes = []
    for token in destination_str.split(','):
        place = token.strip()
        if place in destination_encoder:
            codes.append(destination_encoder[place])
    return codes

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants: gather the distinct cuisine tokens, then
# number them from 1 in alphabetical order.
all_cuisines = {
    token
    for entry in Restaurants['cuisines'].dropna()
    for token in entry.split(', ')
}
cuisine_encoder = {
    cuisine: code
    for code, cuisine in enumerate(sorted(all_cuisines), start=1)
}

# Encode amenities in hotels: same scheme as for the cuisines.
all_amenities = {
    token
    for entry in Hotels['all_amenities'].dropna()
    for token in entry.split(', ')
}
amenity_encoder = {
    amenity: code
    for code, amenity in enumerate(sorted(all_amenities), start=1)
}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    """Map a comma-separated dietary string to a set holding one category code.

    The string is split on ``', '``. If any label is in ``veg_friendly`` the
    result is ``{1}``; otherwise, if any label is in ``non_veg_friendly`` the
    result is ``{3}``. The veg check wins when both kinds of label are present.

    Returns an empty set for a missing value or when no label is recognised.
    """
    if pd.isna(dietary_str):
        return set()

    labels = set(dietary_str.split(', '))
    if not labels.isdisjoint(veg_friendly):
        return {1}  # Veg
    if not labels.isdisjoint(non_veg_friendly):
        return {3}  # Non-Veg
    return set()

def _encode_token_cell(cell, encoder):
    """Translate one comma-separated cell into the set of its encoder codes.

    A missing cell yields an empty set. Every token obtained by splitting on
    ``', '`` is looked up directly in ``encoder``.
    """
    if pd.isna(cell):
        return set()
    return {encoder[token] for token in cell.split(', ')}


# Dietary category codes for each restaurant
Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(_encode_token_cell, args=(cuisine_encoder,))
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(_encode_token_cell, args=(amenity_encoder,))

# Encode user's food preference; values outside the mapping become NaN
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
# Captures the run of letters/spaces that directly precedes "Sri Lanka",
# tolerating an optional numeric postal code and an optional comma in between.
_CITY_BEFORE_COUNTRY = re.compile(r'([A-Za-z ]+)(?:\s+\d+)?,? Sri Lanka')


def extract_city(address):
    """Pull the city name out of an address that ends with "Sri Lanka".

    The city is taken to be the letters-and-spaces run immediately before
    ``Sri Lanka``, optionally followed by a postal code and/or a comma, e.g.
    ``"Passara Road, Ella, Sri Lanka"`` or ``"Passara Road, Ella 90090 Sri Lanka"``.

    Parameters
    ----------
    address : object
        Converted with ``str()`` before matching, so missing values are safe.

    Returns
    -------
    str or None
        The stripped city name, or ``None`` when the pattern does not match
        or the captured text is only whitespace.
    """
    match = _CITY_BEFORE_COUNTRY.search(str(address))
    if match is None:
        return None
    city = match.group(1).strip()
    # A capture made only of spaces is not a city; report "not found" instead of ''.
    return city or None

# Derive each hotel's city from its address; match_best_hotels filters on this column.
Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: Matching Functions
# ---------------------------- #
def match_best_hotels(user_budget, user_destination, hotels_df):
    """Return the hotels located in ``user_destination``, best first.

    Rows are kept when their ``extracted_city`` equals ``user_destination``.
    They are ordered by ``rating`` (highest first), with ties broken by
    ``rankingposition`` (lowest first).

    ``user_budget`` is accepted but is not used by this function.
    """
    in_city_mask = hotels_df['extracted_city'] == user_destination
    candidates = hotels_df[in_city_mask]
    ordered = candidates.sort_values(
        by=['rating', 'rankingposition'],
        ascending=[False, True],
    )
    return ordered

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df):
    """Rank the restaurants of one city by how well they fit the user's tastes.

    A restaurant's ``match_score`` is the number of the user's preferred
    cuisines it offers, plus one if the user's food-preference code appears
    in the restaurant's ``encoded_dietaryrestrictions``.

    Parameters
    ----------
    user_cuisine_pref : str or missing value
        Cuisine names separated by ``', '``. Names unknown to the module-level
        ``cuisine_encoder`` are ignored. A non-string value (NaN/None from an
        empty cell) is treated as "no cuisine preference" instead of raising.
    user_food_pref : object
        Looked up in ``food_preference_encoder``; unknown values fall back to 3.
    user_destination : str
        Must be in ``allowed_destinations``; compared with ``addressobj_city``.
    restaurants_df : pandas.DataFrame
        Needs the columns ``addressobj_city``, ``encoded_cuisines`` and
        ``encoded_dietaryrestrictions``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows with a ``match_score`` column, sorted by
        score in descending order. For a destination outside
        ``allowed_destinations`` a bare empty frame is returned. For an allowed
        destination with no restaurants the result is an empty frame that
        still has the ``match_score`` column.
    """
    if user_destination not in allowed_destinations:
        return pd.DataFrame()

    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()

    if isinstance(user_cuisine_pref, str):
        user_encoded_cuisines = {
            cuisine_encoder[name]
            for name in user_cuisine_pref.split(', ')
            if name in cuisine_encoder
        }
    else:
        # Empty spreadsheet cells arrive as NaN/None, which have no .split().
        user_encoded_cuisines = set()
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)

    # Scores are built as a plain list rather than with DataFrame.apply(axis=1):
    # on a frame with zero rows, apply returns a DataFrame and the column
    # assignment below would fail.
    scores = [
        len(user_encoded_cuisines.intersection(cuisines)) + int(user_encoded_food_pref in dietary)
        for cuisines, dietary in zip(
            restaurants_in_city['encoded_cuisines'],
            restaurants_in_city['encoded_dietaryrestrictions'],
        )
    ]
    restaurants_in_city['match_score'] = pd.Series(
        scores, index=restaurants_in_city.index, dtype='int64'
    )
    return restaurants_in_city.sort_values(by='match_score', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    """Return the attractions of one city that fit the budget, best first.

    ``Lowest_Price`` may hold text such as "No price mentioned", so both the
    prices and the budget are converted with ``pandas.to_numeric`` before
    comparing. Comparing raw values would either raise ``TypeError`` (text
    versus number) or compare strings alphabetically.

    Price filter rules:

    * a numeric price is kept when it is ``<= user_budget``;
    * a recorded but non-numeric price (unknown cost) is kept;
    * a missing price (NaN/None) is dropped;
    * if ``user_budget`` itself is not numeric, no price filter is applied.

    Parameters
    ----------
    user_budget : number or str
        Budget to compare against ``Lowest_Price``.
    user_destination : str
        Must be in the module-level ``allowed_destinations``; compared with
        the ``City`` column.
    attractions_df : pandas.DataFrame
        Needs the columns ``City``, ``Lowest_Price``, ``Rating`` and
        ``Ranking_Position``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Matching rows ordered by ``Rating`` (highest first), then
        ``Ranking_Position`` (lowest first). An empty frame when the
        destination is not allowed.
    """
    if user_destination not in allowed_destinations:
        return pd.DataFrame()

    in_city = attractions_df['City'] == user_destination

    raw_prices = attractions_df['Lowest_Price']
    numeric_prices = pd.to_numeric(raw_prices, errors='coerce')
    # Something is recorded but it is not a number, e.g. "No price mentioned".
    unpriced = numeric_prices.isna() & raw_prices.notna()

    # Going through a one-element Series makes None and text budgets coerce to NaN.
    budget = pd.to_numeric(pd.Series([user_budget]), errors='coerce').iloc[0]
    if pd.isna(budget):
        affordable = pd.Series(True, index=attractions_df.index)
    else:
        affordable = (numeric_prices <= budget) | unpriced

    attractions_in_city = attractions_df[in_city & affordable]
    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])

# ---------------------------- #
# Step 6: Generate Itinerary
# ---------------------------- #

def _nth_row(ranked_df, position):
    """Return row ``position`` of ``ranked_df``, wrapping around past the end."""
    return ranked_df.iloc[position % len(ranked_df)]


# Print a day-by-day itinerary for every user. A user visits the first two
# listed destinations when staying four or more days, otherwise only the first.
for index, user in user_inputs.iterrows():
    raw_destinations = user['destination']
    raw_days = user['number_of_days']

    # Empty spreadsheet cells arrive as NaN; without a destination and a day
    # count there is nothing to plan.
    if not isinstance(raw_destinations, str) or pd.isna(raw_days):
        print(f"\n===== User {index + 1}: {user['name']} =====")
        print("Skipping user: destination or number of days is missing.")
        continue

    destinations = raw_destinations.split(', ')
    num_days = int(raw_days)  # range() needs an int even if the cell was read as a float
    selected_destinations = destinations[:2] if num_days >= 4 else [destinations[0]]

    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {', '.join(selected_destinations)}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    # The recommendations depend only on the user and the destination, so
    # they are computed once per destination instead of once per day.
    recommendations_by_destination = {}

    for day in range(num_days):
        # Two days in the first destination, the remaining days in the last one.
        current_destination = selected_destinations[min(day // 2, len(selected_destinations) - 1)]

        if current_destination not in recommendations_by_destination:
            recommendations_by_destination[current_destination] = (
                match_best_hotels(user['budget_per_day'], current_destination, Hotels),
                match_best_restaurants(user['cuisine_preference'], user['food_preference'], current_destination, Restaurants),
                match_best_attractions(user['budget_per_day'], current_destination, Attractions),
            )
        recommended_hotels, recommended_restaurants, recommended_attractions = (
            recommendations_by_destination[current_destination]
        )

        # Any empty list would make the wrap-around indexing below divide by
        # zero, so the day is skipped with a message instead.
        if recommended_hotels.empty:
            print("No hotels found for the selected destination and budget.")
            continue  # Skip this day; later days may be in another destination
        if recommended_restaurants.empty:
            print("No restaurants found for the selected destination.")
            continue
        if recommended_attractions.empty:
            print("No attractions found for the selected destination and budget.")
            continue

        # Change the hotel every 3 days
        hotel = _nth_row(recommended_hotels, day // 3)

        # Rotate through the ranked lists so consecutive slots differ when possible.
        breakfast_restaurant = _nth_row(recommended_restaurants, day)
        attraction_1 = _nth_row(recommended_attractions, day)
        lunch_restaurant = _nth_row(recommended_restaurants, day + 1)
        attraction_2 = _nth_row(recommended_attractions, day + 1)
        dinner_restaurant = _nth_row(recommended_restaurants, day + 2)

        print(f"\nDay {day + 1} - {current_destination}")
        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']})")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']})")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']})")
        print("Returning to Hotel\n")
        print("-" * 40)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Attraction 2: Honey Bee Garden (Lowest Price: No price mentioned)
Dinner: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Returning to Hotel

----------------------------------------

Day 4 - Ella
Hotel: La Ella Breeze (Rating: 4.0)
Breakfast: The Pastry House (Cuisines: Italian, Chinese, Indian, Asian, Sri Lankan)
Attraction 1: Honey Bee Garden (Lowest Price: No price mentioned)
Lunch: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Attraction 2: Priya Cookery Class (Lowest Price: No price mentioned)
Dinner: The True Food Restaurant Ella (Cuisines: Chinese, Indian, Asian, Sri Lankan)
Returning to Hotel

----------------------------------------

Day 5 - Ella
Hotel: La Ella Breeze (Rating: 4.0)
Breakfast: Tip Top Sky Bar Ella (Cuisines: American, Indian, Asian, Sri Lankan)
Attraction 1: Priya Cookery Class (Lowest Price: No price mentioned)
Lunch: The True Food Restaurant Ella (Cui