<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/E_Personalized_Itinerary_Generator/model_training_all_cities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re



drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load datasets
user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/preprocessed_user_inputs.xlsx')
colombo_hotels = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx')
colombo_restaurants = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx')
colombo_attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx')

# Function to clean budget and get median value
def extract_median_budget(budget):
    numbers = re.findall(r'\d+', str(budget).replace(',', ''))
    if len(numbers) == 2:
        return (int(numbers[0]) + int(numbers[1])) / 2
    return np.nan

user_inputs['budget_per_day'] = user_inputs['budget_per_day'].apply(extract_median_budget)

# Preprocess categorical and text-based features
def process_list_columns(df, column_name):
    return df[column_name].fillna('').apply(lambda x: x.replace(',', ' '))

user_inputs['cuisine_preference'] = process_list_columns(user_inputs, 'cuisine_preference')
user_inputs['activities_preference'] = process_list_columns(user_inputs, 'activities_preference')
colombo_hotels['all_amenities'] = process_list_columns(colombo_hotels, 'all_amenities')
colombo_restaurants['cuisines'] = process_list_columns(colombo_restaurants, 'cuisines')
colombo_restaurants['features'] = process_list_columns(colombo_restaurants, 'features')
colombo_restaurants['mealTypes'] = process_list_columns(colombo_restaurants, 'mealTypes')
colombo_attractions['subcategories'] = process_list_columns(colombo_attractions, 'subcategories')

# Compute TF-IDF similarity for content-based filtering
def compute_similarity(df, column):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[column].fillna(''))
    return cosine_similarity(tfidf_matrix)

hotel_similarity = compute_similarity(colombo_hotels, 'all_amenities')
restaurant_similarity = compute_similarity(colombo_restaurants, 'cuisines')
attraction_similarity = compute_similarity(colombo_attractions, 'subcategories')

# Collaborative filtering using rating similarity
hotel_rating_sim = cosine_similarity(colombo_hotels[['rating']].fillna(0))
restaurant_rating_sim = cosine_similarity(colombo_restaurants[['rating']].fillna(0))
attraction_rating_sim = cosine_similarity(colombo_attractions[['rating']].fillna(0))

# Hybrid recommendation function
def hybrid_recommend(df, similarity_matrix, rating_sim_matrix, user_preference, top_n=5):
    final_score = (0.5 * similarity_matrix) + (0.5 * rating_sim_matrix)
    recommendations = np.argsort(-final_score, axis=1)[:, :top_n]
    filtered_recommendations = []
    for i in range(len(df)):
        filtered = df.iloc[recommendations[i]]
        if 'cuisines' in df.columns and user_preference:
            filtered = filtered[filtered['cuisines'].str.contains(user_preference, case=False, na=False)]
        if filtered.empty:
            filtered = df.iloc[recommendations[i]]  # Fallback to original recommendations
        filtered_recommendations.append(filtered)
    return filtered_recommendations

# Generate itinerary for all users
for index, user in user_inputs.iterrows():
    print(f"\nUser {index + 1}: {user['name']}")
    print(f"Destination: {user['destination']}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Activity Preference: {user['activities_preference']}")

    if user['destination'].strip().lower() != 'colombo':
        print("Information not available for the selected destination.")
        continue

    recommended_hotels = hybrid_recommend(colombo_hotels, hotel_similarity, hotel_rating_sim, '')
    recommended_restaurants = hybrid_recommend(colombo_restaurants, restaurant_similarity, restaurant_rating_sim, user['cuisine_preference'])
    recommended_attractions = hybrid_recommend(colombo_attractions, attraction_similarity, attraction_rating_sim, user['activities_preference'])

    itinerary = []
    hotel_index = 0
    hotel_change_interval = 3
    for day in range(6):  # Assuming a 6-day trip
        if day % hotel_change_interval == 0:
            hotel_index = min(day // hotel_change_interval, len(recommended_hotels) - 1)
        hotel = recommended_hotels[hotel_index].iloc[0]
        breakfast = recommended_restaurants[day % len(recommended_restaurants)].iloc[0] if not recommended_restaurants[day % len(recommended_restaurants)].empty else colombo_restaurants.iloc[0]
        attraction_1 = recommended_attractions[day % len(recommended_attractions)].iloc[0]
        lunch = recommended_restaurants[(day + 1) % len(recommended_restaurants)].iloc[0]
        attraction_2 = recommended_attractions[(day + 1) % len(recommended_attractions)].iloc[0]
        dinner = recommended_restaurants[(day + 2) % len(recommended_restaurants)].iloc[0]

        itinerary.append({
            'Day': day + 1,
            'Hotel': hotel['name'],
            'Hotel Rating': hotel['rating'],
            'Breakfast': breakfast['name'],
            'Breakfast Cuisines': breakfast['cuisines'],
            'Attraction 1': attraction_1['name'],
            'Attraction 1 Type': attraction_1['subcategories'],
            'Lunch': lunch['name'],
            'Lunch Cuisines': lunch['cuisines'],
            'Attraction 2': attraction_2['name'],
            'Attraction 2 Type': attraction_2['subcategories'],
            'Dinner': dinner['name'],
            'Dinner Cuisines': dinner['cuisines'],
        })

    for day in itinerary:
        print(f"\nDay {day['Day']}")
        print(f"Hotel: {day['Hotel']} (Rating: {day['Hotel Rating']})")
        print(f"Breakfast: {day['Breakfast']} - Cuisines: {day['Breakfast Cuisines']}")
        print(f"Attraction 1: {day['Attraction 1']} - Type: {day['Attraction 1 Type']}")
        print(f"Lunch: {day['Lunch']} - Cuisines: {day['Lunch Cuisines']}")
        print(f"Attraction 2: {day['Attraction 2']} - Type: {day['Attraction 2 Type']}")
        print(f"Dinner: {day['Dinner']} - Cuisines: {day['Dinner Cuisines']}")




User 1: Mathusha 
Destination: Ella
Cuisine Preference: Sri Lankan  Indian  Italian
Activity Preference: Nature Trails  Cultural Experiences  Adventurous  Shopping  Religious
Information not available for the selected destination.

User 2: nielia
Destination: Nuwara Eliya
Cuisine Preference: Sri Lankan
Activity Preference: Nature Trails  Cultural Experiences  Adventurous  Shopping
Information not available for the selected destination.

User 3: Ehansa
Destination: Ella
Cuisine Preference: Italian  Western
Activity Preference: Adventurous  Shopping  Spa and Wellness
Information not available for the selected destination.

User 4: Oshini
Destination: Ella
Cuisine Preference: Sri Lankan  Indian  Chinese  Western
Activity Preference: Historical Sites  Nature Trails  Shopping  Wildlife  Religious
Information not available for the selected destination.

User 5: Umar
Destination: Dambulla
Cuisine Preference: Chinese  Italian  Western
Activity Preference: Nature Trails  Adventurous  Wildlife 

In [None]:
from sklearn.cluster import KMeans

# Load Datasets
user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/preprocessed_user_inputs.xlsx')
colombo_hotels = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx')
colombo_restaurants = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx')
colombo_attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx')


# Data Preprocessing
def extract_median_budget(budget):
    numbers = re.findall(r'\d+', str(budget).replace(',', ''))
    if len(numbers) == 2:
        return (int(numbers[0]) + int(numbers[1])) / 2
    return np.nan

user_inputs['budget_per_day'] = user_inputs['budget_per_day'].apply(extract_median_budget)

def process_list_columns(df, column_name):
    return df[column_name].fillna('').apply(lambda x: x.replace(',', ' '))

user_inputs['cuisine_preference'] = process_list_columns(user_inputs, 'cuisine_preference')
user_inputs['activities_preference'] = process_list_columns(user_inputs, 'activities_preference')
colombo_hotels['all_amenities'] = process_list_columns(colombo_hotels, 'all_amenities')
colombo_restaurants['cuisines'] = process_list_columns(colombo_restaurants, 'cuisines')
colombo_restaurants['features'] = process_list_columns(colombo_restaurants, 'features')
colombo_restaurants['mealTypes'] = process_list_columns(colombo_restaurants, 'mealTypes')
colombo_attractions['subcategories'] = process_list_columns(colombo_attractions, 'subcategories')

# Compute Similarity
def compute_similarity(df, column):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[column].fillna(''))
    return cosine_similarity(tfidf_matrix)

hotel_similarity = compute_similarity(colombo_hotels, 'all_amenities')
restaurant_similarity = compute_similarity(colombo_restaurants, 'cuisines')
attraction_similarity = compute_similarity(colombo_attractions, 'subcategories')

hotel_rating_sim = cosine_similarity(colombo_hotels[['rating']].fillna(0))
restaurant_rating_sim = cosine_similarity(colombo_restaurants[['rating']].fillna(0))
attraction_rating_sim = cosine_similarity(colombo_attractions[['rating']].fillna(0))

# Hybrid Recommendation
import re

def hybrid_recommend(df, similarity_matrix, rating_sim_matrix, user_preference, top_n=5):
    final_score = (0.5 * similarity_matrix) + (0.5 * rating_sim_matrix)
    recommendations = np.argsort(-final_score, axis=1)[:, :top_n]
    filtered_recommendations = []

    for i in range(len(df)):
        filtered = df.iloc[recommendations[i]]
        if 'cuisines' in df.columns and user_preference:
            user_cuisines = re.split(r'\s+', user_preference.strip().lower())
            filtered = filtered[filtered['cuisines'].apply(lambda x: any(cuisine in x.lower().split() for cuisine in user_cuisines))]
        if filtered.empty:
            filtered = df.iloc[recommendations[i]]
        filtered_recommendations.append(filtered)

    return filtered_recommendations

# KMeans Clustering with 'address'
colombo_attractions['address_processed'] = colombo_attractions['address'].str.replace(r'[^a-zA-Z ]', '', regex=True)
vectorizer = TfidfVectorizer()
address_matrix = vectorizer.fit_transform(colombo_attractions['address_processed'])

num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
colombo_attractions['cluster'] = kmeans.fit_predict(address_matrix)

# Knapsack Algorithm
def knapsack(items, max_time):
    items = sorted(items, key=lambda x: x[1] / x[2], reverse=True)
    selected_items = []
    total_time = 0
    for item in items:
        if total_time + item[2] <= max_time:
            selected_items.append(item)
            total_time += item[2]
    return selected_items

# Generate Itinerary
for index, user in user_inputs.iterrows():
    print(f"\nUser {index + 1}: {user['name']}")
    print(f"Destination: {user['destination']}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Activity Preference: {user['activities_preference']}")

    if user['destination'].strip().lower() != 'colombo':
        print("Information not available for the selected destination.")
        continue

    recommended_hotels = hybrid_recommend(colombo_hotels, hotel_similarity, hotel_rating_sim, '')
    recommended_restaurants = hybrid_recommend(colombo_restaurants, restaurant_similarity, restaurant_rating_sim, user['cuisine_preference'])
    recommended_attractions = hybrid_recommend(colombo_attractions, attraction_similarity, attraction_rating_sim, user['activities_preference'])

    itinerary = []
    hotel_index = 0
    hotel_change_interval = 3
    for day in range(6):
        if day % hotel_change_interval == 0:
            hotel_index = min(day // hotel_change_interval, len(recommended_hotels) - 1)
        hotel = recommended_hotels[hotel_index].iloc[0]
        breakfast = recommended_restaurants[day % len(recommended_restaurants)].iloc[0]

        available_attractions = [(row['name'], row['rating'], row['duration_hours']) for _, row in recommended_attractions[day % len(recommended_attractions)].iterrows()]
        selected_attractions = knapsack(available_attractions, max_time=7)

        lunch = recommended_restaurants[(day + 1) % len(recommended_restaurants)].iloc[0]
        dinner = recommended_restaurants[(day + 2) % len(recommended_restaurants)].iloc[0]

        itinerary.append({
            'Day': day + 1,
            'Hotel': hotel['name'],
            'Hotel Rating': hotel['rating'],
            'Breakfast': breakfast['name'],
            'Attractions': selected_attractions,
            'Lunch': lunch['name'],
            'Dinner': dinner['name'],
        })

    for day in itinerary:
        print(f"\nDay {day['Day']}")
        print(f"Hotel: {day['Hotel']} (Rating: {day['Hotel Rating']})")
        print(f"Breakfast: {day['Breakfast']}")
        print("Attractions:")
        for attraction in day['Attractions']:
            print(f"- {attraction[0]} (Rating: {attraction[1]}, Duration: {attraction[2]} hrs)")
        print(f"Lunch: {day['Lunch']}")
        print(f"Dinner: {day['Dinner']}")



User 1: Mathusha 
Destination: Ella
Cuisine Preference: Sri Lankan  Indian  Italian
Activity Preference: Nature Trails  Cultural Experiences  Adventurous  Shopping  Religious
Information not available for the selected destination.

User 2: nielia
Destination: Nuwara Eliya
Cuisine Preference: Sri Lankan
Activity Preference: Nature Trails  Cultural Experiences  Adventurous  Shopping
Information not available for the selected destination.

User 3: Ehansa
Destination: Ella
Cuisine Preference: Italian  Western
Activity Preference: Adventurous  Shopping  Spa and Wellness
Information not available for the selected destination.

User 4: Oshini
Destination: Ella
Cuisine Preference: Sri Lankan  Indian  Chinese  Western
Activity Preference: Historical Sites  Nature Trails  Shopping  Wildlife  Religious
Information not available for the selected destination.

User 5: Umar
Destination: Dambulla
Cuisine Preference: Chinese  Italian  Western
Activity Preference: Nature Trails  Adventurous  Wildlife 

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

# Load Datasets
user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/preprocessed_user_inputs.xlsx')
colombo_hotels = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx')
colombo_restaurants = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx')
colombo_attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx')

# ---------------------------- #
# Data Preprocessing
# ---------------------------- #
def extract_median_budget(budget):
    numbers = re.findall(r'\d+', str(budget).replace(',', ''))
    if len(numbers) == 2:
        return (int(numbers[0]) + int(numbers[1])) / 2
    return np.nan

user_inputs['budget_per_day'] = user_inputs['budget_per_day'].apply(extract_median_budget)

# ---------------------------- #
# Encode List-Based Features
# ---------------------------- #
def encode_column(df, column_name):
    """Encodes a list-based categorical column into numerical labels."""
    unique_values = set()
    df[column_name].fillna('').apply(lambda x: unique_values.update(x.split(', ')))

    encoder = LabelEncoder()
    encoder.fit(list(unique_values))

    return df[column_name].fillna('').apply(lambda x: [encoder.transform([item])[0] for item in x.split(', ') if item in encoder.classes_])

# Apply encoding to relevant columns
colombo_hotels['encoded_amenities'] = encode_column(colombo_hotels, 'all_amenities')
colombo_restaurants['encoded_dietaryRestrictions'] = encode_column(colombo_restaurants, 'dietaryRestrictions')
colombo_restaurants['encoded_mealTypes'] = encode_column(colombo_restaurants, 'mealTypes')
colombo_restaurants['encoded_features'] = encode_column(colombo_restaurants, 'features')
colombo_restaurants['encoded_cuisines'] = encode_column(colombo_restaurants, 'cuisines')
colombo_attractions['encoded_subcategories'] = encode_column(colombo_attractions, 'subcategories')
user_inputs['encoded_cuisine_preference'] = encode_column(user_inputs, 'cuisine_preference')
user_inputs['encoded_activities_preference'] = encode_column(user_inputs, 'activities_preference')

# ---------------------------- #
# Compute Similarity
# ---------------------------- #
def compute_similarity(df, column):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[column].fillna(''))
    return cosine_similarity(tfidf_matrix)

hotel_similarity = compute_similarity(colombo_hotels, 'all_amenities')
restaurant_similarity = compute_similarity(colombo_restaurants, 'cuisines')
attraction_similarity = compute_similarity(colombo_attractions, 'subcategories')

hotel_rating_sim = cosine_similarity(colombo_hotels[['rating']].fillna(0))
restaurant_rating_sim = cosine_similarity(colombo_restaurants[['rating']].fillna(0))
attraction_rating_sim = cosine_similarity(colombo_attractions[['rating']].fillna(0))

# ---------------------------- #
# Hybrid Recommendation
# ---------------------------- #
def hybrid_recommend(df, similarity_matrix, rating_sim_matrix, user_preference, top_n=5):
    final_score = (0.5 * similarity_matrix) + (0.5 * rating_sim_matrix)
    recommendations = np.argsort(-final_score, axis=1)[:, :top_n]
    filtered_recommendations = []

    for i in range(len(df)):
        filtered = df.iloc[recommendations[i]]
        if 'cuisines' in df.columns and user_preference:
            user_cuisines = re.split(r'\s+', user_preference.strip().lower())
            filtered = filtered[filtered['cuisines'].apply(lambda x: any(cuisine in x.lower().split() for cuisine in user_cuisines))]
        if filtered.empty:
            filtered = df.iloc[recommendations[i]]
        filtered_recommendations.append(filtered)

    return filtered_recommendations

# ---------------------------- #
# KMeans Clustering with 'address'
# ---------------------------- #
colombo_attractions['address_processed'] = colombo_attractions['address'].str.replace(r'[^a-zA-Z ]', '', regex=True)
vectorizer = TfidfVectorizer()
address_matrix = vectorizer.fit_transform(colombo_attractions['address_processed'])

num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
colombo_attractions['cluster'] = kmeans.fit_predict(address_matrix)

# ---------------------------- #
# Knapsack Algorithm
# ---------------------------- #
def knapsack(items, max_time):
    items = sorted(items, key=lambda x: x[1] / x[2], reverse=True)
    selected_items = []
    total_time = 0
    for item in items:
        if total_time + item[2] <= max_time:
            selected_items.append(item)
            total_time += item[2]
    return selected_items

# ---------------------------- #
# Generate Itinerary
# ---------------------------- #
for index, user in user_inputs.iterrows():
    print(f"\nUser {index + 1}: {user['name']}")
    print(f"Destination: {user['destination']}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Activity Preference: {user['activities_preference']}")

    if user['destination'].strip().lower() != 'colombo':
        print("Information not available for the selected destination.")
        continue

    recommended_hotels = hybrid_recommend(colombo_hotels, hotel_similarity, hotel_rating_sim, '')
    recommended_restaurants = hybrid_recommend(colombo_restaurants, restaurant_similarity, restaurant_rating_sim, user['cuisine_preference'])
    recommended_attractions = hybrid_recommend(colombo_attractions, attraction_similarity, attraction_rating_sim, user['activities_preference'])

    itinerary = []
    hotel_index = 0
    hotel_change_interval = 3
    for day in range(6):
        if day % hotel_change_interval == 0:
            hotel_index = min(day // hotel_change_interval, len(recommended_hotels) - 1)
        hotel = recommended_hotels[hotel_index].iloc[0]

        breakfast = recommended_restaurants[day % len(recommended_restaurants)].iloc[0]
        lunch = recommended_restaurants[(day + 1) % len(recommended_restaurants)].iloc[0]
        dinner = recommended_restaurants[(day + 2) % len(recommended_restaurants)].iloc[0]

        available_attractions = [(row['name'], row['rating'], row['duration_hours']) for _, row in recommended_attractions[day % len(recommended_attractions)].iterrows()]
        selected_attractions = knapsack(available_attractions, max_time=7)

        itinerary.append({
            'Day': day + 1,
            'Hotel': hotel['name'],
            'Hotel Rating': hotel['rating'],
            'Breakfast': (breakfast['name'], breakfast['cuisines']),
            'Attractions': selected_attractions,
            'Lunch': (lunch['name'], lunch['cuisines']),
            'Dinner': (dinner['name'], dinner['cuisines']),
        })

    # Print the itinerary
    for day in itinerary:
        print(f"\nDay {day['Day']}")
        print(f"Hotel: {day['Hotel']} (Rating: {day['Hotel Rating']})")

        print(f"Breakfast: {day['Breakfast'][0]} | Cuisines: {day['Breakfast'][1]}")
        print("Attractions:")
        for attraction in day['Attractions']:
            print(f"- {attraction[0]} (Rating: {attraction[1]}, Duration: {attraction[2]} hrs)")
        print(f"Lunch: {day['Lunch'][0]} | Cuisines: {day['Lunch'][1]}")
        print(f"Dinner: {day['Dinner'][0]} | Cuisines: {day['Dinner'][1]}")



User 1: Mathusha 
Destination: Ella
Cuisine Preference: Sri Lankan, Indian, Italian
Activity Preference: Nature Trails, Cultural Experiences, Adventurous, Shopping, Religious
Information not available for the selected destination.

User 2: nielia
Destination: Nuwara Eliya
Cuisine Preference: Sri Lankan
Activity Preference: Nature Trails, Cultural Experiences, Adventurous, Shopping
Information not available for the selected destination.

User 3: Ehansa
Destination: Ella
Cuisine Preference: Italian, Western
Activity Preference: Adventurous, Shopping, Spa and Wellness
Information not available for the selected destination.

User 4: Oshini
Destination: Ella
Cuisine Preference: Sri Lankan, Indian, Chinese, Western
Activity Preference: Historical Sites, Nature Trails, Shopping, Wildlife, Religious
Information not available for the selected destination.

User 5: Umar
Destination: Dambulla
Cuisine Preference: Chinese, Italian, Western
Activity Preference: Nature Trails, Adventurous, Wildlife,

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #
user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/preprocessed_user_inputs.xlsx')
colombo_hotels = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboHotels_processed.xlsx')
colombo_restaurants = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colomboRestaurants_processed.xlsx')
colombo_attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/colombo_attractions_processed.xlsx')

# ---------------------------- #
# Step 2: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Encode destinations
all_destinations = set()
for dest in user_inputs['destination'].dropna():
    all_destinations.update(dest.split(', '))
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(all_destinations))}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip()) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in colombo_restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in colombo_hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
all_dietary_restrictions = set()
for dietary_restriction in colombo_restaurants['dietaryRestrictions'].dropna():
    all_dietary_restrictions.update(dietary_restriction.split(', '))
dietary_encoder = {diet: idx + 1 for idx, diet in enumerate(sorted(all_dietary_restrictions))}

# Apply encoding
colombo_restaurants['encoded_cuisines'] = colombo_restaurants['cuisines'].apply(lambda x: {cuisine_encoder[c] for c in x.split(', ')} if pd.notna(x) else set())
colombo_hotels['encoded_amenities'] = colombo_hotels['all_amenities'].apply(lambda x: {amenity_encoder[a] for a in x.split(', ')} if pd.notna(x) else set())
colombo_restaurants['encoded_dietaryRestrictions'] = colombo_restaurants['dietaryRestrictions'].apply(lambda x: {dietary_encoder[d] for d in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 3: Hotel and Restaurant Matching
# ---------------------------- #

def encode_budget(budget_str):
    if pd.isna(budget_str):
        return None
    budget_str = budget_str.replace('Rs.', '').replace(',', '')
    budget_range = budget_str.split('-')
    if len(budget_range) == 1:
        return (float(budget_range[0].replace('+', '').strip()), float('inf'))
    else:
        return (float(budget_range[0].strip()), float(budget_range[1].strip()))

def match_best_hotels(user_budget, hotels_df):
    encoded_budget = encode_budget(user_budget)
    if not encoded_budget:
        return hotels_df.sort_values(by=['rating', 'rankingPosition', 'rankingDenominator'], ascending=[False, True, True])
    min_budget, max_budget = encoded_budget
    hotels_df['priceRange_LKR'] = hotels_df['priceRange_LKR'].astype(str)
    hotels_df['min_price'] = hotels_df['priceRange_LKR'].apply(lambda x: float(x.split('-')[0].replace('Rs.', '')) if '-' in x else float(x.replace('Rs.', '').replace('+', '')))
    hotels_df['max_price'] = hotels_df['priceRange_LKR'].apply(lambda x: float(x.split('-')[1].replace('Rs.', '')) if '-' in x else float('inf'))
    budget_hotels = hotels_df[(hotels_df['min_price'] >= min_budget) & (hotels_df['max_price'] <= max_budget)]
    return budget_hotels.sort_values(by=['rating', 'rankingPosition', 'rankingDenominator'], ascending=[False, True, True]) if not budget_hotels.empty else hotels_df

def match_best_restaurants(user_cuisine_pref, user_food_pref, restaurants_df):
    user_encoded_cuisines = {cuisine_encoder.get(c, 0) for c in user_cuisine_pref.split(', ') if c in cuisine_encoder}
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)
    restaurants_df['match_score'] = restaurants_df.apply(lambda row: len(user_encoded_cuisines.intersection(row['encoded_cuisines'])) + int(user_encoded_food_pref in row['encoded_dietaryRestrictions']), axis=1)
    return restaurants_df.sort_values(by='match_score', ascending=False)

def match_best_attractions(attractions_df):
    return attractions_df.sort_values(by=['rankingPosition', 'rankingDenominator'], ascending=[True, False])

# ---------------------------- #
# Step 4: Generate Itinerary
# ---------------------------- #

for index, user in user_inputs.iterrows():
    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {user['destination']}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")

    if 'Colombo' not in user['destination']:
        print("WARNING: Information not available for the selected destination.")
        continue

    # Recommend hotels
    recommended_hotels = match_best_hotels(user['budget_per_day'], colombo_hotels)
    if recommended_hotels.empty:
        print("WARNING: No hotels found within budget.")
        continue

    # Recommend restaurants
    recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], colombo_restaurants)

    # Recommend attractions
    recommended_attractions = match_best_attractions(colombo_attractions)

    print("\nGenerated Itinerary:\n")

    for day in range(6):
        print(f"Day {day + 1}")

        # Select hotel (change every 3 days)
        hotel = recommended_hotels.iloc[min(day // 3, len(recommended_hotels) - 1)]
        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})")

        # Select restaurants for the day
        breakfast = recommended_restaurants.iloc[day % len(recommended_restaurants)]
        lunch = recommended_restaurants.iloc[(day + 1) % len(recommended_restaurants)]
        dinner = recommended_restaurants.iloc[(day + 2) % len(recommended_restaurants)]

        # Select attractions
        available_attractions = recommended_attractions.copy()
        morning_activities = []
        afternoon_activities = []

        morning_time = 4  # 9:45 AM to 1:30 PM (4 hours available)
        afternoon_time = 4.5  # 3:00 PM to 7:30 PM (4.5 hours available)

        for _, attraction in available_attractions.iterrows():
            duration = attraction['duration_hours']
            if morning_time >= duration:
                morning_activities.append(attraction)
                morning_time -= duration
            elif afternoon_time >= duration:
                afternoon_activities.append(attraction)
                afternoon_time -= duration

        def print_meal(meal_name, restaurant):
            dietary = restaurant['dietaryRestrictions'] if restaurant['dietaryRestrictions'] != 'unknown' else 'Non-Veg'
            print(f"{meal_name}: {restaurant['name']} ({restaurant['cuisines']}) | Dietary: {dietary}")

        print_meal("Breakfast (8:30 AM - 9:30 AM)", breakfast)

        print("Morning Attractions (9:45 AM - 1:30 PM):")
        for attraction in morning_activities:
            print(f"- {attraction['name']} (Duration: {attraction['duration_hours']} hours)")

        print_meal("Lunch (1:45 PM - 2:45 PM)", lunch)

        print("Afternoon Attractions (3:00 PM - 7:30 PM):")
        for attraction in afternoon_activities:
            print(f"- {attraction['name']} (Duration: {attraction['duration_hours']} hours)")

        print_meal("Dinner (7:45 PM - 8:45 PM)", dinner)

        print("Returning to Hotel\n")
        print("-" * 40)  # Separator for clarity


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Afternoon Attractions (3:00 PM - 7:30 PM):
- Sri Lanka Driver Hire (Duration: 3 hours)
Dinner (7:45 PM - 8:45 PM): AYU (Indian, European, Asian, Contemporary, Fusion, Sri Lankan) | Dietary: Vegetarian friendly, Vegan options, Gluten free options
Returning to Hotel

----------------------------------------
Day 4
Hotel: Ivy Lane Colombo (Rating: 4.0)
Breakfast (8:30 AM - 9:30 AM): Arcadia Cafe and restaurant (Italian, Chinese, American, Indian, Asian, Sri Lankan) | Dietary: Vegetarian friendly
Morning Attractions (9:45 AM - 1:30 PM):
- Blue Lanka Tours (Duration: 4 hours)
Lunch (1:45 PM - 2:45 PM): AYU (Indian, European, Asian, Contemporary, Fusion, Sri Lankan) | Dietary: Vegetarian friendly, Vegan options, Gluten free options
Afternoon Attractions (3:00 PM - 7:30 PM):
- Sri Lanka Driver Hire (Duration: 3 hours)
Dinner (7:45 PM - 8:45 PM): Momo (Chinese, Indian, Seafood, Asian, Diner, Sri Lankan) | Dietary: Vegetarian frien

In [18]:
# ---------------------------- #
# Step 1: Load Datasets
# ---------------------------- #

user_inputs = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/user_inputs_preprocessed.xlsx')
Hotels = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/ProcessedHotels.csv')
Restaurants = pd.read_csv('/content/drive/MyDrive/DSGP_ME/MergedDatasets/PreprocessedRestaurants.csv')
Attractions = pd.read_excel('/content/drive/MyDrive/DSGP_ME/MergedDatasets/attractions_preprocessed.xlsx')

# ---------------------------- #
# Step 2: Clean Column Names (Remove Spaces)
# ---------------------------- #
def clean_column_names(df):
    df.columns = [col.replace(' ', '_') for col in df.columns]
    return df

user_inputs = clean_column_names(user_inputs)
Hotels = clean_column_names(Hotels)
Restaurants = clean_column_names(Restaurants)
Attractions = clean_column_names(Attractions)

# ---------------------------- #
# Step 3: Encoding Destinations, Cuisines, Amenities, and Dietary Preferences
# ---------------------------- #

# Allowed destinations
allowed_destinations = {'Kandy', 'Ella', 'Colombo', 'Nuwara Eliya'}

# Encode destinations
all_destinations = set()
for dest in user_inputs['destination'].dropna():
    all_destinations.update(dest.split(', '))
destination_encoder = {dest: idx + 1 for idx, dest in enumerate(sorted(all_destinations)) if dest in allowed_destinations}

def encode_destinations(destination_str):
    return [destination_encoder.get(dest.strip()) for dest in destination_str.split(', ') if dest.strip() in destination_encoder]

user_inputs['encoded_destination'] = user_inputs['destination'].apply(encode_destinations)

# Encode cuisines in restaurants
all_cuisines = set()
for cuisines in Restaurants['cuisines'].dropna():
    all_cuisines.update(cuisines.split(', '))
cuisine_encoder = {cuisine: idx + 1 for idx, cuisine in enumerate(sorted(all_cuisines))}

# Encode amenities in hotels
all_amenities = set()
for amenities in Hotels['all_amenities'].dropna():
    all_amenities.update(amenities.split(', '))
amenity_encoder = {amenity: idx + 1 for idx, amenity in enumerate(sorted(all_amenities))}

# Encode dietary restrictions in restaurants
veg_friendly = {'Vegetarian friendly', 'Vegan options', 'Halal', 'Gluten free options'}
non_veg_friendly = {'No Special Dietary'}

def encode_dietary_restrictions(dietary_str):
    if pd.isna(dietary_str):
        return set()
    restrictions = set(dietary_str.split(', '))
    if restrictions & veg_friendly:
        return {1}  # Veg
    elif restrictions & non_veg_friendly:
        return {3}  # Non-Veg
    return set()

Restaurants['encoded_dietaryrestrictions'] = Restaurants['dietaryrestrictions'].apply(encode_dietary_restrictions)

# Apply encoding
Restaurants['encoded_cuisines'] = Restaurants['cuisines'].apply(lambda x: {cuisine_encoder[c] for c in x.split(', ')} if pd.notna(x) else set())
Hotels['encoded_amenities'] = Hotels['all_amenities'].apply(lambda x: {amenity_encoder[a] for a in x.split(', ')} if pd.notna(x) else set())

# Encode user's food preference
food_preference_encoder = {'Veg': 1, 'Non-Veg': 3}
user_inputs['encoded_food_preference'] = user_inputs['food_preference'].map(food_preference_encoder)

# ---------------------------- #
# Step 4: Extract City Information
# ---------------------------- #
def extract_city(address):
    match = re.search(r'([A-Za-z ]+),? Sri Lanka', str(address))
    return match.group(1).strip() if match else None

Hotels['extracted_city'] = Hotels['address'].apply(extract_city)

# ---------------------------- #
# Step 5: Matching Functions
# ---------------------------- #
def match_best_hotels(user_budget, user_destination, hotels_df):
    hotels_in_city = hotels_df[hotels_df['extracted_city'] == user_destination]
    return hotels_in_city.sort_values(by=['rating', 'rankingposition'], ascending=[False, True])

def match_best_restaurants(user_cuisine_pref, user_food_pref, user_destination, restaurants_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    restaurants_in_city = restaurants_df[restaurants_df['addressobj_city'] == user_destination].copy()
    user_encoded_cuisines = {cuisine_encoder.get(c, 0) for c in user_cuisine_pref.split(', ') if c in cuisine_encoder}
    user_encoded_food_pref = food_preference_encoder.get(user_food_pref, 3)

    def calculate_match_score(row):
        cuisine_score = len(user_encoded_cuisines.intersection(row['encoded_cuisines']))
        dietary_score = int(user_encoded_food_pref in row['encoded_dietaryrestrictions'])
        return cuisine_score + dietary_score

    restaurants_in_city['match_score'] = restaurants_in_city.apply(calculate_match_score, axis=1)
    return restaurants_in_city.sort_values(by='match_score', ascending=False)

def match_best_attractions(user_budget, user_destination, attractions_df):
    if user_destination not in allowed_destinations:
        return pd.DataFrame()
    attractions_in_city = attractions_df[(attractions_df['City'] == user_destination) & (attractions_df['Lowest_Price'] <= user_budget)]
    return attractions_in_city.sort_values(by=['Rating', 'Ranking_Position'], ascending=[False, True])

# ---------------------------- #
# Step 6: Generate Itinerary
# ---------------------------- #

for index, user in user_inputs.iterrows():
    print(f"\n===== User {index + 1}: {user['name']} =====")
    print(f"Destination: {user['destination']}")
    print(f"Cuisine Preference: {user['cuisine_preference']}")
    print(f"Food Preference: {user['food_preference']}")
    print(f"Budget Per Day: {user['budget_per_day']}")

    if user['destination'] not in allowed_destinations:
        print("WARNING: Information not available for the selected destination.")
        continue

    recommended_hotels = match_best_hotels(user['budget_per_day'], user['destination'], Hotels)
    recommended_restaurants = match_best_restaurants(user['cuisine_preference'], user['food_preference'], user['destination'], Restaurants)
    recommended_attractions = match_best_attractions(user['budget_per_day'], user['destination'], Attractions)

    print("\nGenerated Itinerary:\n")
    for day in range(6):
        print(f"Day {day + 1}")

        hotel = recommended_hotels.iloc[min(day // 3, len(recommended_hotels) - 1)]
        breakfast_restaurant = recommended_restaurants.iloc[min(day * 3, len(recommended_restaurants) - 1)]
        attraction_1 = recommended_attractions.iloc[min(day * 2, len(recommended_attractions) - 1)]
        lunch_restaurant = recommended_restaurants.iloc[min(day * 3 + 1, len(recommended_restaurants) - 1)]
        attraction_2 = recommended_attractions.iloc[min(day * 2 + 1, len(recommended_attractions) - 1)]
        dinner_restaurant = recommended_restaurants.iloc[min(day * 3 + 2, len(recommended_restaurants) - 1)]

        print(f"Hotel: {hotel['name']} (Rating: {hotel['rating']})")
        print(f"Breakfast: {breakfast_restaurant['name']} (Cuisines: {breakfast_restaurant['cuisines']})")
        print(f"Attraction 1: {attraction_1['Name']} (Lowest Price: {attraction_1['Lowest_Price']})")
        print(f"Lunch: {lunch_restaurant['name']} (Cuisines: {lunch_restaurant['cuisines']})")
        print(f"Attraction 2: {attraction_2['Name']} (Lowest Price: {attraction_2['Lowest_Price']})")
        print(f"Dinner: {dinner_restaurant['name']} (Cuisines: {dinner_restaurant['cuisines']})")
        print("Returning to Hotel\n")
        print("-" * 40)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Breakfast: The Pub (Cuisines: Bar, Pub)
Attraction 1: Sri Lanka Magical Tours (Lowest Price: No price mentioned)
Lunch: Oishi Cafeteria (Cuisines: Chinese, Indian, Barbecue, Asian, Sri Lankan)
Attraction 2: Best Journey (Lowest Price: No price mentioned)
Dinner: Cafe Noshers (Cuisines: Italian, Cafe, European, Healthy)
Returning to Hotel

----------------------------------------
Day 5
Hotel: Serenus Boutique Villa (Rating: 4.5)
Breakfast: The Cafe By The Mount Star (Cuisines: Pizza, Cafe, Asian, Sri Lankan, Arabic)
Attraction 1: 35A Cabs and Tours Nuwara Eliya (Lowest Price: No price mentioned)
Lunch: Cool Land Hot Hut Restaurant (Cuisines: Asian, Sri Lankan)
Attraction 2: Splendidceylon (Lowest Price: No price mentioned)
Dinner: Tck 6685 Restaurant (Cuisines: International, Sri Lankan)
Returning to Hotel

----------------------------------------
Day 6
Hotel: Serenus Boutique Villa (Rating: 4.5)
Breakfast: Grill Rush Cafe