<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Personalized_Itinerary_Generator_Based_Radius/Model_Training_for_Travel_Planner_Based_Radius.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, MultiLabelBinarizer
from geopy.geocoders import Nominatim
from urllib.parse import urlparse

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Restaurants Data Preprocessed

# Load the merged CSV file
file_path = '/content/drive/My Drive/Colombo/FinalPreprocessedMergedRestaurants.csv'
df = pd.read_csv(file_path)

# 1. Address Preprocessing
# Geocode to latitude and longitude (if needed)
geolocator = Nominatim(user_agent="geoapiExercises")
def get_coordinates(address):
    try:
        location = geolocator.geocode(address)
        return location.latitude, location.longitude
    except:
        return None, None

# Uncomment if geocoding is needed
#df['latitude'], df['longitude'] = zip(*df['address'].apply(get_coordinates))

# Print the row size before removing duplicates
rows_before = df.shape[0]
print(f"Rows before removing duplicates: {rows_before}")

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Print the row size after removing duplicates
rows_after = df.shape[0]
print(f"Rows after removing duplicates: {rows_after}")

# 2. Category Encoding
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# 5. Multi-Label Columns Preprocessing
def process_list_columns(column):
    return column.apply(lambda x: [item.strip() for item in str(x).split(',')] if pd.notnull(x) else [])

# Step 1: Create a mapping dictionary for city grouping
city_mapping = {
    # Map to Nuwara Eliya
    'Kotmale': 'Nuwara Eliya',
    'Nanu Oya': 'Nuwara Eliya',
    'Ramboda': 'Nuwara Eliya',
    'Talawakele': 'Nuwara Eliya',
    'Ambewela': 'Nuwara Eliya',

    # Map to Ella
    'Haputale': 'Ella',
    'Bandarawela': 'Ella',
    'Kithalella': 'Ella',
    'Wellawaya': 'Ella',

    # Map to Kandy
    'Peradeniya': 'Kandy',
    'Gampola': 'Kandy',
    'Kadugannawa': 'Kandy',
    'Tennekumbura': 'Kandy',

    # Map to Colombo
    'Kadawata': 'Colombo',
    'Maharagama': 'Colombo',
    'Wattala': 'Colombo',
    'Kaduwela': 'Colombo',
    'Rajagiriya': 'Colombo',
    'Battaramulla': 'Colombo',
    'Dehiwala-Mount Lavinia': 'Colombo',
    'Nugegoda': 'Colombo',
    'Mount Lavinia': 'Colombo',
    'Uswetakeiyawa': 'Colombo',
    'Boralesgamuwa': 'Colombo',
    'Malabe': 'Colombo',
    'Thalawathugoda': 'Colombo',
    'Dehiwala': 'Colombo',
    'Kelaniya': 'Colombo',
    'Kiribathgoda': 'Colombo',
    'Panadura': 'Colombo'
}

# Apply the city mapping to create a permanent city column
df['city'] = df['addressobj_city'].replace(city_mapping)

# Ensure all other cities are labeled correctly
df['city'] = df['city'].apply(lambda x: x if x in ['Nuwara Eliya', 'Ella', 'Kandy', 'Colombo'] else 'Other')

# 3. One-Hot Encoding to create only four main city columns
city_dummies = pd.get_dummies(df['city'], prefix='city')[['city_Colombo', 'city_Ella', 'city_Kandy', 'city_Nuwara Eliya']].astype(int)
df = pd.concat([df, city_dummies], axis=1)

# drop addressobj_city column
df.drop(columns=['addressobj_city'], inplace=True)

df['cuisines'] = process_list_columns(df['cuisines'])
df['features'] = process_list_columns(df['features'])
df['dietaryrestrictions'] = process_list_columns(df['dietaryrestrictions'])
df['mealtypes'] = process_list_columns(df['mealtypes'])

# One-Hot Encoding for multi-label data
mlb = MultiLabelBinarizer()
# Create DataFrames for one-hot encoded features and add prefix to column names
cuisine_df = pd.DataFrame(mlb.fit_transform(df['cuisines']), columns=mlb.classes_, index=df.index)
cuisine_df = cuisine_df.add_prefix('cuisine_')  # Add prefix here
df = df.join(cuisine_df)

feature_df = pd.DataFrame(mlb.fit_transform(df['features']), columns=mlb.classes_, index=df.index)
feature_df = feature_df.add_prefix('feature_')  # Add prefix here
df = df.join(feature_df)

dietary_df = pd.DataFrame(mlb.fit_transform(df['dietaryrestrictions']), columns=mlb.classes_, index=df.index)
dietary_df = dietary_df.add_prefix('dietary_')  # Add prefix here
df = df.join(dietary_df)

mealtype_df = pd.DataFrame(mlb.fit_transform(df['mealtypes']), columns=mlb.classes_, index=df.index)
mealtype_df = mealtype_df.add_prefix('mealtype_')  # Add prefix here
df = df.join(mealtype_df)

# Save the final cleaned data to a new file
final_processed_file_path = "/content/drive/My Drive/DataPre/Restaurants/LastPreprocessedMergedRestaurants.csv"
df.to_csv(final_processed_file_path, index=False)

Rows before removing duplicates: 693
Rows after removing duplicates: 693


In [5]:
# User Data Preprocessing

# Load the uploaded file
file_path = "/content/drive/My Drive/DataPre/User/UserInputs.csv"
df = pd.read_csv(file_path)

# Generate synthetic 'maximum_distance' column with values ranging from 10km to 50km
df['maximum_distance'] = np.random.randint(10, 51, size=len(df))

# Define a dictionary to map old column names to new names
column_mapping = {
    "name": "name",
    "how_many_people_are_traveling": "peoplecount",
    "destination": "destination",
    "number_of_days": "numberofdays",
    "what_is_your_age_category": "agecategory",
    "accomadation_type": "accomadation_type",
    "budget_per_day": "budget_per_day",
    "food_preference": "food_preference",
    "cuisine_preference": "cuisine_preference",
    "activities_preference": "activities_preference",
    "time_preference_activities": "time_preference_activities",
    "which_transportation_modes_are_you_preferred_for_traveling_within_the_destination": "transportation_mode",
    "children_or_pets": "children_or_pets",
    "maximum_distance": "maximum_distance"
}

# Select and rename columns in one step using the dictionary
df = df.rename(columns=column_mapping)[column_mapping.values()]

# Dropping unnecessary columns if they exist in the dataset
columns_to_drop = [
    "do_you_wish_to_have_a_trip_itinerary_generator_website_for_your_future_trips",
    "is_there_anything_else_you_would_like_us_to_know_about_your_travel_preferences"
]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Convert "budget_per_day" ranges into numerical values (average of range)
def convert_budget_to_numeric(budget):
    budget = str(budget).replace('Rs.', '').strip()  # Remove 'Rs.' prefix
    match = re.findall(r'\d+', budget.replace(',', ''))
    if len(match) == 2:
        return (int(match[0]) + int(match[1])) / 2
    elif len(match) == 1:
        return int(match[0])
    else:
        return None

df['budget_per_day'] = df['budget_per_day'].apply(convert_budget_to_numeric)

# Print the row size before removing duplicates
rows_before = df.shape[0]
print(f"Rows before removing duplicates: {rows_before}")

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Print the row size after removing duplicates
rows_after = df.shape[0]
print(f"Rows after removing duplicates: {rows_after}")

# Reapplying preprocessing with multi-label splitting and one-hot encoding for specific columns

from sklearn.preprocessing import MultiLabelBinarizer

# Define multi-label columns for special encoding
multi_label_columns = [
    "cuisine_preference",
    "activities_preference",
    "time_preference_activities",
    "transportation_mode",
    "peoplecount",
    "destination",
    "agecategory",
    "accomadation_type",
    "food_preference",
    "children_or_pets"
]

# Split multi-label columns into lists
for column in multi_label_columns:
    df[column] = df[column].apply(lambda x: [item.strip() for item in str(x).split(',')])

# Apply MultiLabelBinarizer to each multi-label column
mlb = MultiLabelBinarizer()

multi_label_encoded_data = pd.DataFrame()

for column in multi_label_columns:
    # One-hot encode the multi-label columns
    encoded = pd.DataFrame(mlb.fit_transform(df[column]),
                           columns=[f"{column}_{cls}" for cls in mlb.classes_],
                           index=df.index)
    multi_label_encoded_data = pd.concat([multi_label_encoded_data, encoded], axis=1)

# Drop the original multi-label columns from filtered data
df = df.drop(columns=multi_label_columns)

# Combine the multi-label encoded columns with the rest of the encoded data
df = pd.concat([df, multi_label_encoded_data], axis=1)

# Save the updated file back to Google Drive
file_path = "/content/drive/My Drive/DataPre/User/PreprocessedUserInputs.csv"
df.to_csv(file_path, index=False)

print("File successfully saved!")

Rows before removing duplicates: 999
Rows after removing duplicates: 364
File successfully saved!


In [14]:
#Attraction Preprocessing
import pandas as pd
import os
import glob
import numpy as np

# List of folder paths where the CSVs are stored
folders = [
    "/content/drive/My Drive/DataPre/Attractions"
]

# Initialize an empty list to store DataFrames
dataframes = []

# Iterate over each folder and read only CSV files that start with "Attractions"
for folder in folders:
    csv_files = glob.glob(os.path.join(folder, "Attractions*.csv"))  # Corrected file pattern

    for file in csv_files:
        df = pd.read_csv(file)  # Read CSV
        dataframes.append(df)   # Append DataFrame to the list

# Concatenate all DataFrames into a single one
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged DataFrame as a CSV
merged_file_path = "/content/drive/My Drive/DataPre/Attractions/MergedAttractions.csv"
merged_df.to_csv(merged_file_path, index=False)

# Load the uploaded file
file_path = "/content/drive/My Drive/DataPre/Attractions/MergedAttractions.csv"
df = pd.read_csv(file_path)

# Print the row size before removing duplicates
rows_before = df.shape[0]
print(f"Rows before removing duplicates: {rows_before}")

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Print the row size after removing duplicates
rows_after = df.shape[0]
print(f"Rows after removing duplicates: {rows_after}")

def clean_and_process_df(df):
    # Standardize column names
    df.columns = [col.strip().lower().replace(' ', '').replace('/', '') for col in df.columns]

    # Define the required columns
    required_columns = [
        'name', 'address', 'addressobjcity', 'category',
        'latitude', 'longitude', 'rankingposition',
        'rating', 'offergrouplowestprice', 'weburl'
    ]

    # Include subcategories and subtype columns dynamically
    subcategory_columns = [f'subcategories{i}' for i in range(10)]
    subtype_columns = [f'subtype{i}' for i in range(46)]

    all_required_columns = required_columns + subcategory_columns + subtype_columns

    # Drop columns that are not in the required columns list and create a copy
    df = df[[col for col in df.columns if col in all_required_columns]].copy()

    # Combine subcategories and subtype columns into separate columns
    subcategory_columns = [col for col in df.columns if col.startswith('subcategories')]
    subtype_columns = [col for col in df.columns if col.startswith('subtype')]

    # Concatenate subcategories into 'SubCategory' column, excluding NaN values
    df['SubCategory'] = df[subcategory_columns].apply(
        lambda row: ', '.join([str(x) for x in row if pd.notna(x) and x and x.lower() != 'none']),
        axis=1
    )

    # Concatenate subtypes into 'Subtype' column, excluding NaN values
    df['Subtype'] = df[subtype_columns].apply(
        lambda row: ', '.join([str(x) for x in row if pd.notna(x) and x and x.lower() != 'none']),
        axis=1
    )

    # Drop the original subcategory and subtype columns to clean up
    df.drop(columns=subcategory_columns + subtype_columns, inplace=True)

    # Rename columns as specified
    df = df.rename(columns={
        'name': 'Name',
        'address': 'Address',
        'addressobjcity': 'City',
        'category': 'Category',
        'latitude': 'Latitude',
        'longitude': 'Longitude',
        'rankingposition': 'Ranking_Position',
        'rating': 'Rating',
        'offergrouplowestprice': 'Lowest Price',
        'weburl': 'Web URL'
    })

    # Fill missing values using .loc to avoid SettingWithCopyWarning
    # Convert Ranking_Position to numeric, fill missing with the mode
    df['Ranking_Position'] = pd.to_numeric(df['Ranking_Position'], errors='coerce')
    mode_value = df['Ranking_Position'].mode()[0] if not df['Ranking_Position'].mode().empty else 0
    df['Ranking_Position'] = df['Ranking_Position'].fillna(mode_value)

    # Convert Latitude and Longitude to numeric and coerce invalid values to NaN
    df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
    df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

    # Drop rows where either Latitude, Longitude or City are NaN
    df = df.dropna(subset=['Latitude', 'Longitude', 'City'])

    # Convert the 'Lowest Price' to LKR if it is in USD
    def convert_to_lkr(price):
        if isinstance(price, str) and price.startswith('$'):
            try:
                usd_value = float(price.replace('$', '').strip())
                lkr_value = usd_value * 320  # Convert USD to LKR
                return round(lkr_value, 2)   # Keep as float with 2 decimal points
            except ValueError:
                return np.nan  # Return NaN for invalid prices
        try:
            # If the price is already in LKR, convert it to a float
            return float(price)
        except ValueError:
            return np.nan  # Return NaN if conversion fails

    # Apply the conversion to the 'Lowest Price' column
    df['Lowest Price'] = df['Lowest Price'].apply(convert_to_lkr)

    # Calculate the mode of the 'Lowest Price' column
    mode_price = df['Lowest Price'].mode()[0] if not df['Lowest Price'].mode().empty else 'No price mentioned'

    # Fill missing values with the mode value
    df['Lowest Price'] = df['Lowest Price'].fillna(mode_price)

    # Generate a random duration between 1 to 4 hours
    df['Duration'] = np.random.choice([1, 2, 3, 4], size=len(df))

    # Define city mappings based on the provided criteria
    city_mapping = {
        'Peradeniya': 'Kandy',
        'Gampola': 'Kandy',
        'Pussellawa': 'Kandy',
        'Heeloya': 'Kandy',
        'Gurudeniya': 'Kandy',
        'Pilimathalawa': 'Kandy',
        'Murutalawa': 'Kandy',

        'Badulla': 'Ella',
        'Nanu Oya': 'Ella',
        'Demodara': 'Ella',
        'Kalupahana': 'Ella',
        'Bandarawela': 'Ella',
        'Ambagollapathana': 'Ella',

        'Hatton': 'Nuwara Eliya',

        'Negombo': 'Colombo',
        'Wattala': 'Colombo',
        'Katunayaka': 'Colombo',
        'Kaduwela': 'Colombo',
        'Katunayake': 'Colombo'
    }

    # Apply the mapping to the 'City' column
    df['City'] = df['City'].apply(lambda x: city_mapping.get(x, x))

    return df

# Clean and process the merged dataset
df = clean_and_process_df(merged_df)

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
import pandas as pd

# Multi-label encoding for 'City', 'SubCategory', and 'Subtype' columns
def multi_label_encoding(df, columns):
    for column in columns:
        mlb = MultiLabelBinarizer()
        # Split by comma, strip whitespace, and handle empty strings
        encoded_data = mlb.fit_transform(df[column].apply(lambda x: [item.strip() for item in str(x).split(',') if item.strip()]))
        encoded_df = pd.DataFrame(encoded_data, columns=[f"{column}_{cls}" for cls in mlb.classes_])
        df = pd.concat([df, encoded_df], axis=1)
    return df

# One-hot encoding for 'Category' column
def category_encoding(df, column):
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Apply OneHotEncoder
    encoded_data = ohe.fit_transform(df[[column]])

    # Generate column names dynamically based on categories
    category_columns = [f"{column}_{cls}" for cls in ohe.categories_[0]]

    # Ensure the shape of data matches the expected columns
    encoded_df = pd.DataFrame(encoded_data, columns=category_columns, index=df.index)

    # Concatenate the encoded data with the original DataFrame
    df = pd.concat([df, encoded_df], axis=1)

    return df

# Apply multi-label encoding
multi_label_columns = ['City', 'SubCategory', 'Subtype']
df = multi_label_encoding(df, multi_label_columns)

# Apply category encoding
df = category_encoding(df, 'Category')

# Save the cleaned DataFrame to a new CSV file
output_file = '/content/drive/My Drive/DataPre/Attractions/PreprocessedMergedAttractions.csv'
df.to_csv(output_file, index=False)

print(f"Processing complete. Cleaned data saved to '{output_file}'")

Rows before removing duplicates: 988
Rows after removing duplicates: 988
Processing complete. Cleaned data saved to '/content/drive/My Drive/DataPre/Attractions/PreprocessedMergedAttractions.csv'


In [11]:
# Load the uploaded file
file_path = "/content/drive/My Drive/DataPre/Hotels/MergedHotels.csv"
df = pd.read_csv(file_path)

# Print the row size before removing duplicates
rows_before = df.shape[0]
print(f"Rows before removing duplicates: {rows_before}")

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Print the row size after removing duplicates
rows_after = df.shape[0]
print(f"Rows after removing duplicates: {rows_after}")

# Step 1: Drop rows where latitude or longitude is "no latitude" or "no longitude"
df = df[(df['latitude'] != 'no latitude') & (df['longitude'] != 'no longitude') & (df['address'] != 'no address') & (df['addressobj_city'] != 'no city')]
# Step 2: Rename "all_amenities" to "amenities" and fill empty cells with "General"
df.rename(columns={'all_amenities': 'amenities'}, inplace=True)
df['amenities'] = df['amenities'].fillna('General')

# Step 3: Drop the "website" column
df.drop(columns=['website'], inplace=True)
df.drop(columns=['rankingstring'], inplace=True)
df.drop(columns=['pricerange'], inplace=True)

# Define a mapping for price levels based on the number of dollar signs
price_level_mapping = {
    '$': '5000',
    '$$': '10000',
    '$$$': '15000',
    '$$$$': '30000'
}

# Apply the mapping to the "pricelevel" column
def map_price_level(value):
    if pd.isnull(value) or 'unknown' in value.lower():
        return 'no pricelevel mentioned'

    # Normalize the value for consistent matching
    value = value.strip().replace(' ', '').replace('LKR', '').lower()

    return price_level_mapping.get(value, 'no pricelevel mentioned')

df['pricelevel'] = df['pricelevel'].apply(map_price_level)

# Calculate the mode of the "pricelevel" column, excluding 'no pricelevel mentioned'
pricelevel_mode = df[df['pricelevel'] != 'no pricelevel mentioned']['pricelevel'].mode()[0]

# Replace "no pricelevel mentioned" with the mode value
df['pricelevel'] = df['pricelevel'].replace('no pricelevel mentioned', pricelevel_mode)

# Step 5: Rename "addressobj_city" to "city"
df.rename(columns={'addressobj_city': 'city'}, inplace=True)

# Step 2: Replace "not rated" in the "rating" column with the mode of the column
rating_mode = df['rating'].mode()[0]  # Get the most frequent value (mode)
df['rating'] = df['rating'].replace('not rated', rating_mode)

# Step 7: Multi-label encoding for "amenities" and "category" columns

# For "amenities" column: Split amenities and apply one-hot encoding with proper prefix
amenities_dummies = df['amenities'].str.get_dummies(sep=', ')
amenities_dummies = amenities_dummies.add_prefix('amenities_')

# For "category" column: One-hot encoding with proper prefix
category_dummies = pd.get_dummies(df['category'], prefix='category')

# Concatenate the encoded columns with the original DataFrame
df = pd.concat([df, amenities_dummies, category_dummies], axis=1)

city_mapping = {
    "Kandy": {"Werapitiya", "Bowalawatta", "Gurudeniya", "Heerassagala", "Narampanawa",
              "Teldeniya", "Kundasale", "Pallekele", "Digana", "Tennekumbura",
              "Sirimalwatta", "Peradeniya", "Thalathuoya"},
    "Ella": {"Ambagollapathana", "Bandarawela", "Badulla", "Yahalewela",
             "Kithalella", "Demodara"},
    "Nuwara Eliya": {"Seetha Eliya", "Hatton"},
    "Colombo": {"Kiribathgoda", "Piliyandala", "Katunayaka", "Kotte",
                "Sri Jayawardenepura", "Seeduwa", "Negombo",
                "Katunayake", "Dehiwala-Mount Lavinia"}
}

# Map city names using the nested dictionary
def map_city_name(city):
    for main_city, sub_cities in city_mapping.items():
        if city in sub_cities:
            return main_city
    return city  # Return the original city if not found in the mapping

# Apply the mapping function to the "city" column
df['city'] = df['city'].apply(map_city_name)

# Multi-label encoding for the "city" column with the prefix 'city_'
city_dummies = pd.get_dummies(df['city'], prefix='city')

# Concatenate the encoded columns with the original DataFrame
df = pd.concat([df, city_dummies], axis=1)

# Save the merged DataFrame as a CSV
df.to_csv("/content/drive/My Drive/DataPre/Hotels/PreprocessedHotels.csv", index=False)

Rows before removing duplicates: 1925
Rows after removing duplicates: 1925


In [26]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from geopy.distance import geodesic
import random

# Load datasets from Google Drive
from google.colab import drive

drive.mount('/content/drive')

user_inputs = pd.read_csv('/content/drive/My Drive/DataPre/User/PreprocessedUserInputs.csv')
attractions = pd.read_csv('/content/drive/My Drive/DataPre/Attractions/PreprocessedMergedAttractions.csv')
restaurants = pd.read_csv('/content/drive/My Drive/DataPre/Restaurants/LastPreprocessedMergedRestaurants.csv')
hotels = pd.read_csv('/content/drive/My Drive/DataPre/Hotels/PreprocessedHotels.csv')

# Standardize column names for consistency
attractions.columns = [col.strip().lower().replace(' ', '_').replace('/', '_') for col in df.columns]

# Preprocessing and feature engineering
# Ensure there are no missing values
user_inputs.fillna(0, inplace=True)
attractions.fillna(0, inplace=True)
restaurants.fillna(0, inplace=True)
hotels.fillna(0, inplace=True)

# Function to get suggestions within a maximum distance
def get_nearby_options(lat, lon, options, max_distance_km):
    nearby_options = []
    for _, row in options.iterrows():
        distance = geodesic((lat, lon), (row['latitude'], row['longitude'])).km
        if distance <= max_distance_km:
            nearby_options.append((row['name'], distance, row['category'], row['rating']))
    return sorted(nearby_options, key=lambda x: x[1])

# Function to generate an itinerary based on user input days
def generate_itinerary(user, hotel, attractions, restaurants, max_hours_per_day=8):
    num_days = int(user['numberofdays'])
    max_distance = float(user['maximum_distance'])
    itinerary = {f'Day {i+1}': {'Hotel': hotel['name'], 'Restaurants': [], 'Attractions': []} for i in range(num_days)}
    used_restaurants = set()
    used_attractions = set()
    hotel_lat, hotel_lon = hotel['latitude'], hotel['longitude']

    for day in itinerary.keys():
        # Select 3 restaurants
        available_restaurants = get_nearby_options(hotel_lat, hotel_lon, restaurants, max_distance)
        available_restaurants = [r for r in available_restaurants if r[0] not in used_restaurants]
        selected_restaurants = random.sample(available_restaurants, min(3, len(available_restaurants)))
        itinerary[day]['Restaurants'] = [r[0] for r in selected_restaurants]
        used_restaurants.update(itinerary[day]['Restaurants'])

        # Select 3-4 attractions considering max duration
        available_attractions = get_nearby_options(hotel_lat, hotel_lon, attractions, max_distance)
        available_attractions = [a for a in available_attractions if a[0] not in used_attractions]
        selected_attractions = random.sample(available_attractions, min(4, len(available_attractions)))
        total_duration = 0
        day_attractions = []

        for attraction in selected_attractions:
            duration = random.randint(1, 3)  # Mock duration between 1 to 3 hours
            if total_duration + duration <= max_hours_per_day:
                day_attractions.append(attraction[0])
                total_duration += duration

        itinerary[day]['Attractions'] = day_attractions
        used_attractions.update(day_attractions)

    return itinerary

# Generate itineraries for all users
all_itineraries = {}
for index, user in user_inputs.iterrows():
    hotel = hotels.iloc[index % len(hotels)]  # Rotate hotels for demonstration
    itinerary = generate_itinerary(user, hotel, attractions, restaurants)
    all_itineraries[user['name']] = itinerary

# Display itineraries for all users
for user_name, itinerary in all_itineraries.items():
    print(f'Itinerary for {user_name}:')
    for day, details in itinerary.items():
        print(f"\n{day}:")
        print(f"Hotel: {details['Hotel']}")
        print("Restaurants:")
        for restaurant in details['Restaurants']:
            print(f"  - {restaurant}")
        print("Attractions:")
        for attraction in details['Attractions']:
            print(f"  - {attraction}")
    print('\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  - Ella Taxi by rts-srilanka.com
  - Ella Tours
  - Suwamadura
  - Ella Spice Garden Cooking Class

Day 5:
Hotel: Jayamali Homestay
Restaurants:
  - La Mensa
  - Cafe Ice Cube
  - Nature life guest inn Ella Sri Lanka
Attractions:
  - Ella Nine Arch Spice Garden
  - Nine Arches Bridge
  - Travel Mo Sri Lanka


Itinerary for Uditha Ishan:

Day 1:
Hotel: Ella Retreat
Restaurants:
  - Golden Cabin Restaurant Ella
  - Cafe 4 You Ella
  - Tip Top Sky Bar Ella
Attractions:
  - Idyll Ella Hiking
  - Travel Mo Sri Lanka
  - Salon Glamour - Ella

Day 2:
Hotel: Ella Retreat
Restaurants:
  - Cafe Chill
  - Isle Of Gelato
  - Cafe on the Bend
Attractions:
  - Train Tours
  - Ella Spice Garden Cooking Class
  - Dream Club Ella
  - Helena Spa & Organic Gifts

Day 3:
Hotel: Ella Retreat
Restaurants:
  - Spice Ella Restaurant & Bar
  - The Happy Couple
  - Tiki Bar Ella
Attractions:
  - ECO TAXI Ella
  - Ella Taxi
  - Nine Arches Bridge


In [27]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
from geopy.distance import geodesic
import random

# Load datasets from Google Drive
from google.colab import drive

drive.mount('/content/drive')

user_inputs = pd.read_csv('/content/drive/My Drive/DataPre/User/PreprocessedUserInputs.csv')
attractions = pd.read_csv('/content/drive/My Drive/DataPre/Attractions/PreprocessedMergedAttractions.csv')
restaurants = pd.read_csv('/content/drive/My Drive/DataPre/Restaurants/LastPreprocessedMergedRestaurants.csv')
hotels = pd.read_csv('/content/drive/My Drive/DataPre/Hotels/PreprocessedHotels.csv')

# Standardize column names for consistency
attractions.columns = [col.strip().lower().replace(' ', '_').replace('/', '_') for col in attractions.columns]

# Preprocessing and feature engineering
user_inputs.fillna(0, inplace=True)
attractions.fillna(0, inplace=True)
restaurants.fillna(0, inplace=True)
hotels.fillna(0, inplace=True)

# Model Training for Hotel Recommendation using KMeans Clustering
def train_hotel_recommendation_model(user_inputs, hotels, n_clusters=5):
    hotel_features = hotels[['latitude', 'longitude', 'pricelevel', 'hotelclass']]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    hotel_clusters = kmeans.fit_predict(hotel_features)
    hotels['cluster'] = hotel_clusters

    # Clustering Evaluation Metrics
    silhouette_avg = silhouette_score(hotel_features, hotel_clusters)
    davies_bouldin = davies_bouldin_score(hotel_features, hotel_clusters)
    calinski_harabasz = calinski_harabasz_score(hotel_features, hotel_clusters)
    print(f'Silhouette Score: {silhouette_avg}')
    print(f'Davies-Bouldin Index: {davies_bouldin}')
    print(f'Calinski-Harabasz Index: {calinski_harabasz}')

    # Diversity and Coverage Metrics
    unique_clusters = len(set(hotel_clusters))
    coverage = unique_clusters / n_clusters
    diversity = len(np.unique(hotel_clusters)) / len(hotels)
    print(f'Coverage: {coverage}')
    print(f'Diversity: {diversity}')

    return kmeans, hotels

kmeans_model, hotels = train_hotel_recommendation_model(user_inputs, hotels)

# Function to get suggestions within a maximum distance
def get_nearby_options(lat, lon, options, max_distance_km):
    nearby_options = []
    for _, row in options.iterrows():
        distance = geodesic((lat, lon), (row['latitude'], row['longitude'])).km
        if distance <= max_distance_km:
            nearby_options.append((row['name'], distance, row['category'], row['rating']))
    return sorted(nearby_options, key=lambda x: x[1])

# Function to generate an itinerary based on user input days
def generate_itinerary(user, hotel, attractions, restaurants, max_hours_per_day=8):
    num_days = int(user['numberofdays'])
    max_distance = float(user['maximum_distance'])
    itinerary = {f'Day {i+1}': {'Restaurants': [], 'Attractions': [], 'Hotel': hotel['name']} for i in range(num_days)}
    used_restaurants = set()
    used_attractions = set()
    hotel_lat, hotel_lon = hotel['latitude'], hotel['longitude']

    for day in itinerary.keys():
        # Select 3 restaurants
        available_restaurants = get_nearby_options(hotel_lat, hotel_lon, restaurants, max_distance)
        available_restaurants = [r for r in available_restaurants if r[0] not in used_restaurants]
        selected_restaurants = random.sample(available_restaurants, min(3, len(available_restaurants)))
        itinerary[day]['Restaurants'] = [r[0] for r in selected_restaurants]
        used_restaurants.update(itinerary[day]['Restaurants'])

        # Select 3-4 attractions considering max duration
        available_attractions = get_nearby_options(hotel_lat, hotel_lon, attractions, max_distance)
        available_attractions = [a for a in available_attractions if a[0] not in used_attractions]
        selected_attractions = random.sample(available_attractions, min(4, len(available_attractions)))
        total_duration = 0
        day_attractions = []

        for attraction in selected_attractions:
            duration = random.randint(1, 3)  # Mock duration between 1 to 3 hours
            if total_duration + duration <= max_hours_per_day:
                day_attractions.append(attraction[0])
                total_duration += duration

        itinerary[day]['Attractions'] = day_attractions
        used_attractions.update(day_attractions)

    return itinerary

# Generate itineraries for all users
all_itineraries = {}
for index, user in user_inputs.iterrows():
    hotel = hotels.iloc[index % len(hotels)]  # Rotate hotels for demonstration
    itinerary = generate_itinerary(user, hotel, attractions, restaurants)
    all_itineraries[user['name']] = itinerary

# Display itineraries for all users
for user_name, itinerary in all_itineraries.items():
    print(f'Itinerary for {user_name}:')
    for day, details in itinerary.items():
        print(f"\n{day}:")
        print(f"Hotel: {details['Hotel']}")
        print("Restaurants:")
        for restaurant in details['Restaurants']:
            print(f"  - {restaurant}")
        print("Attractions:")
        for attraction in details['Attractions']:
            print(f"  - {attraction}")
    print('\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  - The Breeze Restaurant
  - Cafe Umbrella
Attractions:
  - Kalu Tuk Tuk Ella
  - Flying Ravana Mega Zipline
  - Honey Bee Garden
  - Travel Mo Sri Lanka


Itinerary for Uditha Ishan:

Day 1:
Hotel: Ella Retreat
Restaurants:
  - Down Town Rotti Hut
  - Ella Siyapatha Hut Restaurant
  - Cafe sameera
Attractions:
  - Ella Taxi
  - Ella Taxi by rts-srilanka.com
  - Beauty Lanka Travels
  - Nature's Spa Ella

Day 2:
Hotel: Ella Retreat
Restaurants:
  - Cafe Ice Cube
  - Pub Ceylon Ella
  - Cafe Infinity
Attractions:
  - Ella Rock
  - Little Adam's Peak View Point
  - Halpewatte Tea Factory Tour

Day 3:
Hotel: Ella Retreat
Restaurants:
  - Rainbow Cafe
  - Cafe on the Bend
  - Udayanga Pub Ceylon
Attractions:
  - Ella Train Journey
  - The Ella Odyssey Train
  - Honey Bee Garden
  - Cookery/ Art @ Wilpattu Tamaravila

Day 4:
Hotel: Ella Retreat
Restaurants:
  - Nanda Restaurant
  - Tip Top Sky Bar Ella
  - Cafe A Lounge
Attra