<a href="https://colab.research.google.com/github/mathu3004/Pearl_Path/blob/Personalized_Itinerary_Generator_Based_Radius/ModelTraining1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ace-tools



In [None]:
!pip install geopy



In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import random

In [None]:
# Load the three Colombo datasets, keeping only rows that have both coordinates
def _load_places(csv_path):
    """Read one places CSV and drop every row missing latitude or longitude."""
    return pd.read_csv(csv_path).dropna(subset=["latitude", "longitude"])


hotels_df = _load_places("colomboHotels_processed.csv")
attractions_df = _load_places("colombo_attractions_processed.csv")
restaurants_df = _load_places("colomboRestaurants_processed.csv")

# Function to calculate distance between two coordinates
def haversine_distance(coord1, coord2):
    """Return the distance in kilometres between two (latitude, longitude) pairs.

    Despite the name, the value is delegated to geopy's ``geodesic``; no
    haversine formula is evaluated here.
    """
    separation = geodesic(coord1, coord2)
    return separation.kilometers

# Function to suggest top hotels in a place
def suggest_hotels(top_n=5):
    """Return the best-ranked hotels from the module-level ``hotels_df``.

    Hotels are ordered by ``rating`` (highest first) and, within the same
    rating, by ``priceRange_LKR`` (lowest first).

    Args:
        top_n: How many hotels to return. Defaults to 5, the value that used
            to be hard-coded.

    Returns:
        A DataFrame holding at most ``top_n`` rows of ``hotels_df``.
    """
    ranked = hotels_df.sort_values(by=["rating", "priceRange_LKR"], ascending=[False, True])
    return ranked.head(top_n)

# Function to get nearby attractions within range
def _places_within_range(places_df, hotel_lat, hotel_lon, max_range_km):
    """Shared implementation: filter ``places_df`` to rows within range of the hotel.

    The distance from the hotel to every row is written into a "distance"
    column on ``places_df`` itself (the callers pass the module-level frames,
    so that column stays visible to later cells, as it did before).

    Distances are computed in one pass over the coordinate columns rather than
    with a row-wise ``apply``; this also works when the frame has no rows.
    """
    origin = (hotel_lat, hotel_lon)
    places_df["distance"] = pd.Series(
        [
            haversine_distance(origin, (lat, lon))
            for lat, lon in zip(places_df["latitude"], places_df["longitude"])
        ],
        index=places_df.index,
        dtype=float,
    )
    in_range = places_df[places_df["distance"] <= max_range_km]
    return in_range.sort_values(by="rating", ascending=False)


def get_nearby_attractions(hotel_lat, hotel_lon, max_range_km):
    """Return attractions within ``max_range_km`` of the hotel, best rated first.

    Args:
        hotel_lat: Hotel latitude.
        hotel_lon: Hotel longitude.
        max_range_km: Inclusive distance limit in kilometres.

    Returns:
        The matching rows of ``attractions_df`` (including the "distance"
        column), sorted by ``rating`` descending.
    """
    return _places_within_range(attractions_df, hotel_lat, hotel_lon, max_range_km)


# Function to get nearby restaurants within range
def get_nearby_restaurants(hotel_lat, hotel_lon, max_range_km):
    """Return restaurants within ``max_range_km`` of the hotel, best rated first.

    Args:
        hotel_lat: Hotel latitude.
        hotel_lon: Hotel longitude.
        max_range_km: Inclusive distance limit in kilometres.

    Returns:
        The matching rows of ``restaurants_df`` (including the "distance"
        column), sorted by ``rating`` descending.
    """
    return _places_within_range(restaurants_df, hotel_lat, hotel_lon, max_range_km)

# Function to create an itinerary
def _pick_meal(restaurants, meal):
    """Randomly pick at most one restaurant whose ``mealTypes`` mentions ``meal``."""
    candidates = restaurants[restaurants["mealTypes"].str.contains(meal, na=False)]
    # n is 0 when nothing serves this meal, which yields an empty selection
    return candidates.sample(n=min(1, len(candidates)), replace=False)


def plan_itinerary(hotel, num_days, max_range_km):
    """Build a day-by-day plan of attractions and meals around one hotel.

    Each day gets 3 or 4 randomly chosen nearby attractions (fewer once the
    pool runs dry) and at most one restaurant each for breakfast, lunch and
    dinner. Every attraction and restaurant is used at most once in the whole
    trip. A restaurant chosen for one meal is removed from the pool before the
    next meal is drawn, so the same place is never suggested twice on the same
    day.

    Selection is random and unseeded, so repeated calls give different plans.

    Args:
        hotel: Mapping or Series exposing ``"latitude"`` and ``"longitude"``.
        num_days: Number of days to plan.
        max_range_km: Search radius around the hotel, in kilometres.

    Returns:
        A DataFrame with one row per day and the columns ``Day``,
        ``Attractions``, ``Breakfast``, ``Lunch`` and ``Dinner``; all but
        ``Day`` hold lists of names.
    """
    hotel_lat, hotel_lon = hotel["latitude"], hotel["longitude"]

    # Work on copies so dropping used rows never touches the shared frames
    nearby_attractions = get_nearby_attractions(hotel_lat, hotel_lon, max_range_km).copy()
    nearby_restaurants = get_nearby_restaurants(hotel_lat, hotel_lon, max_range_km).copy()

    itinerary = []
    for day in range(1, num_days + 1):
        wanted = random.randint(3, 4)
        attractions_today = nearby_attractions.sample(
            n=min(len(nearby_attractions), wanted), replace=False
        )
        nearby_attractions = nearby_attractions.drop(attractions_today.index)

        day_plan = {"Day": day, "Attractions": attractions_today["name"].tolist()}
        for meal in ("Breakfast", "Lunch", "Dinner"):
            choice = _pick_meal(nearby_restaurants, meal)
            # Remove the pick immediately: unavailable for later meals and later days
            nearby_restaurants = nearby_restaurants.drop(choice.index)
            day_plan[meal] = choice["name"].tolist()

        itinerary.append(day_plan)

    return pd.DataFrame(itinerary)

# Sample user inputs: trip length in days and search radius around the hotel in km
num_days, max_range_km = 3, 5

# Rank the hotels and assume the user takes the top-ranked one
suggested_hotels = suggest_hotels()
selected_hotel = suggested_hotels.iloc[0]

# Build the day-by-day plan around that hotel
itinerary_df = plan_itinerary(hotel=selected_hotel, num_days=num_days, max_range_km=max_range_km)

print(itinerary_df)
display(itinerary_df)


   Day                                        Attractions  \
0    1  [Seema Malakaya Temple, One Galle Face, IAMSRI...   
1    2  [Pettah, ZRI Adventures, Sri Lanka Airport Tra...   
2    3  [Sri Lanka Tour Driver By Praneeth, Doi Doi We...   

                        Breakfast                                 Lunch  \
0  [The Poolside Bar and Terrace]                              [Thalis]   
1                        [Plates]  [Chutneys At Cinnamon Grand Colombo]   
2   [Barista, World Trade Center]                   [Colombo Fort Cafe]   

                Dinner  
0          [Cloud Red]  
1  [Hotel de Pilawoos]  
2      [Emperor's Wok]  


Unnamed: 0,Day,Attractions,Breakfast,Lunch,Dinner
0,1,"[Seema Malakaya Temple, One Galle Face, IAMSRI...",[The Poolside Bar and Terrace],[Thalis],[Cloud Red]
1,2,"[Pettah, ZRI Adventures, Sri Lanka Airport Tra...",[Plates],[Chutneys At Cinnamon Grand Colombo],[Hotel de Pilawoos]
2,3,"[Sri Lanka Tour Driver By Praneeth, Doi Doi We...","[Barista, World Trade Center]",[Colombo Fort Cafe],[Emperor's Wok]


In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from geopy.distance import geodesic
from statsmodels.tsa.arima.model import ARIMA
import random
from sklearn.model_selection import train_test_split

# Mount Google Drive so the preprocessed CSVs are reachable from Colab
from google.colab import drive
drive.mount('/content/drive')


def _snake_case(label):
    """Normalise one column label: trim, lowercase, spaces and slashes become underscores."""
    return label.strip().lower().replace(' ', '_').replace('/', '_')


# Read the four preprocessed tables
user_inputs = pd.read_csv('/content/drive/My Drive/DataPre/User/PreprocessedUserInputs.csv')
attractions = pd.read_csv('/content/drive/My Drive/DataPre/Attractions/PreprocessedMergedAttractions.csv')
restaurants = pd.read_csv('/content/drive/My Drive/DataPre/Restaurants/LastPreprocessedMergedRestaurants.csv')
hotels = pd.read_csv('/content/drive/My Drive/DataPre/Hotels/PreprocessedHotels.csv')

# Give every table the same column-naming scheme
datasets = [user_inputs, attractions, restaurants, hotels]
for df in datasets:
    df.columns = list(map(_snake_case, df.columns))

# Train GMM for hotel clustering
def train_hotel_gmm(hotels, n_clusters=5):
    """Cluster hotels with a Gaussian mixture and report clustering quality.

    The model is fitted on latitude, longitude, pricelevel, hotelclass and
    rating, with missing values replaced by 0. The predicted component of each
    hotel is stored in a new ``'cluster'`` column on ``hotels`` (the frame is
    modified in place), and three clustering scores are printed.

    Args:
        hotels: DataFrame containing the five feature columns.
        n_clusters: Number of mixture components.

    Returns:
        Tuple ``(fitted GaussianMixture, hotels)``.
    """
    feature_columns = ['latitude', 'longitude', 'pricelevel', 'hotelclass', 'rating']
    feature_matrix = hotels[feature_columns].fillna(0)

    mixture = GaussianMixture(n_components=n_clusters, random_state=42)
    labels = mixture.fit_predict(feature_matrix)
    hotels['cluster'] = labels

    # Compute all three scores before printing any of them
    silhouette = silhouette_score(feature_matrix, labels)
    db_index = davies_bouldin_score(feature_matrix, labels)
    ch_index = calinski_harabasz_score(feature_matrix, labels)

    print(f'Silhouette Score: {silhouette}')
    print(f'Davies-Bouldin Index: {db_index}')
    print(f'Calinski-Harabasz Index: {ch_index}')

    return mixture, hotels

# Fit the hotel mixture model; `hotels` is rebound to the returned frame, which now carries the 'cluster' column
gmm_model, hotels = train_hotel_gmm(hotels)

# Train RandomForestClassifier for hotel recommendation
def train_hotel_classifier(hotels):
    """Train a random forest that predicts each hotel's GMM cluster.

    The target is a copy of the ``'cluster'`` column, stored on ``hotels`` as
    ``'target'`` (the frame is modified in place). Features are latitude,
    longitude, pricelevel, hotelclass and rating, with missing values replaced
    by 0, the same treatment ``train_hotel_gmm`` applies, so both models see
    identical inputs.

    The printed accuracy measures how well the forest reproduces cluster
    labels that were derived from these same features.

    Args:
        hotels: DataFrame with the five feature columns and a ``'cluster'``
            column.

    Returns:
        Tuple ``(fitted model, X_test, y_test)`` from an 80/20 split.

    Raises:
        KeyError: If ``hotels`` has no ``'cluster'`` column.
    """
    feature_columns = ['latitude', 'longitude', 'pricelevel', 'hotelclass', 'rating']

    if 'cluster' not in hotels.columns:
        raise KeyError("hotels has no 'cluster' column; run train_hotel_gmm first")
    hotels['target'] = hotels['cluster']

    # Same NaN handling as the clustering step, so training cannot fail on gaps
    feature_matrix = hotels[feature_columns].fillna(0)

    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, hotels['target'], test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=10, min_samples_split=5)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print('Hotel Recommendation Model Accuracy:', accuracy_score(y_test, y_pred))

    # Return the model and the held-out data
    return model, X_test, y_test

# Train the cluster classifier on the clustered hotels and keep the held-out
# split (X_test, y_test) at notebook level
hotel_model, X_test, y_test = train_hotel_classifier(hotels)

# Allocate days to destinations
def allocate_days_to_destinations(destinations, num_days, max_destinations=4):
    """Split a trip's days across the first few requested destinations.

    At most ``min(num_days, max_destinations)`` destinations are kept, in the
    order given. Every kept destination gets at least one day and leftover
    days are handed out one at a time starting from the first destination.

    For 1 to 7 days this matches the former lookup table. Longer trips no
    longer fall back to a single destination, a trip of zero (or fewer) days
    gets no allocation, and repeated destination names are counted once so
    the allocated days always add up to ``num_days``.

    Args:
        destinations: Ordered list of destination names.
        num_days: Total number of days in the trip.
        max_destinations: Upper bound on destinations visited. Defaults to 4.

    Returns:
        Dict mapping destination name to number of days; empty when there are
        no destinations or no days.
    """
    if num_days <= 0:
        return {}

    # Preserve order while discarding repeats
    unique_destinations = list(dict.fromkeys(destinations))
    limit = min(num_days, max_destinations, len(unique_destinations))
    selected_destinations = unique_destinations[:limit]
    if not selected_destinations:
        return {}

    # Round-robin hand-out: everyone gets `base`, the first `extra` get one more
    base, extra = divmod(num_days, len(selected_destinations))
    return {
        dest: base + (1 if position < extra else 0)
        for position, dest in enumerate(selected_destinations)
    }

# Get nearby options based on distance
def get_nearby_options(lat, lon, options, max_distance_km):
    """List the rows of ``options`` within a distance of a point, nearest first.

    The distance of each row is computed once. Rows whose latitude or
    longitude is missing are skipped instead of aborting the whole scan.

    Args:
        lat: Latitude of the reference point.
        lon: Longitude of the reference point.
        options: DataFrame with ``latitude`` and ``longitude`` columns.
        max_distance_km: Inclusive distance limit in kilometres.

    Returns:
        List of ``(row, distance_km)`` tuples sorted by ascending distance,
        where ``row`` is the Series produced by ``options.iterrows()``.
    """
    origin = (lat, lon)
    in_range = []
    for _, row in options.iterrows():
        place_lat, place_lon = row['latitude'], row['longitude']
        if pd.isna(place_lat) or pd.isna(place_lon):
            continue  # no usable position for this row
        distance_km = geodesic(origin, (place_lat, place_lon)).km
        if distance_km <= max_distance_km:
            in_range.append((row, distance_km))

    in_range.sort(key=lambda pair: pair[1])
    return in_range

# Recommend best match using content-based filtering
def recommend_best_match(user, options, feature_cols, top_n=5):
    """Rank ``options`` by TF-IDF cosine similarity to the user's preferences.

    The user profile is the space-joined list of the user's column names whose
    value equals 1. Each option is described by a text built from
    ``feature_cols``:

    * a name that is an actual column contributes that column's value as text;
    * otherwise the name is treated as a one-hot prefix, and the option
      contributes the names of the ``<prefix>_*`` columns that equal 1.

    One-hot groups are rendered as column names, not as their 0/1 values.
    Those digit strings carry no information and cannot match the user
    profile, which is itself made of column names.

    Two columns, ``combined_features`` and ``similarity_score``, are written
    onto ``options`` itself.

    Args:
        user: Series of user preferences.
        options: DataFrame of candidates.
        feature_cols: Column names or one-hot prefixes to describe options by.
        top_n: Number of best matches to return.

    Returns:
        The ``top_n`` rows of ``options`` with the highest similarity score.

    Raises:
        ValueError: If none of ``feature_cols`` matches a column or prefix.
    """
    user_profile = ' '.join([col for col in user.index if user[col] == 1])

    feature_texts = []
    for col in feature_cols:
        if col in options.columns:  # regular column: use its value as text
            feature_texts.append(options[col].astype(str))
            continue

        encoded_cols = [c for c in options.columns if c.startswith(col + '_')]
        if encoded_cols:
            # One-hot group: keep the names of the flags that are switched on
            active = options[encoded_cols].eq(1).to_numpy()
            feature_texts.append(pd.Series(
                [
                    ' '.join(name for name, is_on in zip(encoded_cols, flags) if is_on)
                    for flags in active
                ],
                index=options.index,
                dtype=object,
            ))

    if not feature_texts:
        raise ValueError(
            f'None of the feature columns {list(feature_cols)} match a column or one-hot prefix in options'
        )

    combined = feature_texts[0]
    for extra in feature_texts[1:]:
        combined = combined + ' ' + extra
    options['combined_features'] = combined

    if options.empty:
        # Nothing to vectorise; return an empty result with the expected columns
        options['similarity_score'] = pd.Series(dtype=float)
        return options.head(0)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(options['combined_features'])
    user_vector = vectorizer.transform([user_profile])

    similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()
    options['similarity_score'] = similarities

    return options.nlargest(top_n, 'similarity_score')

# Generate itinerary with unique attractions and restaurants
def _flag_mask(frame, column):
    """Boolean mask of rows whose one-hot ``column`` equals 1; all False if the column is absent."""
    if column in frame.columns:
        return frame[column] == 1
    return pd.Series(False, index=frame.index)


def _nearby_names(lat, lon, options, max_distance):
    """Names of the ``options`` rows within range of (lat, lon), nearest first."""
    return [row['name'] for row, _ in get_nearby_options(lat, lon, options, max_distance)]


def generate_itinerary(user, hotels, attractions, restaurants, max_hours_per_day=8):
    """Build a day-by-day itinerary for one user.

    Destinations are the user's ``destination_*`` columns equal to 1; the text
    after ``destination_`` is the destination name, so multi-word names such
    as ``nuwara_eliya`` are kept whole. Days are shared out by
    ``allocate_days_to_destinations``. For each destination one hotel flagged
    ``city_<destination>`` is drawn at random (unseeded) and used for all of
    that destination's days.

    Each day receives:

    * up to one restaurant per meal (breakfast, lunch, dinner), taken from
      restaurants in the same city that match the user's dietary flag and the
      meal's ``mealtype_*`` flag, nearest to the hotel first;
    * up to four attractions, nearest to the hotel first. Attractions are
      limited by distance only, not by city.

    No restaurant or attraction name is used twice in the itinerary.

    A destination with no matching hotel, including one whose ``city_*``
    column does not exist, produces days with ``'No hotel'`` and empty lists.
    A missing city or dietary column on ``restaurants`` yields no restaurants
    instead of raising.

    The distance scans depend only on the hotel, so they run once per
    destination and are reused for each of its days.

    Args:
        user: Series with ``numberofdays``, ``maximum_distance`` and the
            one-hot ``destination_*`` / ``food_preference_*`` flags.
        hotels: DataFrame of hotels with ``name``, coordinates and ``city_*``
            flags.
        attractions: DataFrame of attractions with ``name`` and coordinates.
        restaurants: DataFrame of restaurants with ``name``, coordinates and
            ``city_*`` / ``dietary_*`` / ``mealtype_*`` flags.
        max_hours_per_day: Accepted for compatibility; not used by the
            current selection logic.

    Returns:
        Dict mapping ``'Day N'`` to ``{'Hotel': ..., 'Restaurants': [...],
        'Attractions': [...]}``, in day order.
    """
    num_days = int(user['numberofdays'])
    max_distance = float(user['maximum_distance'])

    selected_destinations = [
        col.split('destination_', 1)[1]
        for col in user.index
        if 'destination_' in col and user[col] == 1
    ]
    days_per_destination = allocate_days_to_destinations(selected_destinations, num_days)

    itinerary = {}
    used_restaurants = set()
    used_attractions = set()

    # Determine user's food preference (Veg/Non-Veg)
    food_preference = 'non_veg' if user.get('food_preference_non_veg', 0) == 1 else 'veg'
    food_preference_column = f'dietary_{food_preference}'

    for destination, days in days_per_destination.items():
        hotel_city_col = f'city_{destination.lower()}'
        available_hotels = hotels[_flag_mask(hotels, hotel_city_col)]

        if available_hotels.empty:
            hotel_name = 'No hotel'
            meal_candidates = {}
            attraction_candidates = []
        else:
            hotel = available_hotels.sample(1).iloc[0]
            hotel_name = hotel['name']
            hotel_lat, hotel_lon = hotel['latitude'], hotel['longitude']

            local_restaurants = restaurants[
                _flag_mask(restaurants, hotel_city_col)
                & _flag_mask(restaurants, food_preference_column)
            ]

            # Candidate names per meal, nearest first; meals without a flag column are skipped
            meal_candidates = {}
            for meal in ['Breakfast', 'Lunch', 'Dinner']:
                meal_column = f'mealtype_{meal.lower()}'
                if meal_column in restaurants.columns:
                    meal_options = local_restaurants[local_restaurants[meal_column] == 1]
                    meal_candidates[meal] = _nearby_names(hotel_lat, hotel_lon, meal_options, max_distance)

            attraction_candidates = _nearby_names(hotel_lat, hotel_lon, attractions, max_distance)

        for _ in range(days):
            day_name = f'Day {len(itinerary) + 1}'
            day_plan = {'Hotel': hotel_name, 'Restaurants': [], 'Attractions': []}
            itinerary[day_name] = day_plan

            # One not-yet-used restaurant per meal type
            for candidate_names in meal_candidates.values():
                for restaurant_name in candidate_names:
                    if restaurant_name not in used_restaurants:
                        day_plan['Restaurants'].append(restaurant_name)
                        used_restaurants.add(restaurant_name)
                        break

            # Up to four not-yet-used attractions
            for attraction_name in attraction_candidates:
                if attraction_name not in used_attractions:
                    day_plan['Attractions'].append(attraction_name)
                    used_attractions.add(attraction_name)
                    if len(day_plan['Attractions']) >= 4:
                        break

    return itinerary

# Build one itinerary per user row, stored under the user's name
all_itineraries = {}
for index, user in user_inputs.iterrows():
    itinerary = generate_itinerary(user, hotels, attractions, restaurants)
    all_itineraries.update({user['name']: itinerary})

# Print each user's plan, one line per day, followed by a blank separator line
for user_name in all_itineraries:
    itinerary = all_itineraries[user_name]
    num_days = len(itinerary)
    print(f'Itinerary for {user_name}: Days {num_days}')
    for day in itinerary:
        details = itinerary[day]
        print(f"{day}: Hotel: {details['Hotel']}, Restaurants: {details['Restaurants']}, Attractions: {details['Attractions']}")
    print("")

Mounted at /content/drive
Silhouette Score: 0.9810629659506325
Davies-Bouldin Index: 0.006689149419662749
Calinski-Harabasz Index: 1445023216.6764877
Hotel Recommendation Model Accuracy: 1.0
Itinerary for Mathusha: Days 1
Day 1: Hotel: Hornbill Holiday Home, Restaurants: ['The Station', 'Barracuda Seafood & Grill', 'Sugar Beach'], Attractions: ['Island Scuba', 'Sazy Lanka Tours', "Stone 'N' String", 'South Lanka Tours - Day Tours']

Itinerary for nielia: Days 4
Day 1: Hotel: Melford Heaven, Restaurants: ['Themparadu', 'Calamander Lake Gregory', 'Thinking Cup'], Attractions: ['Hakgala Botanic Gardens', 'Royal Turf Club', 'International Buddhist Centre', 'Suwamadu Ayurvedic Health Resort']
Day 2: Hotel: Melford Heaven, Restaurants: ['Araliya Green City Food Court', 'Queenswood Restaurant', "Country House Restaurant at Trevene's"], Attractions: ['Ruwan Taxi and Tours', 'Gregory Lake', 'Hikers City Tours', 'Horizon Tours Sri Lanka']
Day 3: Hotel: Melford Heaven, Restaurants: ['Barnes Hall'