In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
df = pd.read_csv('tourism_data.csv')

In [25]:
df.tail(200)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,AttractionCityId,AttractionTypeId,Attraction,AttractionAddress,AttractionType,ContenentId,RegionId,CountryId,CityId,CityName,Contenent,Country,Region,VisitMode,Overall_Avg_Rating,AttractionDetails
52721,210930,65802,2016,11,4,1297,3,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,3,14,101,3260,Surabaya,Asia,Indonesia,South East Asia,Friends,3.538847,yogyakarta palace surabaya
52722,210932,65874,2015,2,4,1297,4,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,3,11,74,2468,Taipei,Asia,Taiwan,East Asia,Friends,3.538847,yogyakarta palace taipei
52723,210933,65874,2015,2,4,1297,3,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,3,11,74,2468,Taipei,Asia,Taiwan,East Asia,Friends,3.538847,yogyakarta palace taipei
52724,210934,65931,2013,12,4,1297,3,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,3,14,106,3551,Singapore,Asia,Singapore,South East Asia,Friends,3.538847,yogyakarta palace singapore
52725,210935,66089,2017,4,2,1297,3,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,5,21,163,8745,Pontypridd,Europe,United Kingdom,Western Europe,Couples,3.538847,yogyakarta palace pontypridd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52916,211227,87100,2018,9,2,1297,4,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,5,21,159,7460,Lyon,Europe,France,Western Europe,Couples,3.538847,yogyakarta palace lyon
52917,211238,88112,2016,2,2,1297,5,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,5,17,133,6164,Bratislava Region,Europe,Slovakia,Central Europe,Couples,3.538847,yogyakarta palace bratislava region
52918,211239,88112,2016,2,2,1297,4,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,5,17,133,6164,Bratislava Region,Europe,Slovakia,Central Europe,Couples,3.538847,yogyakarta palace bratislava region
52919,211240,88112,2016,2,2,1297,4,3,44,Yogyakarta Palace,Yogyakarta,Historic Sites,5,17,133,6164,Bratislava Region,Europe,Slovakia,Central Europe,Couples,3.538847,yogyakarta palace bratislava region


In [6]:
# Aggregate duplicate entries by averaging ratings without modifying df
df_grouped = df.groupby(['UserId', 'AttractionId'], as_index=False).agg({'Rating': 'mean'})


In [7]:
# Pivot table for user-item matrix
user_item_matrix = df_grouped.pivot(index='UserId', columns='AttractionId', values='Rating').fillna(0)


In [8]:
# Convert user-item matrix to a sparse matrix
user_item_sparse = csr_matrix(user_item_matrix.values)

In [9]:
# Perform Singular Value Decomposition (SVD)
k = min(user_item_sparse.shape) - 1  # Ensure k is valid
k = max(1, min(50, k))  # Keep k reasonable
U, sigma, Vt = svds(user_item_sparse, k=k)
sigma = np.diag(sigma)

In [10]:
# Reconstruct predicted ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

In [11]:
def collaborative_recommend(user_id, preferred_city=None, preferred_type=None, n=5):
    if user_id not in predicted_ratings_df.index:
        return []
    sorted_ratings = predicted_ratings_df.loc[user_id].sort_values(ascending=False)


    recommendations = []
    seen_attractions = set()

    # Get attractions user already rated
    user_rated_attractions = set(df[df['UserId'] == user_id]['Attraction'])

    for aid in sorted_ratings.index:
        attraction_rows = df[df['AttractionId'] == aid]
        if attraction_rows.empty:
            continue

        for _, attraction_row in attraction_rows.iterrows():
            formatted_attraction = f"{attraction_row['Attraction']} ({attraction_row['CityName']})"

            # Skip if user already rated it
            if attraction_row['Attraction'] in user_rated_attractions:
                continue  
            
            # Apply filters
            if preferred_city and attraction_row['CityName'] != preferred_city:
                continue
            if preferred_type and attraction_row['AttractionType'] != preferred_type:
                continue

            recommendations.append(formatted_attraction)
            seen_attractions.add(attraction_row['Attraction'])  

            if len(recommendations) >= n:
                return recommendations

    # If CF fails, return popular attractions as backup
    if len(recommendations) < n:
        popular_attractions = (
            df[df['CityName'] == preferred_city]
            .groupby("Attraction")["Rating"]
            .mean()
            .sort_values(ascending=False)
            .index.tolist()
        )
        
        for att in popular_attractions:
            if att not in seen_attractions:
                recommendations.append(f"{att} ({preferred_city})")
            if len(recommendations) >= n:
                break

    return recommendations


In [12]:
# Content-Based Filtering
vectorizer = TfidfVectorizer(stop_words='english')
df['AttractionDetails'] = df['Attraction'].str.lower().str.strip() + " " + df['CityName'].str.lower().str.strip()
tfidf_matrix = vectorizer.fit_transform(df['AttractionDetails'])


In [13]:
# Use Nearest Neighbors with increased neighbors
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
knn_model.fit(tfidf_matrix)

In [14]:
# Function to get content-based recommendations using KNN with filtering
def content_recommend(attraction_name, preferred_city=None, preferred_type=None, n=5):
    attraction_name = attraction_name.lower().strip()
    if attraction_name not in df['Attraction'].str.lower().str.strip().values:
        return []
    idx = df[df['Attraction'].str.lower().str.strip() == attraction_name].index[0]
    distances, indices = knn_model.kneighbors(tfidf_matrix[idx], n_neighbors=20)
    recommended_attractions = []
    for i in indices.flatten()[1:]:  # Skip the first one (itself)
        attraction_row = df.iloc[i]
        if preferred_city and attraction_row['CityName'] != preferred_city:
            continue
        if preferred_type and attraction_row['AttractionType'] != preferred_type:
            continue
        formatted_attraction = f"{attraction_row['Attraction']} ({attraction_row['CityName']})"
        if formatted_attraction not in recommended_attractions:
            recommended_attractions.append(formatted_attraction)
        if len(recommended_attractions) >= n:
            break
    
    # Fallback to return at least n recommendations
    if len(recommended_attractions) < n:
        for i in indices.flatten()[1:]:
            attraction_row = df.iloc[i]
            formatted_attraction = f"{attraction_row['Attraction']} ({attraction_row['CityName']})"
            if formatted_attraction not in recommended_attractions:
                recommended_attractions.append(formatted_attraction)
            if len(recommended_attractions) >= n:
                break
    
    return recommended_attractions


In [15]:
def hybrid_recommend(user_id, attraction_name, preferred_city=None, preferred_type=None, cf_weight=0.5, cb_weight=0.5, n=5):
    cf_recommendations = collaborative_recommend(user_id, preferred_city, preferred_type, n=n)
    cb_recommendations = content_recommend(attraction_name, preferred_city, preferred_type, n=n)

    # Combine both, ensuring variety
    hybrid_results = list(set(cf_recommendations[:int(cf_weight * n)] + cb_recommendations[:int(cb_weight * n)]))

    if len(hybrid_results) < n:
        extra_items = list(set(cf_recommendations + cb_recommendations) - set(hybrid_results))
        hybrid_results.extend(extra_items[: (n - len(hybrid_results))])

    return hybrid_results[:n]


In [26]:
# Example Usage
user_id = 66089
attraction_name = "Yogyakarta Palace"
preferred_city = "Pontypridd"
preferred_type = "Historic Sites"
print("Collaborative Filtering Recommendations:", collaborative_recommend(user_id, preferred_city=preferred_city, preferred_type=preferred_type))
print("Content-Based Recommendations:", content_recommend(attraction_name, preferred_city=preferred_city, preferred_type=preferred_type))
print("Hybrid Recommendations:", hybrid_recommend(user_id, attraction_name, preferred_city=preferred_city, preferred_type=preferred_type))


Collaborative Filtering Recommendations: ['Seminyak Beach (Pontypridd)', 'Jomblang Cave (Pontypridd)', 'Ratu Boko Temple (Pontypridd)', 'Ullen Sentalu Museum (Pontypridd)', 'Ramayana Ballet at Prambanan (Pontypridd)']
Content-Based Recommendations: ['Yogyakarta Palace (Jakarta)']
Hybrid Recommendations: ['Yogyakarta Palace (Jakarta)', 'Seminyak Beach (Pontypridd)', 'Jomblang Cave (Pontypridd)', 'Ratu Boko Temple (Pontypridd)', 'Ullen Sentalu Museum (Pontypridd)']


In [17]:
print(collaborative_recommend(user_id))  


['Balekambang Beach (Uttarakhand)', 'Balekambang Beach (Bali)', 'Balekambang Beach (Amsterdam)', 'Balekambang Beach (Malang)', 'Balekambang Beach (Batu Caves)']


In [18]:
print(collaborative_recommend(user_id, preferred_city="Guildford"))  
print(collaborative_recommend(user_id, preferred_type="Nature & Wildlife Areas"))


['Bromo Tengger Semeru National Park (Guildford)', 'Nusa Dua Beach (Guildford)', 'Nusa Dua Beach (Guildford)', 'Nusa Dua Beach (Guildford)', 'Nusa Dua Beach (Guildford)']
['Sempu Island (Yogyakarta)', 'Sempu Island (Jakarta)', 'Sempu Island (Batu)', 'Sempu Island (Kuala Lumpur)', 'Sempu Island (El Cerrito)']


In [19]:
print(df[df['UserId'] == user_id][['Attraction', 'CityName', 'AttractionType']])


                       Attraction   CityName           AttractionType
0  Sacred Monkey Forest Sanctuary  Guildford  Nature & Wildlife Areas


In [20]:
import joblib

# Save Collaborative Filtering Data (SVD-based predicted ratings)
joblib.dump(predicted_ratings_df, "predicted_ratings.pkl")

# Save Content-Based Filtering Model (TF-IDF + KNN)
joblib.dump(knn_model, "knn_model.pkl")
joblib.dump(tfidf_matrix, "tfidf_matrix.pkl")

# Save User-Item Matrix for Recommendations
joblib.dump(user_item_matrix, "user_item_matrix.pkl")

# Save Attraction Mapping (Attraction ID to Name)
joblib.dump(df.set_index("AttractionId")["Attraction"].to_dict(), "attraction_mapping.pkl")

joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ Models and data saved successfully!")


✅ Models and data saved successfully!
