In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [8]:
df=pd.read_csv("tourism_sql.csv")

Content based

Based on attraction type

In [17]:
df['Rating'] = df['Rating'].astype(str)
tourism = df.copy() 
tourism = tourism.sample(n=3000, random_state=42).reset_index(drop=True)
# STEP 2: Drop duplicate attractions based on content
tourism = tourism.drop_duplicates(
    subset=['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating'],
    keep='first'
).reset_index(drop=True)
# ‚úÖ Create content from key features
tourism['Attraction_recomendation'] = (
    tourism['Attraction'] + ' ' +
    tourism['AttractionType']+ ' ' +
    tourism['CityName'] + ' ' +
    tourism['Country'] + ' ' +
    tourism['Rating'] 
)

In [18]:
# ‚úÖ Build TF-IDF matrix once (don't crash)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tourism['Attraction_recomendation'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
def recommend_attractions(attraction_name, city_name, df=tourism, cosine_sim=cosine_sim):
    # Get the index matching both Attraction and City
    idx = df[(df['Attraction'] == attraction_name) & (df['CityName'] == city_name)].index

    if idx.empty:
        return f"Attraction '{attraction_name}' in '{city_name}' not found."

    idx = idx[0]

    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort by similarity score (excluding itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]

    # Get indices of top 5
    top_indices = [i[0] for i in sim_scores]

    # Return recommended attractions
    return df[['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating']].iloc[top_indices]


In [20]:
recommend_attractions("Uluwatu Temple", "London")

Unnamed: 0,Attraction,CityName,Country,AttractionType,Rating
1701,Uluwatu Temple,London,United Kingdom,Religious Sites,5
659,Uluwatu Temple,Canada,Canada,Religious Sites,2
586,Tanah Lot Temple,London,United Kingdom,Religious Sites,4
2103,Tanah Lot Temple,London,United Kingdom,Religious Sites,1
2120,Tanah Lot Temple,London,United Kingdom,Religious Sites,5


Suggest attractions similar to those already visited by the user based on features like attraction type, location, and amenities.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# STEP 1: Preprocessing
tourism = df.copy()

tourism.drop_duplicates(
    subset=['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating'],
    inplace=True
)
tourism.reset_index(drop=True, inplace=True) 

# Combine features into a text field
tourism['Attraction_recommendation'] = (
    tourism['Attraction'].astype(str) + ' ' +
    tourism['AttractionType'].astype(str) + ' ' +
    tourism['CityName'].astype(str) + ' ' +
    tourism['Country'].astype(str) + ' ' +
    tourism['Rating'].astype(str)
)

# STEP 2: TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tourism['Attraction_recommendation'])

# STEP 3: User profile generator
def build_user_profile(user_id, df=tourism, tfidf_matrix=tfidf_matrix):
    visited = df[df['UserId'] == user_id]
    if visited.empty:
        return None, "User has not visited any attractions."

    # Get indices of user's visited attractions
    user_indices = visited.index.tolist()

    # Average TF-IDF vector for visited attractions
    user_profile_vector = tfidf_matrix[user_indices].mean(axis=0)

    return user_profile_vector, visited

# STEP 4: Recommend attractions based on user profile
def recommend_for_user(user_id, df=tourism, tfidf_matrix=tfidf_matrix, top_n=5):
    user_profile_vector, visited = build_user_profile(user_id, df, tfidf_matrix)

    if user_profile_vector is None:
        return visited  # message

    # Compute cosine similarity between user profile and all attractions
    sim_scores = cosine_similarity(np.asarray(user_profile_vector), tfidf_matrix).flatten()


    # Filter out attractions already visited
    visited_indices = set(visited.index)
    recommendations = [
        (i, score) for i, score in enumerate(sim_scores) if i not in visited_indices
    ]

    # Sort and select top N
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    top_indices = [i[0] for i in recommendations]

    return df[['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating']].iloc[top_indices]

# STEP 5: Example usage
user_id = 694  # Replace with actual UserId
print(recommend_for_user(user_id))


              Attraction     CityName    Country   AttractionType Rating
10495     Uluwatu Temple  Rockhampton  Australia  Religious Sites      4
11448   Tanah Lot Temple  Rockhampton  Australia  Religious Sites      4
11449  Kuta Beach - Bali  Rockhampton  Australia          Beaches      5
11445     Nusa Dua Beach  Rockhampton  Australia          Beaches      4
11446     Nusa Dua Beach  Rockhampton  Australia          Beaches      5


collabrative

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Found existing installation: category_encoders 2.8.1
Uninstalling category_encoders-2.8.1:
  Successfully uninstalled category_encoders-2.8.1
Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.11.0
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting category_encoders==2.6.1
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl.metadata (7.9 kB)
Downloading scikit_learn-1.3.2-cp312-cp312-win_amd64.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.1 MB 4.8 MB/s eta 0:00:02
   --------- ------------------------------ 2.1/9


1.Create a user-attraction matrix of ratings

2.Compute user-user similarity

3.Recommend attractions liked by similar users that the target user hasn't rated

Item based:

Item-Based Collaborative Filtering: Find attractions similar to those the target user has rated highly (based on other users‚Äô ratings) and recommend them.

In [22]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Step 1: Load your data
# Replace with your file path or DataFrame as needed
df=pd.read_csv("tourism_sql.csv")
# Step 2: Create User-Attraction Ratings Matrix
ratings_matrix = df.pivot_table(index='UserId', columns='AttractionId', values='Rating')

# Step 3: Fill NaN with 0 (or use mean imputation if preferred)
ratings_filled = ratings_matrix.fillna(0)

# Step 4: Calculate Item-Item Similarity (cosine similarity)
item_similarity = cosine_similarity(ratings_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, 
                                   index=ratings_filled.columns, 
                                   columns=ratings_filled.columns)

# Step 5: Define a function to recommend similar attractions
def recommend_attractions(user_id, top_n=5):
    user_ratings = ratings_filled.loc[user_id]
    unrated_items = user_ratings[user_ratings == 0]
    
    scores = {}
    for item_id in unrated_items.index:
        sim_items = item_similarity_df[item_id]
        user_rated_items = user_ratings[user_ratings > 0]
        weighted_sum = sum(sim_items[user_rated_items.index] * user_rated_items)
        sim_sum = sum(sim_items[user_rated_items.index])
        scores[item_id] = weighted_sum / sim_sum if sim_sum != 0 else 0

    recommended = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return [(item_id, df[df['AttractionId'] == item_id]['Attraction'].iloc[0]) for item_id, _ in recommended]

# Example usage
user_id = 70333  # Replace with an actual UserId from your dataset
recommendations = recommend_attractions(user_id, top_n=5)

# Print recommendations
print("Top recommendations for user", user_id)
for item_id, attraction in recommendations:
    print(f"- {attraction} (Attraction ID: {item_id})")


Top recommendations for user 70333
- Waterbom Bali (Attraction ID: 841)
- Kuta Beach - Bali (Attraction ID: 369)
- Nusa Dua Beach (Attraction ID: 481)
- Sanur Beach (Attraction ID: 650)
- Seminyak Beach (Attraction ID: 673)


In [9]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# --- Load and prepare data ---
# Ensure your DataFrame `df` contains: UserId, AttractionId, Rating, Attraction
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['UserId', 'AttractionId', 'Rating']], reader)

# --- Train/test split ---
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# --- Train model ---
algo = SVD()
algo.fit(trainset)

# --- Predict on testset and calculate RMSE, MAE ---
test_predictions = algo.test(testset)
rmse = accuracy.rmse(test_predictions)
mae = accuracy.mae(test_predictions)

# --- Top-N Recommendations for a specific user ---
user_id = str(70333)
all_attractions = df['AttractionId'].unique()
seen = df[df['UserId'] == int(user_id)]['AttractionId'].tolist()

# Predict ratings for unseen attractions
predictions = [algo.predict(user_id, str(i)) for i in all_attractions if i not in seen]
top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:5]

# Convert to DataFrame
recommendations_df = pd.DataFrame({
    "AttractionId": [int(pred.iid) for pred in top_n],
    "Attraction": [df[df['AttractionId'] == int(pred.iid)]['Attraction'].iloc[0] for pred in top_n],
    "Predicted Rating": [round(pred.est, 2) for pred in top_n]
})

print("Top-5 Recommendations:")
print(recommendations_df)

# --- Evaluation: Precision@5, Recall@5, MAP@5 ---

# Relevant items = attractions the user rated >= 4
relevant = df[(df['UserId'] == int(user_id)) & (df['Rating'] >= 4)]['AttractionId'].tolist()
recommended = [int(pred.iid) for pred in top_n]

# Precision@5
relevant_recommended = [aid for aid in recommended if aid in relevant]
precision_at_5 = len(relevant_recommended) / len(recommended)

# Recall@5
recall_at_5 = len(relevant_recommended) / len(relevant) if relevant else 0

# MAP@5
def average_precision(recommended, relevant):
    hits, sum_precisions = 0, 0
    for i, aid in enumerate(recommended):
        if aid in relevant:
            hits += 1
            sum_precisions += hits / (i + 1)
    return sum_precisions / hits if hits else 0

map_at_5 = average_precision(recommended, relevant)

# --- Display evaluation ---
print(f"\nEvaluation for User {user_id}:")
print(f"Precision@5: {precision_at_5:.2f}")
print(f"Recall@5: {recall_at_5:.2f}")
print(f"MAP@5: {map_at_5:.2f}")


RMSE: 0.9200
MAE:  0.7190
Top-5 Recommendations:
   AttractionId               Attraction  Predicted Rating
0           748  Tegalalang Rice Terrace              4.16
1           841            Waterbom Bali              4.16
2           369        Kuta Beach - Bali              4.16
3           650              Sanur Beach              4.16
4           673           Seminyak Beach              4.16

Evaluation for User 70333:
Precision@5: 0.00
Recall@5: 0.00
MAP@5: 0.00


üîç Your Results:
üî¢ Error Metrics (All Users):
RMSE: 0.9200 ‚Äî average error in predicted ratings is less than 1 unit (good).

MAE: 0.7190 ‚Äî average absolute error is ~0.72 stars (reasonable).

‚úÖ This means your SVD model is learning patterns reasonably well on rating prediction.

üìä Recommendation Quality (User 70333):
Precision@5: 0.00

Recall@5: 0.00

MAP@5: 0.00

‚ö†Ô∏è This means none of the recommended items (Top-5) were among those the user rated ‚â• 4, i.e., they were not relevant to this user.


User-Based Collaborative Filtering in Python using svd

In [None]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

# Prepare the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['UserId', 'AttractionId', 'Rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()
algo.fit(trainset)

# Predict for a specific user
user_id = str(70333)
all_attractions = df['AttractionId'].unique()
seen = df[df['UserId'] == int(user_id)]['AttractionId'].tolist()

# Predict ratings for unseen attractions
predictions = [algo.predict(user_id, str(i)) for i in all_attractions if i not in seen]
top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:5]

# Convert to DataFrame
recommendations_df = pd.DataFrame({
    "AttractionId": [int(pred.iid) for pred in top_n],
    "Attraction": [df[df['AttractionId'] == int(pred.iid)]['Attraction'].iloc[0] for pred in top_n],
    "Predicted Rating": [round(pred.est, 2) for pred in top_n]
})

# Display as table
print(recommendations_df)


   AttractionId               Attraction  Predicted Rating
0           748  Tegalalang Rice Terrace              4.16
1           841            Waterbom Bali              4.16
2           369        Kuta Beach - Bali              4.16
3           650              Sanur Beach              4.16
4           673           Seminyak Beach              4.16


Item-Based Collaborative Filtering using SVD

In [None]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Prepare the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['UserId', 'AttractionId', 'Rating']], reader)

trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Step 1: Build mapping of internal IDs to attraction IDs
raw_to_inner = trainset.to_inner_iid
inner_to_raw = trainset.to_raw_iid

# Step 2: Extract item (attraction) latent vectors
item_factors = np.array([algo.qi[raw_to_inner(i)] for i in df['AttractionId'].unique() if raw_to_inner(i) < len(algo.qi)])
item_ids = [i for i in df['AttractionId'].unique() if raw_to_inner(i) < len(algo.qi)]

# Step 3: Compute cosine similarity between items
item_sim = cosine_similarity(item_factors)

# Step 4: Create a DataFrame for similarity
sim_df = pd.DataFrame(item_sim, index=item_ids, columns=item_ids)

# Step 5: Define function to get top N similar attractions
def get_similar_attractions(attraction_id, top_n=5):
    if attraction_id not in sim_df.index:
        return pd.DataFrame()

    similar_scores = sim_df[attraction_id].sort_values(ascending=False)[1:top_n+1]  # Skip self (first row)
    result = pd.DataFrame({
        "AttractionId": similar_scores.index,
        "Attraction": [df[df['AttractionId'] == i]['Attraction'].iloc[0] for i in similar_scores.index],
        "Similarity": similar_scores.values
    })
    return result

# Example: get similar attractions to 'Uluwatu Temple' (ID = 824)
similar_attractions_df = get_similar_attractions(824)
print(similar_attractions_df)


   AttractionId                    Attraction  Similarity
0          1220  Ramayana Ballet at Prambanan    0.243197
1           947          Mount Semeru Volcano    0.139519
2           749          Tegenungan Waterfall    0.115407
3           737              Tanah Lot Temple    0.113244
4           913                Goa Cina Beach    0.111410


Hybrid Recommendation System (CF + CBF) using svd

Creating a hybrid recommendation system combines the strengths of both:

Collaborative Filtering (CF): Learns from user-item interactions.

Content-Based Filtering (CBF): Uses item features to find similar content.

SVD-based Collaborative Filtering: to predict how much a user would like an item.

TF-IDF-based Content Similarity: to adjust recommendations with item similarity.

üîß Hybrid Strategy For a given user + attraction, we:

Use SVD to predict rating for unseen attractions.

Use content similarity to filter those that are most similar to what the user has rated highly.

Combine both scores (weighted average).

‚öñÔ∏è Parameter alpha alpha = 1.0: pure collaborative

alpha = 0.0: pure content-based

alpha = 0.5: balanced hybrid

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader

# Step 1: Prepare content-based data
df['Rating'] = df['Rating'].astype(str)
tourism = df.sample(n=3000, random_state=42).drop_duplicates(
    subset=['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating']
).reset_index(drop=True)

tourism['Attraction_recommendation'] = (
    tourism['Attraction'] + ' ' +
    tourism['AttractionType'] + ' ' +
    tourism['CityName'] + ' ' +
    tourism['Country'] + ' ' +
    tourism['Rating']
)

# Step 2: Compute TF-IDF content similarity matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tourism['Attraction_recommendation'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 3: Collaborative Filtering with SVD
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['UserId', 'AttractionId', 'Rating']].astype(float), reader)
trainset = data.build_full_trainset()
svd_model = SVD()
svd_model.fit(trainset)

# Step 4: Hybrid Recommendation Function
def hybrid_recommend(user_id, liked_attraction, city_name, top_n=5, alpha=0.5):
    # Find index of liked attraction
    idx = tourism[(tourism['Attraction'] == liked_attraction) & (tourism['CityName'] == city_name)].index
    if idx.empty:
        return f"Attraction '{liked_attraction}' in '{city_name}' not found."

    idx = idx[0]

    # Step A: Get similar attractions based on content
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]  # Take top 50 similar for performance
    candidate_indices = [i[0] for i in sim_scores]

    # Step B: Predict ratings with collaborative filtering
    hybrid_scores = []
    for i in candidate_indices:
        attraction_id = tourism.loc[i, 'AttractionId']
        pred = svd_model.predict(str(user_id), str(attraction_id)).est
        content_score = cosine_sim[idx][i]
        hybrid_score = alpha * pred + (1 - alpha) * content_score #2.85
        hybrid_scores.append((i, hybrid_score))

    # Step C: Sort by hybrid score
    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Step D: Return recommended attractions
    top_indices = [i[0] for i in hybrid_scores]
    return tourism[['Attraction', 'CityName', 'Country', 'AttractionType', 'Rating']].iloc[top_indices].reset_index(drop=True)

In [25]:
hybrid_recommend(user_id=70333, liked_attraction="Uluwatu Temple", city_name="Saratoga", top_n=5, alpha=0.7)

Unnamed: 0,Attraction,CityName,Country,AttractionType,Rating
0,Uluwatu Temple,United States,United States,Religious Sites,5
1,Uluwatu Temple,Amsterdam,United States,Religious Sites,5
2,Uluwatu Temple,Houston,United States,Religious Sites,3
3,Uluwatu Temple,Austin,United States,Religious Sites,4
4,Tanah Lot Temple,United States,United States,Religious Sites,5
