#Data Loading

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd


places_df = pd.read_csv('/content/drive/My Drive/codex/places.csv')
users_df = pd.read_csv('/content/drive/My Drive/codex/users.csv')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Data Cleaning

In [49]:
import re

def contains_non_ascii(text):
    return bool(re.search(r'[^\x00-\x7F]', text))

# Function to check for non-ASCII characters in a list or string
def check_non_ascii(item):
    if isinstance(item, list):
        return any(contains_non_ascii(sub_item) for sub_item in item if isinstance(sub_item, str))
    elif isinstance(item, str):
        return contains_non_ascii(item)
    else:
        return False

# Apply the function to the entire DataFrame
non_ascii_check = places_df.applymap(check_non_ascii)

# Count non-ASCII containing cells for each column
non_ascii_count = non_ascii_check.sum()

print("\nCount of cells containing non-ASCII characters for each column:")
print(non_ascii_count)


Count of cells containing non-ASCII characters for each column:
name                   37
lat                     0
lng                     0
formatted_address       6
rating                  0
user_ratings_total      0
latest_reviews        360
dtype: int64


  non_ascii_check = places_df.applymap(check_non_ascii)


In [50]:
import re

for i in range (places_df.shape[0]):
  #Cleaning Name
  places_df.at[i, 'name'] = re.sub(r'[^\x00-\x7F]+', '', places_df['name'][i])
  #Cleaning Reviews
  text = places_df['latest_reviews'][i].replace("Ã¢Â€Â™", "'") # Replace the non-ASCII apostrophe with the ASCII apostrophe
  cleaned_text = re.sub(r'[^\x00-\x7F]+', '', text) #Remove other unrecognizable non ascii character sequences
  reviews = re.split(r"'\n*\s*\n*,\s*\n*\n*'", cleaned_text) #individual reviews
  reviews = [review.strip(" []\n'") for review in reviews] # Remove any leading/trailing whitespace and brackets
  places_df.at[i, 'latest_reviews'] = reviews

In [51]:
places_df[places_df['lat'].isnull()]

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
89,Leisure World,,,Sri Lanka,,,"[Leisure World has potential, but my experienc..."


In [52]:
!pip install vaderSentiment



In [53]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [54]:
def analyze_reviews(row):
    # If the rating cell is not empty, skip calculation
    if pd.notna(row['rating']) and pd.notna(row['user_ratings_total']):
        return row['rating'], row['user_ratings_total']

    reviews = row['latest_reviews']
    ratings = []
    for review in reviews:
        sentiment_score = analyzer.polarity_scores(review)['compound']

        # Assign rating based on sentiment score
        if sentiment_score >= 0.7:
            ratings.append(5)
        elif sentiment_score >=0.5:
            ratings.append(4)
        elif sentiment_score >=0:
            ratings.append(3)
        elif sentiment_score >=-0.6:
            ratings.append(2)
        else:
            ratings.append(1)

    # Calculate overall rating and total reviews
    overall_rating = sum(ratings) / len(ratings) if ratings else None
    total_reviews = len(reviews)

    return overall_rating, total_reviews

# Apply function to DataFrame (only if rating or user_ratings_total is empty)
places_df['rating'], places_df['user_ratings_total'] = zip(*places_df.apply(analyze_reviews, axis=1))

# print the null counts of each column
print(places_df.isnull().sum())

name                  0
lat                   1
lng                   1
formatted_address     0
rating                0
user_ratings_total    0
latest_reviews        0
dtype: int64


In [55]:
places_df.to_csv('/content/drive/My Drive/codex/places_updated.csv', index=False)

In [56]:
print(places_df.head())
print(len(places_df["latest_reviews"][1]))

                             name       lat        lng  \
0                Arugam Bay Beach  6.840408  81.836848   
1                   Mirissa Beach  5.944703  80.459161   
2  Weligama Beach (surf and stay)  5.972486  80.435714   
3                        Ahangama  5.973975  80.362159   
4                 Hikkaduwa Beach  6.137727  80.099060   

             formatted_address  rating  user_ratings_total  \
0  Arugam Bay Beach, Sri Lanka     4.8              1591.0   
1           Mirissa, Sri Lanka     4.6              1748.0   
2          Weligama, Sri Lanka     4.4               325.0   
3          Ahangama, Sri Lanka     3.6                 5.0   
4   Hikkaduwa Beach, Sri Lanka     4.7              1438.0   

                                      latest_reviews  
0  [Arugam Bay Beach is a surfer's paradise! I sp...  
1  [Mirissa Beach is truly a gem on Sri Lanka's s...  
2  [Weligama Beach is a fantastic spot for both b...  
3  [Ahangama was a bit disappointing for me as a ...  
4  

In [57]:
rows = []
import ast
# Iterate through each row in the original DataFrame
# for _, row in users_df.iterrows():
#     username = row['User ID']
#     destinations = row['Bucket list destinations Sri Lanka']
#     # Convert the string representation of the list to a list
#     actual_destinations = ast.literal_eval(destinations)
#     # Create a new row for each destination
#     for destination in actual_destinations:
#         destination = destination.strip()
#         rows.append({'User ID': username, 'destination': destination})
users_df["Bucket list destinations Sri Lanka"] = [ ast.literal_eval(destinations) for destinations in users_df["Bucket list destinations Sri Lanka"]]
users_df["Preferred Activities"] = [ ast.literal_eval(activities) for activities in users_df["Preferred Activities"]]
users_df.head()
# Create a new DataFrame from the list of rows
# df_new_users = pd.DataFrame(rows)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu..."
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ..."
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City..."
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc..."


#Creating places embeddings

In [58]:
!pip install tensorflow



In [59]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_review(review):
    # Convert to lowercase
    review = review.lower()

    # Tokenize the review
    tokens = word_tokenize(review)

    # Remove punctuation and stopwords
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    tokens = ' '.join(tokens)

    return tokens

# Example usage
review = "This is an example review that needs to be preprocessed and padded."
processed_tokens = preprocess_review(review)
print(processed_tokens)



example review need preprocessed padded


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
places_df.head()


Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,[Arugam Bay Beach is a surfer's paradise! I sp...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,[Mirissa Beach is truly a gem on Sri Lanka's s...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,[Weligama Beach is a fantastic spot for both b...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",3.6,5.0,[Ahangama was a bit disappointing for me as a ...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,[Hikkaduwa Beach is a delightful escape for so...


In [61]:
users_df.head()

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu..."
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ..."
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City..."
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc..."


In [62]:
!pip install sentence_transformers




In [63]:
from sentence_transformers import SentenceTransformer, util
import numpy as np


model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_review_embedding(review):

    # processed_reviews = [preprocess_review(review) for review in reviews]

    # processed_reviews = [' '.join(tokens) for tokens in processed_reviews]

    embeddings = model.encode(preprocess_review(review))
    # place_embedding = np.mean(embeddings, axis=0)
    return embeddings
def generate_place_embeddings(places_reviews):
    new_df = pd.DataFrame(columns=[ 'name', 'review_embedding'])
    for place, reviews in places_reviews.items():
        for review in reviews:

          new_df = pd.concat([new_df, pd.DataFrame([{'name': place, 'review_embedding': get_review_embedding(review)}])], ignore_index=True)
        print(place)

    return new_df



In [64]:
places_reviews = {}

for index, row in places_df.iterrows():
    place_name = row['name']
    reviews = row['latest_reviews']
    places_reviews[place_name] = reviews

In [65]:
place_embeddings = generate_place_embeddings(places_reviews)
print(place_embeddings)

Arugam Bay Beach
Mirissa Beach
Weligama Beach (surf and stay)
Ahangama
Hikkaduwa Beach
Tangalle
Unawatuna Beach
Pigeon Island
Galle Dutch Fort
Polonnaruwa Ancient City
Sigiriya
Yala National Park
Udawalawe National Park
Wilpattu National Park
Wasgamuwa National Park
Minneriya National Park
Sinharaja Forest Reserve
Horton Plains National Park
Kumana National Park
Bundala National Park
Anawilundawa Wetland
Sri Dalada Maligawa
Dambulla Royal Cave Temple and Golden Temple
Anuradhapura
Mihintale
Arankale Buddhist Monastery
Ella Rock Trailhead
Yapahuwa Rock Fortress
Knuckles
Surathali Ella
Sri Pada / Adam's Peak
Ventura Beach
Tangalle Beach
Jungle Beach
Uppuveli
Koggala
Hiriketiya Beach
Marakolliya
Colombo National Museum
Dhanaja Gem Museum
Dutch Museum
National Museum Galle
Kandy National Museum
Maritime Museum
Trincomalee Harbour
Negombo Lagoon
Mirissa
Port City Colombo
Galle Lighthouse
Pidurangala Rock
Pitawala Nature Trail
Kitulgala
Coral Sanctuary Boat Ticket Issue Center
Jaffna Public 

In [67]:
import pickle

# Save the DataFrame with embeddings using pickle
def save_embeddings_to_pickle(embeddings_df, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(embeddings_df, f)

# Example
save_embeddings_to_pickle(place_embeddings, '/content/drive/My Drive/codex/place_embeddings.pkl')
# loaded_df = pickle.load(open('place_embeddings.pkl', 'rb'))  # or joblib.load()


#Content Based Filtering

In [None]:
import ast
import torch


def get_top_places_for_activities(activities, place_emb, top_n=4):
    results = {}
    activities = ('').join(activities)
    # Get embeddings for the preferred activities
    activity_embedding = torch.tensor(get_review_embedding(activities)).squeeze()
    activity_results = []
    # Iterate through activities



        # Compute cosine similarity for each place
    for place_name, row in place_emb.iterrows():
            place_embedding = torch.tensor(row["review_embedding"]).squeeze()

            # Compute cosine similarity between the activity and the place
            cosine_similarity = util.pytorch_cos_sim(activity_embedding, place_embedding)

            # Append the place name and similarity tensor to activity_results
            activity_results.append((place_name, cosine_similarity))

        # Sort the places by cosine similarity in descending order
    activity_results = sorted(activity_results, key=lambda x: x[1], reverse=True)

        # Remove duplicate places, keeping only the first occurrence
    seen_places = set()
    unique_results = []
    for place_name, similarity in activity_results:

        if place_emb["name"][place_name] not in seen_places:

                unique_results.append((place_name, similarity))
                seen_places.add(place_emb["name"][place_name])

        # Store the top `top_n` places for the activity (after removing duplicates)


    return unique_results[:top_n]
new_places = []
y = 0
for i in users_df["User ID"]:
  # if i == 5:
  #   break
  print(y,"/", len(users_df))
  activities = users_df.loc[users_df['User ID'] == i, "Preferred Activities"].values[0]
  top_places = get_top_places_for_activities(activities, place_embeddings, top_n=4)
  placesk = []
  for activity, places in top_places:

      placesk.append((place_embeddings["name"][activity], places))
  new_places.append(placesk)
  y += 1
print(new_places)


# users_df["preferred_places"] = new_places
# users_df.head()

# Example usage:
# activities = users_df.loc[users_df['User ID'] == 1, "Preferred Activities"].values[0]
# top_places = get_top_places_for_activities(activities, place_embeddings, top_n=4)
# print(top_places)

# Get top 2 places for each activity

# print(place_embeddings["name"][0])

# for activity, places in top_places.items():
#     print(f"Top places for activity '{activity}':")
#     for place, similarity in places:
#         place1 = place_embeddings["name"][place]
#         print(f"  Place: {place1}, Cosine Similarity: {similarity}")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5001 / 10000
5002 / 10000
5003 / 10000
5004 / 10000
5005 / 10000
5006 / 10000
5007 / 10000
5008 / 10000
5009 / 10000
5010 / 10000
5011 / 10000
5012 / 10000
5013 / 10000
5014 / 10000
5015 / 10000
5016 / 10000
5017 / 10000
5018 / 10000
5019 / 10000
5020 / 10000
5021 / 10000
5022 / 10000
5023 / 10000
5024 / 10000
5025 / 10000
5026 / 10000
5027 / 10000
5028 / 10000
5029 / 10000
5030 / 10000
5031 / 10000
5032 / 10000
5033 / 10000
5034 / 10000
5035 / 10000
5036 / 10000
5037 / 10000
5038 / 10000
5039 / 10000
5040 / 10000
5041 / 10000
5042 / 10000
5043 / 10000
5044 / 10000
5045 / 10000
5046 / 10000
5047 / 10000
5048 / 10000
5049 / 10000
5050 / 10000
5051 / 10000
5052 / 10000
5053 / 10000
5054 / 10000
5055 / 10000
5056 / 10000
5057 / 10000
5058 / 10000
5059 / 10000
5060 / 10000
5061 / 10000
5062 / 10000
5063 / 10000
5064 / 10000
5065 / 10000
5066 / 10000
5067 / 10000
5068 / 10000
5069 / 10000
5070 / 10000
5071 / 10000
5072 / 10000

In [None]:
users_df["preferred_places"] = new_places

In [None]:
users_df.to_csv('/content/drive/My Drive/codex/users_updated.csv', index=False)

In [76]:

def bayesian_average(row, global_avg, min_threshold):
    """
    Calculate Bayesian average for a given row (place) based on its rating and user_ratings_total.

    Args:
    - row: A row of the DataFrame with 'rating' and 'user_ratings_total' columns.
    - global_avg: Global average rating across all places.
    - min_threshold: Minimum number of ratings for Bayesian adjustment.

    Returns:
    - The Bayesian average rating for the place.
    """
    avg_rating = row['rating']
    num_ratings = row['user_ratings_total']

    # Bayesian average formula
    weighted_rating = (avg_rating * num_ratings + global_avg * min_threshold) / (num_ratings + min_threshold)

    return weighted_rating

# Assuming you have a DataFrame named 'places_df' with 'rating' and 'user_ratings_total' columns
def calculate_bayesian_ratings(places_df, min_threshold=100):
    # Step 1: Fill NaN values in user_ratings_total with the column's mean
    user_ratings_total_mean = places_df['user_ratings_total'].mean()
    places_df['user_ratings_total'].fillna(user_ratings_total_mean, inplace=True)

    # Step 2: Calculate the global average rating across all places
    global_avg_rating = places_df['rating'].mean()

    # Step 3: Apply the Bayesian average calculation for each row
    places_df['bayesian_rating'] = places_df.apply(
        lambda row: bayesian_average(row, global_avg_rating, min_threshold), axis=1
    )

    return places_df

# Example usage:
# Assuming your DataFrame is called places_df with 'rating' and 'user_ratings_total' columns
places_df = calculate_bayesian_ratings(places_df)
places_df.head()
places_df.to_csv('/content/drive/My Drive/codex/places_updated.csv', index=False)

#Colaborative Filtering Eshan

In [None]:
rows = []

# Iterate through each row in the original DataFrame
# for _, row in users_df.iterrows():
#     username = row['User ID']
#     destinations = row['Bucket list destinations Sri Lanka']
#     # Convert the string representation of the list to a list
#     actual_destinations = ast.literal_eval(destinations)
#     # Create a new row for each destination
#     for destination in actual_destinations:
#         destination = destination.strip()
#         rows.append({'User ID': username, 'destination': destination})
users_df["Bucket list destinations Sri Lanka"] = [ ast.literal_eval(destinations) for destinations in users_df["Bucket list destinations Sri Lanka"]]
users_df["Preferred Activities"] = [ ast.literal_eval(activities) for activities in users_df["Preferred Activities"]]
users_df.head()
# Create a new DataFrame from the list of rows
# df_new_users = pd.DataFrame(rows)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu..."
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ..."
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City..."
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc..."


In [None]:
print(type(users_df["Bucket list destinations Sri Lanka"][0]))

<class 'list'>


In [None]:
rows = []
for _, row in users_df.iterrows():
    username = row['User ID']
    destinations = row['preferred_places']

    # Create a new row for each destination
    for destination in destinations:
        destination = destination[0].strip()
        rows.append({'User ID': username, 'destination': destination})


In [None]:
# Initialize an empty dictionary to store counts
counts = {}

# Iterate through the DataFrame and count visits


# Iterate through each row in the original DataFrame
for _, row in users_df.iterrows():
    username = row['User ID']
    destinations = row['Bucket list destinations Sri Lanka']

    # Create a new row for each destination
    for destination in destinations:
        destination = destination.strip()
        rows.append({'User ID': username, 'destination': destination})


# Create a new DataFrame from the list of rows
df_new_users = pd.DataFrame(rows)

for _, row in df_new_users.iterrows():
    user_id = row['User ID']
    destination = row['destination']

    if user_id not in counts:
        counts[user_id] = {}

    if destination not in counts[user_id]:
        counts[user_id][destination] = 0

    counts[user_id][destination] += 1
# Get all unique user IDs and destinations
users = sorted(counts.keys())
destinations = sorted(set(dest for dests in counts.values() for dest in dests))

# Create an empty DataFrame
pivot_data = []

for user in users:
    row = {'User ID': user}
    for destination in destinations:
        if counts.get(user, {}).get(destination, 0) > 0:
          row[destination] = places_df[places_df['name'] == destination]['bayesian_rating'].values[0] if len(places_df[places_df['name'] == destination]) > 0 else 0
        else:
          row[destination] = 0
    pivot_data.append(row)

df_pivot = pd.DataFrame(pivot_data).set_index('User ID')

In [None]:
df_pivot.head()

Unnamed: 0_level_0,- Parewi Duwa Temple,Aanda Ella Fall,Aberdeen Waterfall,Ahangama,Ahungalla,Alahana Pirivena,Alankuda Casuarina Beach,"All Saints' Church, Galle - Church of Ceylon",Alupotha Ella Waterfall,"Aluthnuwara Rajamaha Viharaya , Aluthnuwara",...,Wirawila Tissa Sanctuary,Yala Green Safari,Yala National Park,Yapahuwa Rock Fortress,jungle muru safari,riapla Mask Museum,rumassala,| | Okanda Beach,| Panama Wewa,|Kokkilai Beach
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [None]:
df_pivot.to_csv('/content/drive/My Drive/codex/pivot_data.csv')

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# Apply SVD for collaborative filtering
svd = TruncatedSVD(n_components=50)
user_place_matrix_svd = svd.fit_transform(df_pivot)
user_place_matrix_svd

array([[ 2.35140803, -2.41410263, -1.38361938, ...,  0.37751139,
         0.25713189,  0.30084837],
       [ 1.7741877 , -1.15128646,  0.39592587, ..., -0.1841224 ,
         0.09788957, -0.24454614],
       [ 3.36634311,  3.66411714, -0.46236606, ...,  0.12092049,
         0.15733434,  0.38034239],
       ...,
       [ 2.85749765,  3.64454131, -0.41666058, ...,  0.27101984,
         0.6886511 , -0.8152898 ],
       [ 2.34804382, -1.33873533,  2.85858671, ..., -0.69878745,
         0.21117345,  0.67204392],
       [ 1.78293849, -1.09234827,  0.47480626, ...,  1.09395284,
        -0.23994809, -0.02858354]])

In [None]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(user_place_matrix_svd)

In [None]:
pip install joblib




In [None]:
import joblib
joblib.dump(svd, '/content/drive/My Drive/codex/svd_model.pkl')   # Save TruncatedSVD model
joblib.dump(knn, '/content/drive/My Drive/codex/knn_model.pkl')

# svd_model = joblib.load('svd_model.pkl')
# knn_model = joblib.load('knn_model.pkl')

# # Use the loaded models as usual


['/content/drive/My Drive/codex/knn_model.pkl']

#load models

In [129]:
svd = joblib.load('/content/drive/My Drive/codex/svd_model.pkl')
knn = joblib.load('/content/drive/My Drive/codex/knn_model.pkl')
df_pivot = pd.read_csv('/content/drive/My Drive/codex/pivot_data.csv')
users_df = pd.read_csv('/content/drive/My Drive/codex/users_updated.csv')
places_df = pd.read_csv('/content/drive/My Drive/codex/places_updated.csv')
place_embeddings =  pickle.load(open('/content/drive/My Drive/codex/place_embeddings.pkl', 'rb'))  # or joblib.load()
# svd = svd.transform(df_pivot.drop(columns=['User ID']))


In [120]:

def get_recommendations(user_id, df_pivot2, knn, user_place_matrix_svd):
  """
  Get recommendations for a given user based on collaborative filtering.

  Args:
    user_id: The ID of the user for whom to generate recommendations.
    df_pivot: The user-item interaction matrix.
    knn: The trained k-nearest neighbors model.
    user_place_matrix_svd: The reduced dimensionality user-item matrix.

  Returns:
    A list of recommended destinations.
  """

  try:
    print("hi", df_pivot.iloc[-1])
    user_index = df_pivot2.index.get_loc(user_id)

    distances, indices = knn.kneighbors(user_place_matrix_svd[user_index].reshape(1, -1))

    similar_users = [df_pivot2.index[i] for i in indices[0]]
    print(similar_users)

    # Get the destinations liked by similar users
    destinations_liked_by_similar_users = []
    for similar_user in similar_users:
      destinations_liked_by_similar_users.extend(
          [dest for dest, count in df_pivot2.loc[similar_user].items() if count > 0]
      )

    # Remove destinations the user has already visited
    user_visited_destinations = [dest for dest, count in df_pivot2.loc[user_id].items() if count > 0]
    recommendations = [dest for dest in destinations_liked_by_similar_users if dest not in user_visited_destinations]

    return recommendations[:5]  # Return top 5 recommendations

  except KeyError:
    print(f"User with ID {user_id} not found in the dataset.")
    return []


# Example usage:
user_id_to_recommend_for = 2
recommendations = get_recommendations(user_id_to_recommend_for, df_pivot, knn, user_place_matrix_svd)

if recommendations:
  print(f"Recommendations for user {user_id_to_recommend_for}: {recommendations}")
else:
  print(f"No recommendations found for user {user_id_to_recommend_for}.")

hi User ID                 10000.0
- Parewi Duwa Temple        0.0
Aanda Ella Fall             0.0
Aberdeen Waterfall          0.0
Ahangama                    0.0
                         ...   
riapla Mask Museum          0.0
rumassala                   0.0
|   | Okanda Beach          0.0
| Panama Wewa               0.0
|Kokkilai Beach             0.0
Name: 9999, Length: 445, dtype: float64
[2, 6211, 7532, 8540, 1767]
Recommendations for user 2: ['Batticaloa Lagoon', 'Pinnawala', 'Trincomalee Harbour', 'Meemure', 'Trincomalee Harbour']


In [121]:
import torch
from sentence_transformers import util

def get_top_places_for_activities(activities, place_emb, top_n=4):
    results = {}
    activities = ('').join(activities)
    # Get embeddings for the preferred activities
    activity_embedding = torch.tensor(get_review_embedding(activities)).squeeze()
    activity_results = []
    # Iterate through activities



        # Compute cosine similarity for each place
    for place_name, row in place_emb.iterrows():
            place_embedding = torch.tensor(row["review_embedding"]).squeeze()

            # Compute cosine similarity between the activity and the place
            cosine_similarity = util.pytorch_cos_sim(activity_embedding, place_embedding)

            # Append the place name and similarity tensor to activity_results
            activity_results.append((place_name, cosine_similarity))

        # Sort the places by cosine similarity in descending order
    activity_results = sorted(activity_results, key=lambda x: x[1], reverse=True)

        # Remove duplicate places, keeping only the first occurrence
    seen_places = set()
    unique_results = []
    for place_name, similarity in activity_results:

        if place_emb["name"][place_name] not in seen_places:

                unique_results.append((place_name, similarity))
                seen_places.add(place_emb["name"][place_name])

        # Store the top `top_n` places for the activity (after removing duplicates)


    return unique_results[:top_n]


# Example usage:
activities = users_df.loc[users_df['User ID'] == 1, "Preferred Activities"].values[0]

# Get top 2 places for each activity
top_places = get_top_places_for_activities(activities, place_embeddings, top_n=2)

top_places = [ place_embeddings["name"][place[0]] for place in top_places ]
print(top_places)
# Print the results
# for activity, places in top_places.items():
#     print(f"Top places for activity '{activity}':")
#     for place, similarity in places:
#         place1 = place_embeddings["name"][place]
#         print(f"  Place: {place1}, Cosine Similarity: {similarity}")


['Council Chamber', 'Sathmahal Prasadaya ( ) - 51']


In [130]:
u = len(df_pivot) + 1
row = { }
print(u)
row["User ID"] = u
destinations = df_pivot.columns.tolist()
user = { }
user["User_ID"] = len(df_pivot) + 1
user["bucketlist"] = ['Mihintale', 'Pigeon Island', 'Hikkaduwa Beach', 'Unawatuna Beach', 'Dambulla Royal Cave Temple and Golden Temple']
user["preferredactivities"] = ['arts and culture', 'temple pilgrimages', 'snorkeling']
for destination in destinations[1:]:
        if destination in user["bucketlist"]:
          row[destination] = places_df[places_df['name'] == destination]['bayesian_rating'].values[0] if len(places_df[places_df['name'] == destination]) > 0 else 0
        else:
          row[destination] = 0

# def recommend(preferred_activities, bucket_list):
print(row)
df_pivot = pd.concat([df_pivot, pd.DataFrame([row]) ])

df_pivot = df_pivot.reset_index(drop=True)
svd = svd.transform(df_pivot.drop(columns=['User ID']))

knn.kneighbors(svd)


10001
{'User ID': 10001, '- Parewi Duwa Temple': 0, 'Aanda Ella Fall': 0, 'Aberdeen Waterfall': 0, 'Ahangama': 0, 'Ahungalla': 0, 'Alahana Pirivena': 0, 'Alankuda Casuarina Beach': 0, "All Saints' Church, Galle - Church of Ceylon": 0, 'Alupotha Ella Waterfall': 0, 'Aluthnuwara Rajamaha Viharaya , Aluthnuwara': 0, 'Ambalangoda': 0, 'Ambalangoda Mask Workshop': 0, 'Ambuluwawa Biodiversity Complex': 0, 'Ambuluwawa Temple -': 0, 'Ambuluwawa Tower': 0, 'Anawilundawa Wetland': 0, 'Anawilundawa Wetlands': 0, 'Angammedilla National Park': 0, 'Anuradapura': 0, 'Anuradhapura': 0, 'Anuradhapura New Town': 0, 'Arankale Buddhist Monastery': 0, 'Arankelle Forest Monastery': 0, 'Archaeological Museum Mihintale -': 0, 'Archaeology Museum Kotte': 0, "Arthur's Seat View Point, Kandy": 0, 'Arugam Bay Beach': 0, 'Arukuveli Beach': 0, 'Athugala Viharaya': 0, 'Auslink Hotel Walapane Sri Lanka': 0, "Baker's Falls": 0, 'Bakers Falls': 0, 'Balana Fort |': 0, 'Bambarakanda Falls': 0, 'Bambarakiri Ella': 0, 'Bao

In [128]:
# recommendations = get_recommendations(user_id, df_pivot, knn, svd)

# Print the last row using iloc

df_pivot = df_pivot.set_index("User ID")


def get_final_recommendations(user, df_pivot, knn , svd):
  recommendations = get_recommendations(user["User_ID"], df_pivot, knn, svd)
  top_places = get_top_places_for_activities(activities, place_embeddings, top_n=2)

  top_places = [ place_embeddings["name"][place[0]] for place in top_places ]
  result = recommendations[:3]
  result.append(top_places[0])
  return result

print(get_final_recommendations(user, df_pivot, knn, svd))

10001
10000
hi - Parewi Duwa Temple    0.0
Aanda Ella Fall         0.0
Aberdeen Waterfall      0.0
Ahangama                0.0
Ahungalla               0.0
                       ... 
riapla Mask Museum      0.0
rumassala               0.0
|   | Okanda Beach      0.0
| Panama Wewa           0.0
|Kokkilai Beach         0.0
Name: 10001, Length: 444, dtype: float64
[29, 870, 1441, 9715, 9903]
['International Buddhist Museum', 'Isurumuniya Temple', 'Sr Vijayrma Ancient Temple', 'Council Chamber']
