In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
from pymongo import MongoClient
# ObjectId is the BSON id type shipped with PyMongo. The previous
# `MongoClient().test.ObjectId` opened an extra client against localhost and
# bound this name to a *collection* called "ObjectId", not to the id type.
from bson import ObjectId

In [None]:
import os

# The connection string (which embeds the username and password) is read from
# the environment so that credentials are never stored in the notebook.
# NOTE(review): the credential that used to be hard-coded here should be
# treated as leaked and rotated.
MONGODB_URI = os.environ.get('MONGODB_URI')
if not MONGODB_URI:
    raise RuntimeError(
        'Set the MONGODB_URI environment variable to the MongoDB connection '
        'string before running this notebook.'
    )

# Database name can be overridden (e.g. MONGODB_DB=LetsGoDataPre) without
# editing the notebook; the default matches the database used so far.
MONGODB_DB = os.environ.get('MONGODB_DB', 'LetsGoDataPre2')

client = MongoClient(MONGODB_URI)
db = client[MONGODB_DB]
userCollection = db['users']
locationCollection = db['locations']

In [None]:
# Materialise every document of the `locations` collection as one DataFrame row.
locationData = pd.DataFrame(list(locationCollection.find()))
# Show only a preview; rendering the whole collection bloats the notebook output.
locationData.head()

In [None]:
# One row per (location, rating): `explode` repeats the location columns for
# every element of its `ratings` list. Entries that are not dicts (for example
# the placeholder produced for a location without ratings) are skipped below.
expanded_ratings = locationData.explode('ratings').reset_index(drop=True)

# The location fields are captured in the same pass as the rating itself.
# Assigning expanded_ratings['_id'] / ['name'] to ratings_df afterwards aligns
# on the index, and because skipped rows make ratings_df shorter than
# expanded_ratings, every rating after the first skipped row would be labelled
# with the wrong location.
rating_dicts = []
for location_id, location_name, rating in zip(
    expanded_ratings['_id'],
    expanded_ratings['name'],
    expanded_ratings['ratings'],
):
    if not isinstance(rating, dict):
        continue
    rating_dicts.append({
        'user_id': str(rating['user_id']),  # Convert ObjectId to string
        'rate': rating['rate'],
        'review': rating.get('review'),  # tolerate ratings stored without a review
        'location_id': location_id,
        'location_name': location_name,
    })

# Explicit columns keep the frame's schema stable even when no rating was found.
ratings_df = pd.DataFrame(
    rating_dicts,
    columns=['user_id', 'rate', 'review', 'location_id', 'location_name'],
)

In [None]:
# User x location matrix of ratings; pivot_table averages duplicate
# (user, location) pairs and leaves NaN where a user has not rated a location.
piv = pd.pivot_table(
    ratings_df,
    values='rate',
    index=['user_id'],
    columns=['location_name'],
)
piv.head()

In [None]:
def _mean_center_and_scale(user_ratings):
    """Mean-centre one user's ratings and divide by that user's rating range.

    Missing ratings stay NaN. When the user's range is zero (a single rating,
    or identical ratings everywhere) the division yields NaN for every entry.
    """
    spread = user_ratings.max() - user_ratings.min()
    return (user_ratings - user_ratings.mean()) / spread


# Normalise per user (rows of `piv`), treat missing/undefined values as 0, then
# transpose so that rows are locations and columns are users.
piv_norm = piv.apply(_mean_center_and_scale, axis=1).fillna(0).T
# Drop users whose column is entirely zero: they carry no preference signal
# (this includes the zero-range users described above).
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
# Preview only; the full matrix is too large to render usefully.
piv_norm.head()


In [None]:
# Compressed sparse row copy of the normalised matrix (rows: locations, columns: users).
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())

In [None]:
# Rows of piv_sparse are locations, so the matrix as-is gives location-to-location
# similarity and its transpose gives user-to-user similarity.
item_similarity, user_similarity = (
    cosine_similarity(piv_sparse),
    cosine_similarity(piv_sparse.transpose()),
)


In [None]:
# Label the raw similarity arrays: piv_norm's index holds the location names
# and its columns hold the user ids.
location_labels, user_labels = piv_norm.index, piv_norm.columns
item_sim_df = pd.DataFrame(data=item_similarity, index=location_labels, columns=location_labels)
user_sim_df = pd.DataFrame(data=user_similarity, index=user_labels, columns=user_labels)


In [None]:
def top_locations(location_name):
    """Print up to ten locations most similar to ``location_name``.

    Similarities are read from the ``item_sim_df`` column for the location.

    Parameters
    ----------
    location_name : label of a column (and row) of ``item_sim_df``.

    Returns
    -------
    None when the ranking was printed, or a 'No data available ...' message
    string when ``location_name`` is not in ``item_sim_df`` (same convention
    as ``top_users``).
    """
    if location_name not in item_sim_df.columns:
        return 'No data available on location {}'.format(location_name)
    print('Similar locations to {} include:\n'.format(location_name))
    # Remove the location itself by label. Skipping "the first row after
    # sorting" is wrong whenever another location ties with the self-similarity,
    # because the location itself is then not guaranteed to sort first.
    neighbours = (
        item_sim_df[location_name]
        .drop(labels=location_name)
        .sort_values(ascending=False)
        .index[:10]
    )
    for rank, item in enumerate(neighbours, start=1):
        print('No. {}: {}'.format(rank, item))

In [None]:
def top_users(user):
    """Print up to ten users most similar to ``user`` with their similarity.

    Parameters
    ----------
    user : label of a column of ``piv_norm`` / ``user_sim_df``.

    Returns
    -------
    None when the ranking was printed, or a 'No data available ...' message
    string when ``user`` is not a column of ``piv_norm``.
    """
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))
    print('Most Similar Users:\n')
    # Sort once, and exclude the user by label: with ties in similarity the
    # user is not guaranteed to be the first row of the sorted frame.
    neighbours = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .head(10)
    )
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [None]:
def similar_user_recs(user):
    """Recommend the locations most often top-scored by users similar to ``user``.

    For each of the (up to) ten most similar users, every location holding that
    user's maximum normalised score in ``piv_norm`` receives one vote.

    Parameters
    ----------
    user : label of a column of ``piv_norm`` / ``user_sim_df``.

    Returns
    -------
    A list of at most five ``(location, votes)`` tuples ordered by votes
    descending, or a 'No data available ...' message string when ``user`` is
    not a column of ``piv_norm``.
    """
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))
    # Exclude the user by label rather than by position: on similarity ties the
    # user is not guaranteed to sort first and would vote for their own picks.
    sim_users = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .index[:10]
    )
    most_common = {}
    for neighbour in sim_users:
        scores = piv_norm.loc[:, neighbour]
        # All locations tied for this neighbour's best score get a vote.
        for location in scores[scores == scores.max()].index:
            most_common[location] = most_common.get(location, 0) + 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [None]:
def predicted_preference(location_name, user, n_neighbours=999):
    """Predict ``user``'s rating of ``location_name`` from similar users.

    The prediction is the similarity-weighted average of the raw ratings in
    ``piv`` given to ``location_name`` by the ``n_neighbours`` users most
    similar to ``user`` (per ``user_sim_df``); neighbours without a rating for
    the location are ignored.

    Parameters
    ----------
    location_name : column label of ``piv``.
    user : column label of ``user_sim_df``.
    n_neighbours : maximum number of similar users considered (default 999,
        the number previously hard-coded).

    Returns
    -------
    The weighted average as a float, or NaN when no considered neighbour rated
    the location or their similarities sum to zero.

    Raises
    ------
    KeyError if ``user`` or ``location_name`` is unknown.
    """
    # Sort once and drop the user by label; relying on the user being the first
    # sorted row breaks when another user ties with the self-similarity.
    similarities = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .head(n_neighbours)
    )
    ratings = piv.loc[similarities.index, location_name]
    rated = ratings.notna()
    weights = similarities[rated]
    total_weight = weights.sum()
    # Without usable neighbours the average is undefined; report NaN instead of
    # failing with a division by zero.
    if not rated.any() or total_weight == 0:
        return np.nan
    return (ratings[rated] * weights).sum() / total_weight


In [None]:
# Demo: print the locations ranked most similar to 'Billop House' in item_sim_df.
top_locations('Billop House')

In [None]:
# Demo: print the users ranked most similar to this user id, with similarity values.
top_users('6521b61bd210879b55e08e26')

In [None]:
# Demo: up to five (location, count) pairs, counting how often each location is
# a top-scored location among this user's most similar users.
similar_user_recs('6521b61bd210879b55e08e26')

In [None]:
# Demo: similarity-weighted average of similar users' ratings of "Billop House",
# used as the predicted rating for this user id.
predicted_preference("Billop House", '6521b61bd210879b55e08e26')

In [None]:
import pickle

In [None]:
# Sanity-check both similarity tables (locations first, then users) before persisting them.
for similarity_table in (item_sim_df, user_sim_df):
    print(similarity_table.head())


In [None]:
# Persist both similarity tables. The `with` blocks guarantee each file is
# flushed and closed; a bare open() passed to pickle.dump leaves the handle
# open until it happens to be garbage-collected.
with open('dataset/user_similarity.pkl', 'wb') as user_sim_file:
    pickle.dump(user_sim_df, user_sim_file)
with open('dataset/item_similarity.pkl', 'wb') as item_sim_file:
    pickle.dump(item_sim_df, item_sim_file)

In [None]:
# Persist the normalised location x user matrix; the `with` block closes the
# file deterministically instead of leaking the handle.
with open('dataset/piv_norm.pkl', 'wb') as piv_norm_file:
    pickle.dump(piv_norm, piv_norm_file)