In [1]:
import os
import numpy as np
import pandas as pd

from dotenv import load_dotenv
from IPython.display import display

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# MongoDB connection string, read from the environment.
DB_URL = os.getenv('DB_URL')
if not DB_URL:
    # Without this guard a missing variable would be passed on as None, and
    # MongoClient(None) falls back to its localhost default instead of failing.
    raise RuntimeError(
        "DB_URL is not set; define it in the environment or in a .env file"
    )

# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

In [2]:
from pymongo import MongoClient

# Create the client for DB_URL and take a handle to the 'alkoholove' database.
client = MongoClient(host=DB_URL)
db = client.get_database('alkoholove')

In [3]:
# Fields requested for every alcohol document; also used as the column order
# of the resulting DataFrame.
ALCOHOL_COLUMNS = ['_id', 'name', 'kind', 'type', 'alcohol_by_volume', 'color', 'manufacturer', 'country', 'region']

alcohols_collection = db['alcohols']
alcohol_docs = list(
    alcohols_collection.find({}, {field_name: 1 for field_name in ALCOHOL_COLUMNS})
)

items_df = (
    # columns= keeps the schema stable even when the query returns no documents
    # or a document lacks one of the fields (missing values become NaN).
    pd.DataFrame(alcohol_docs, columns=ALCOHOL_COLUMNS)
    .rename(columns={'_id': 'item_id'})
    # Store ids as plain strings (presumably bson ObjectId values -- confirm).
    .assign(item_id=lambda df: df['item_id'].map(str))
    # Treat empty / whitespace-only strings as missing values.
    .replace(r'^\s*$', np.nan, regex=True)
)

display(items_df.head(10))

Unnamed: 0,item_id,name,kind,type,alcohol_by_volume,color,manufacturer,country,region
0,62aa2b37e33ccae4961a4daa,Jägermeister,likier,Ziołowy,35.0,ciemny,Mast-Jägermeister SE,Niemcy,
1,62ab28e88a757f60cc3f31db,Biały Bocian Słony Karmel,likier,mleczny,16.0,karmelowy,Polmos,Polska,Bielsko-Biała
2,62ab2a32fd2e7fbd58da41d9,Biały Bocian Advocat,likier,jajeczny,16.0,żółty,Polmos,Polska,Bielsko-Biała
3,62ab611c4a2fcedd4ce86a79,James cook white oversea,rum,biały,37.5,biały,Eckerts Wacholder Brennerei GmbH,Niemcy,
4,62ab638288a7811f65839221,Krupnik Słony Karmel,likier,mleczny,16.0,karmelowy,Sobieski,Polska,
5,62ab661388a7811f65839222,Havana Club Anejo Especial,rum,złoty,37.5,złoty,Havana Club,Kuba,
6,62ab681f88a7811f65839223,Sheridans Coffee Layered Liqueur,likier,kawowy,15.5,czarny i biały,Thomas Sheridan & Sons,Irlandia,Dublin
7,62ab6afe4a2fcedd4ce86a7a,Captain Morgan Original Spiced Gold,rum,złoty,35.0,bursztynowy,Diageo PLC,Anglia,Londyn
8,62ab6cb588a7811f65839224,Captain Morgan White Rum,rum,biały,35.0,biały,Diageo PLC,Anglia,Londyn
9,62ab6eca88a7811f65839225,She Słony Karmel,likier,mleczny,17.0,karmelowy,BZK Alco,Polska,


In [4]:
reviews_collection = db['reviews']

# Only the fields needed for the rating matrix are fetched; '_id' is excluded.
REVIEW_COLUMNS = ['user_id', 'alcohol_id', 'rating']
review_docs = list(
    reviews_collection.find({}, {'_id': 0, 'user_id': 1, 'alcohol_id': 1, 'rating': 1})
)

reviews_df = (
    # columns= keeps the schema stable even when the query returns no documents.
    pd.DataFrame(review_docs, columns=REVIEW_COLUMNS)
    # Use the same key name as items_df so both frames share 'item_id'.
    .rename(columns={'alcohol_id': 'item_id'})
)
display(reviews_df.head(10))

# The data needed below is now held in DataFrames, so release the MongoDB connection.
db.client.close()

Unnamed: 0,user_id,item_id,rating
0,6288e2fdd5ab6070dde8db8c,6288e32dd5ab6070dde8db8a,5
1,629f5bee10456c7cbc3af712,62b4c0265aff740017de687a,3
2,629f5bee10456c7cbc3af712,62b5871901bc976edbd13a94,3
3,62ab1fb3fd2e7fbd58da41d3,62ab7d6b88a7811f6583922b,5
4,62ab1fb3fd2e7fbd58da41d3,62b4391cd7995d801996d75f,4
5,62acd4d9ba0c8633231112d9,62fa9edb2eb13eb1b8afd64c,4
6,62aed4f0d20cdefa840bac0b,62b4a1805aff740017de6866,5
7,62acd4d9ba0c8633231112d9,62d1bb5e0f95370144ad09e2,4
8,62ab1b5cfd2e7fbd58da41cf,62d5b5fd20512ec9fa9a0dd6,4
9,62ab1b5cfd2e7fbd58da41cf,62aa2b37e33ccae4961a4daa,5


In [8]:
import random

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, GridSearchCV

# Seed both RNGs so the cross-validation folds and the SVD initialisation --
# and therefore the score and recommendations below -- are reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

reader = Reader(rating_scale=(1.0, 5.0))

# Ids are cast to str so they are comparable with the string ids in items_df.
reviews_df['user_id'] = reviews_df['user_id'].map(str)
reviews_df['item_id'] = reviews_df['item_id'].map(str)
data = Dataset.load_from_df(reviews_df[['user_id', 'item_id', 'rating']], reader)
display(reviews_df.head(10))

# Grid-search the SVD hyper-parameters with 3-fold cross-validation.
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Best RMSE score and the parameter combination that produced it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

# Refit the best configuration on every available rating. The full trainset
# gets its own name so that `data` keeps referring to the Dataset.
trainset = data.build_full_trainset()
algo = gs.best_estimator["rmse"]
algo.fit(trainset)

# The anti-testset holds every (user, item) pair of known users and items that
# has no rating in the trainset; predicting it scores all unseen items per user.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

Unnamed: 0,user_id,item_id,rating
0,6288e2fdd5ab6070dde8db8c,6288e32dd5ab6070dde8db8a,5
1,629f5bee10456c7cbc3af712,62b4c0265aff740017de687a,3
2,629f5bee10456c7cbc3af712,62b5871901bc976edbd13a94,3
3,62ab1fb3fd2e7fbd58da41d3,62ab7d6b88a7811f6583922b,5
4,62ab1fb3fd2e7fbd58da41d3,62b4391cd7995d801996d75f,4
5,62acd4d9ba0c8633231112d9,62fa9edb2eb13eb1b8afd64c,4
6,62aed4f0d20cdefa840bac0b,62b4a1805aff740017de6866,5
7,62acd4d9ba0c8633231112d9,62d1bb5e0f95370144ad09e2,4
8,62ab1b5cfd2e7fbd58da41cf,62d5b5fd20512ec9fa9a0dd6,4
9,62ab1b5cfd2e7fbd58da41cf,62aa2b37e33ccae4961a4daa,5


1.3073778370748912
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}


In [9]:
from collections import defaultdict

def get_top_n(predictions, n=5):
    """Group predictions by user and keep each user's ``n`` highest estimates.

    Args:
        predictions: iterable of 5-field records unpacked as
            ``(uid, iid, true_rating, est, details)``; only ``uid``, ``iid``
            and ``est`` are used.
        n: maximum number of items to keep per user.

    Returns:
        A ``defaultdict(list)`` mapping each uid to a list of ``(iid, est)``
        tuples sorted by ``est`` in descending order, at most ``n`` long.
    """
    ranked = defaultdict(list)

    # First map the predictions to each user.
    for uid, iid, _true_r, est, _details in predictions:
        ranked[uid].append((iid, est))

    # Then sort the predictions for each user and keep the n highest ones.
    # Only existing keys are reassigned, so the dict size does not change
    # while it is being iterated.
    for uid, user_ratings in ranked.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        ranked[uid] = user_ratings[:n]

    return ranked


# Number of recommendations per user and the user whose list is printed below.
TOP_K = 5
EXAMPLE_USER_ID = '62aed4f0d20cdefa840bac0b'

top_n = get_top_n(predictions, n=TOP_K)

# Print the recommended item ids in ObjectId(...) notation.
print([f"ObjectId('{top[0]}')" for top in top_n[EXAMPLE_USER_ID]])

["ObjectId('62b494eb5aff740017de6858')", "ObjectId('62b4391cd7995d801996d75f')", "ObjectId('62d1bf360f95370144ad09e3')", "ObjectId('636577e40142f7420f3cc7a5')", "ObjectId('62efdca590ffde236a830dc7')"]


In [10]:
# TODO: save the trained model here, e.g. as a pickle
# the predictions are computed in the cells above