In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from tqdm.notebook import tqdm
import random
import pickle

from scipy.sparse import coo_matrix, csr_matrix

from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k, reciprocal_rank

from hyperopt import fmin, hp, tpe, Trials

from sklearn.base import clone

## Normalization
from sklearn.preprocessing import LabelEncoder, MinMaxScaler




In [2]:
item_features = pd.read_csv("dataset/item_features.csv", index_col=0, encoding="UTF-8")
user_features = pd.read_csv("dataset/user_features.csv", index_col=0, encoding="UTF-8")
train_rating = pd.read_csv("dataset/train_rating.csv", index_col=0, encoding="UTF-8")
test_rating = pd.read_csv("dataset/test_rating.csv", index_col=0, encoding="UTF-8")
whisky = pd.read_csv("dataset/whisky.csv", index_col=0, encoding="UTF-8")

In [3]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3536 entries, 0 to 3535
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   whisky_id   3536 non-null   int64  
 1   category    3536 non-null   int64  
 2   price_tier  3536 non-null   int64  
 3   abv         3536 non-null   float64
 4   smoky       3536 non-null   float64
 5   peaty       3536 non-null   float64
 6   spicy       3536 non-null   float64
 7   herbal      3536 non-null   float64
 8   oily        3536 non-null   float64
 9   body        3536 non-null   float64
 10  rich        3536 non-null   float64
 11  sweet       3536 non-null   float64
 12  salty       3536 non-null   float64
 13  vanilla     3536 non-null   float64
 14  tart        3536 non-null   float64
 15  fruity      3536 non-null   float64
 16  floral      3536 non-null   float64
dtypes: float64(14), int64(3)
memory usage: 497.2 KB


In [4]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119516 entries, 0 to 119515
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     119516 non-null  int64  
 1   price_tier  119516 non-null  int64  
 2   smoky       119516 non-null  float64
 3   peaty       119516 non-null  float64
 4   spicy       119516 non-null  float64
 5   herbal      119516 non-null  float64
 6   oily        119516 non-null  float64
 7   body        119516 non-null  float64
 8   rich        119516 non-null  float64
 9   sweet       119516 non-null  float64
 10  salty       119516 non-null  float64
 11  vanilla     119516 non-null  float64
 12  tart        119516 non-null  float64
 13  fruity      119516 non-null  float64
 14  floral      119516 non-null  float64
dtypes: float64(13), int64(2)
memory usage: 14.6 MB


In [5]:
item_features

Unnamed: 0,whisky_id,category,price_tier,abv,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0,0,0.000,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000
1,1,1,5,0.260,0.30,0.85,0.50,0.30,0.20,0.8,0.80,0.85,0.166667,0.20,0.25,0.85,0.526316
2,2,12,4,0.260,0.40,0.30,0.40,0.20,0.40,0.7,0.80,0.70,0.444444,0.50,0.50,0.70,0.210526
3,3,5,5,0.542,0.15,0.00,0.20,0.00,0.15,0.8,0.90,0.85,0.055556,0.30,0.25,0.35,0.000000
4,4,5,4,0.569,0.40,0.00,0.65,0.50,0.20,0.6,0.60,0.45,0.000000,0.60,0.60,0.45,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3531,3531,8,1,0.260,0.35,0.00,0.20,0.00,0.00,0.4,0.55,0.85,0.000000,0.25,0.10,0.10,0.000000
3532,3532,2,1,0.200,0.00,0.00,0.40,0.20,0.00,0.3,0.00,1.00,0.000000,1.00,0.00,0.60,0.105263
3533,3533,0,3,0.240,0.90,0.10,0.10,0.10,0.40,0.1,0.10,0.90,0.000000,0.50,0.20,0.30,0.105263
3534,3534,9,2,0.290,0.00,0.00,0.10,1.00,0.80,0.0,0.00,0.40,0.000000,0.20,0.00,0.00,0.000000


In [6]:
item_features = item_features[["whisky_id", "price_tier"] + item_features.columns.tolist()[4:]]

In [7]:
item_features

Unnamed: 0,whisky_id,price_tier,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000
1,1,5,0.30,0.85,0.50,0.30,0.20,0.8,0.80,0.85,0.166667,0.20,0.25,0.85,0.526316
2,2,4,0.40,0.30,0.40,0.20,0.40,0.7,0.80,0.70,0.444444,0.50,0.50,0.70,0.210526
3,3,5,0.15,0.00,0.20,0.00,0.15,0.8,0.90,0.85,0.055556,0.30,0.25,0.35,0.000000
4,4,4,0.40,0.00,0.65,0.50,0.20,0.6,0.60,0.45,0.000000,0.60,0.60,0.45,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3531,3531,1,0.35,0.00,0.20,0.00,0.00,0.4,0.55,0.85,0.000000,0.25,0.10,0.10,0.000000
3532,3532,1,0.00,0.00,0.40,0.20,0.00,0.3,0.00,1.00,0.000000,1.00,0.00,0.60,0.105263
3533,3533,3,0.90,0.10,0.10,0.10,0.40,0.1,0.10,0.90,0.000000,0.50,0.20,0.30,0.105263
3534,3534,2,0.00,0.00,0.10,1.00,0.80,0.0,0.00,0.40,0.000000,0.20,0.00,0.00,0.000000


# Dataset

In [8]:
dataset = Dataset()

In [9]:
dataset.fit(users=user_features.user_id.unique().tolist(), items=item_features.whisky_id.unique().tolist(), item_features=item_features.columns.tolist()[1:], user_features=user_features.columns.tolist()[1:])

In [10]:
train_rating

Unnamed: 0,user_id,whisky_id,rating
690996,54037,724,8.0
715031,59687,734,10.0
402221,20171,1086,8.0
245393,9477,409,10.0
349470,15773,3221,8.0
...,...,...,...
259178,10263,93,9.0
365838,17113,1645,6.0
131932,4236,818,8.0
671155,50140,1353,8.0


In [11]:
def make_source(data):
    source = []
    for row in data.itertuples(index=False):
        meta = {feat: value for feat, value in zip(data.columns[1:], row[1:])}
        source.append((row[0], meta))
    return source

In [12]:
%time item_source = make_source(item_features)

CPU times: total: 0 ns
Wall time: 23.1 ms


In [13]:
%time user_source = make_source(user_features)

CPU times: total: 297 ms
Wall time: 839 ms


In [14]:
user_source

[(0,
  {'price_tier': 0,
   'smoky': 0.0,
   'peaty': 0.0,
   'spicy': 0.0,
   'herbal': 0.0,
   'oily': 0.0,
   'body': 0.0,
   'rich': 0.0,
   'sweet': 0.0,
   'salty': 0.0,
   'vanilla': 0.0,
   'tart': 0.0,
   'fruity': 0.0,
   'floral': 0.0}),
 (1,
  {'price_tier': 5,
   'smoky': 0.3,
   'peaty': 0.85,
   'spicy': 0.5,
   'herbal': 0.3,
   'oily': 0.2,
   'body': 0.8,
   'rich': 0.8,
   'sweet': 0.85,
   'salty': 0.1666666666666666,
   'vanilla': 0.2,
   'tart': 0.25,
   'fruity': 0.85,
   'floral': 0.5263157894736842}),
 (2,
  {'price_tier': 5,
   'smoky': 0.3,
   'peaty': 0.0,
   'spicy': 0.4,
   'herbal': 0.1,
   'oily': 0.3,
   'body': 0.7000000000000001,
   'rich': 0.8,
   'sweet': 0.8,
   'salty': 0.1111111111111111,
   'vanilla': 0.6,
   'tart': 0.2,
   'fruity': 0.9,
   'floral': 0.1052631578947368}),
 (3,
  {'price_tier': 4,
   'smoky': 0.3,
   'peaty': 0.2,
   'spicy': 0.6,
   'herbal': 0.2,
   'oily': 0.4,
   'body': 0.6,
   'rich': 0.6,
   'sweet': 0.7000000000000001,


In [15]:
%time rating_source = list(zip(train_rating['user_id'], train_rating['whisky_id'], train_rating['rating']))

CPU times: total: 156 ms
Wall time: 246 ms


### build rating

In [16]:
train_interactions, train_weights = dataset.build_interactions(rating_source)

### build item_features

In [17]:
item_meta = dataset.build_item_features(item_source, normalize=False)

### build user_features

In [18]:
user_meta = dataset.build_user_features(user_source, normalize=False)

### Check Correct Build Dataset

In [19]:
train_rating

Unnamed: 0,user_id,whisky_id,rating
690996,54037,724,8.0
715031,59687,734,10.0
402221,20171,1086,8.0
245393,9477,409,10.0
349470,15773,3221,8.0
...,...,...,...
259178,10263,93,9.0
365838,17113,1645,6.0
131932,4236,818,8.0
671155,50140,1353,8.0


In [20]:
print(train_weights)

  (54037, 724)	8.0
  (59687, 734)	10.0
  (20171, 1086)	8.0
  (9477, 409)	10.0
  (15773, 3221)	8.0
  (17797, 2380)	7.0
  (96948, 3024)	6.0
  (45726, 2889)	9.0
  (78664, 2657)	4.0
  (25114, 811)	6.0
  (2104, 778)	6.0
  (4804, 2861)	6.0
  (12226, 1610)	8.0
  (1319, 2056)	7.0
  (1319, 1978)	8.0
  (88245, 2246)	7.0
  (55471, 2933)	6.0
  (9296, 911)	10.0
  (28552, 1369)	8.0
  (17523, 1985)	4.0
  (26872, 148)	9.0
  (26562, 2657)	6.0
  (23456, 98)	8.0
  (73927, 2248)	8.0
  (23913, 2665)	6.0
  :	:
  (81778, 1261)	7.0
  (7329, 1240)	5.0
  (76619, 1091)	8.0
  (14205, 2679)	8.0
  (60506, 1096)	10.0
  (13890, 1988)	8.0
  (80043, 1345)	8.0
  (1925, 1077)	7.0
  (14354, 3058)	9.0
  (1233, 811)	10.0
  (11400, 2660)	6.0
  (6884, 152)	8.0
  (6365, 1348)	8.0
  (2649, 402)	10.0
  (29643, 2674)	6.0
  (4420, 2982)	6.0
  (1576, 2925)	7.0
  (63842, 815)	8.0
  (3556, 2165)	8.0
  (45420, 557)	9.0
  (10263, 93)	9.0
  (17113, 1645)	6.0
  (4236, 818)	8.0
  (50140, 1353)	8.0
  (3891, 57)	10.0


In [21]:
user_features

Unnamed: 0,user_id,price_tier,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.00,0.000000,0.00,0.00,0.00,0.000000
1,1,5,0.30,0.85,0.5,0.30,0.2,0.8,0.8,0.85,0.166667,0.20,0.25,0.85,0.526316
2,2,5,0.30,0.00,0.4,0.10,0.3,0.7,0.8,0.80,0.111111,0.60,0.20,0.90,0.105263
3,3,4,0.30,0.20,0.6,0.20,0.4,0.6,0.6,0.70,0.111111,0.60,0.50,0.70,0.421053
4,4,5,0.20,0.00,0.6,0.40,0.3,0.7,0.7,0.60,0.111111,0.40,0.50,0.80,0.315789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119511,119511,1,0.00,0.00,0.4,0.20,0.0,0.3,0.0,1.00,0.000000,1.00,0.00,0.60,0.105263
119512,119512,1,0.00,0.00,0.4,0.20,0.0,0.3,0.0,1.00,0.000000,1.00,0.00,0.60,0.105263
119513,119513,1,0.31,0.00,0.4,0.29,0.0,0.3,0.4,1.00,0.000000,0.55,0.00,0.30,0.105263
119514,119514,1,0.31,0.00,0.4,0.29,0.0,0.3,0.4,1.00,0.000000,0.55,0.00,0.30,0.105263


In [22]:
print(user_meta)

  (0, 0)	1.0
  (0, 119516)	0.0
  (0, 119517)	0.0
  (0, 119518)	0.0
  (0, 119519)	0.0
  (0, 119520)	0.0
  (0, 119521)	0.0
  (0, 119522)	0.0
  (0, 119523)	0.0
  (0, 119524)	0.0
  (0, 119525)	0.0
  (0, 119526)	0.0
  (0, 119527)	0.0
  (0, 119528)	0.0
  (0, 119529)	0.0
  (1, 1)	1.0
  (1, 119516)	5.0
  (1, 119517)	0.3
  (1, 119518)	0.85
  (1, 119519)	0.5
  (1, 119520)	0.3
  (1, 119521)	0.2
  (1, 119522)	0.8
  (1, 119523)	0.8
  (1, 119524)	0.85
  :	:
  (119514, 119520)	0.29
  (119514, 119521)	0.0
  (119514, 119522)	0.3
  (119514, 119523)	0.4
  (119514, 119524)	1.0
  (119514, 119525)	0.0
  (119514, 119526)	0.55
  (119514, 119527)	0.0
  (119514, 119528)	0.3
  (119514, 119529)	0.10526316
  (119515, 119515)	1.0
  (119515, 119516)	1.0
  (119515, 119517)	0.31
  (119515, 119518)	0.0
  (119515, 119519)	0.4
  (119515, 119520)	0.29
  (119515, 119521)	0.0
  (119515, 119522)	0.3
  (119515, 119523)	0.4
  (119515, 119524)	1.0
  (119515, 119525)	0.0
  (119515, 119526)	0.55
  (119515, 119527)	0.0
  (119515, 

In [23]:
item_features

Unnamed: 0,whisky_id,price_tier,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000
1,1,5,0.30,0.85,0.50,0.30,0.20,0.8,0.80,0.85,0.166667,0.20,0.25,0.85,0.526316
2,2,4,0.40,0.30,0.40,0.20,0.40,0.7,0.80,0.70,0.444444,0.50,0.50,0.70,0.210526
3,3,5,0.15,0.00,0.20,0.00,0.15,0.8,0.90,0.85,0.055556,0.30,0.25,0.35,0.000000
4,4,4,0.40,0.00,0.65,0.50,0.20,0.6,0.60,0.45,0.000000,0.60,0.60,0.45,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3531,3531,1,0.35,0.00,0.20,0.00,0.00,0.4,0.55,0.85,0.000000,0.25,0.10,0.10,0.000000
3532,3532,1,0.00,0.00,0.40,0.20,0.00,0.3,0.00,1.00,0.000000,1.00,0.00,0.60,0.105263
3533,3533,3,0.90,0.10,0.10,0.10,0.40,0.1,0.10,0.90,0.000000,0.50,0.20,0.30,0.105263
3534,3534,2,0.00,0.00,0.10,1.00,0.80,0.0,0.00,0.40,0.000000,0.20,0.00,0.00,0.000000


In [24]:
print(item_meta)

  (0, 0)	1.0
  (0, 3536)	0.0
  (0, 3537)	0.0
  (0, 3538)	0.0
  (0, 3539)	0.0
  (0, 3540)	0.0
  (0, 3541)	0.0
  (0, 3542)	0.0
  (0, 3543)	0.0
  (0, 3544)	0.0
  (0, 3545)	0.0
  (0, 3546)	0.0
  (0, 3547)	0.0
  (0, 3548)	0.0
  (0, 3549)	0.0
  (1, 1)	1.0
  (1, 3536)	5.0
  (1, 3537)	0.3
  (1, 3538)	0.85
  (1, 3539)	0.5
  (1, 3540)	0.3
  (1, 3541)	0.2
  (1, 3542)	0.8
  (1, 3543)	0.8
  (1, 3544)	0.85
  :	:
  (3534, 3540)	1.0
  (3534, 3541)	0.8
  (3534, 3542)	0.0
  (3534, 3543)	0.0
  (3534, 3544)	0.4
  (3534, 3545)	0.0
  (3534, 3546)	0.2
  (3534, 3547)	0.0
  (3534, 3548)	0.0
  (3534, 3549)	0.0
  (3535, 3535)	1.0
  (3535, 3536)	1.0
  (3535, 3537)	0.31
  (3535, 3538)	0.0
  (3535, 3539)	0.4
  (3535, 3540)	0.29
  (3535, 3541)	0.0
  (3535, 3542)	0.3
  (3535, 3543)	0.4
  (3535, 3544)	1.0
  (3535, 3545)	0.0
  (3535, 3546)	0.55
  (3535, 3547)	0.0
  (3535, 3548)	0.3
  (3535, 3549)	0.10526316


## Test data loading

In [26]:
class LightFMResizable(LightFM):
    """A LightFM that resizes the model to accomodate new users,
    items, and features"""

    def fit_partial(
        self,
        interactions,
        user_features=None,
        item_features=None,
        sample_weight=None,
        epochs=1,
        num_threads=1,
        verbose=False,
    ):
        try:
            self._check_initialized()
            self._resize(interactions, user_features, item_features)
        except ValueError:
            # This is the first call so just fit without resizing
            pass

        super().fit_partial(
            interactions,
            user_features,
            item_features,
            sample_weight,
            epochs,
            num_threads,
            verbose,
        )

        return self

    def _resize(self, interactions, user_features=None, item_features=None):
        """Resizes the model to accommodate new users/items/features"""

        no_components = self.no_components
        no_user_features, no_item_features = interactions.shape  # default

        if hasattr(user_features, "shape"):
            no_user_features = user_features.shape[-1]
        if hasattr(item_features, "shape"):
            no_item_features = item_features.shape[-1]

        if (
            no_user_features == self.user_embeddings.shape[0]
            and no_item_features == self.item_embeddings.shape[0]
        ):
            return self

        new_model = clone(self)
        new_model._initialize(no_components, no_item_features, no_user_features)

        # update all attributes from self._check_initialized
        for attr in (
            "item_embeddings",
            "item_embedding_gradients",
            "item_embedding_momentum",
            "item_biases",
            "item_bias_gradients",
            "item_bias_momentum",
            "user_embeddings",
            "user_embedding_gradients",
            "user_embedding_momentum",
            "user_biases",
            "user_bias_gradients",
            "user_bias_momentum",
        ):
            # extend attribute matrices with new rows/cols from
            # freshly initialized model with right shape
            old_array = getattr(self, attr)
            old_slice = [slice(None, i) for i in old_array.shape]
            new_array = getattr(new_model, attr)
            new_array[tuple(old_slice)] = old_array
            setattr(self, attr, new_array)

        return self

## Optuna 사용한 HyperParameter 최적화
---
- learning_rate, alpha 등의 하이퍼 파라미터 값을 작게 설정했을 때보다 높게 설정했을 때 AUC Score가 높게 나오는 경향이 보였습니다.
- HyperOPT는 베이지안 최적화 접근 기반인데 이 부분에 제대로 알지 못하기 때문에 전체를 돌려보는 Optuna 방식으로 변경하겠습니다.
- Optuna의 경우 시각화도 가능하고, GridSearchCV보다 빠르다는 장점이 있습니다.

In [33]:
test_rating

Unnamed: 0,user_id,whisky_id,rating
318988,13721,157,8.0
745931,67057,1345,6.0
536572,31050,2849,6.0
204686,7718,2253,8.0
130137,4168,817,8.0
...,...,...,...
496706,27186,818,10.0
842902,106820,2469,8.0
721143,61181,787,8.0
242880,9381,92,10.0


In [34]:
%time test_rating_source = list(zip(test_rating.user_id, test_rating.whisky_id, test_rating.rating))

CPU times: total: 31.2 ms
Wall time: 45.2 ms


In [35]:
test_interactions, _ = dataset.build_interactions(test_rating_source)

In [28]:
import optuna

In [38]:
def objective(trial):
    
    
    # 조정할 하이퍼 파라미터
    params = {
        "learning_schedule": 'adagrad',
        "loss": "warp",
        "random_state": 42,
        "no_components": trial.suggest_int("no_components", 30, 100, 10),
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        'item_alpha': trial.suggest_float("item_alpha", 1e-4, 1e-2, log=True),
        'user_alpha': trial.suggest_float("user_alpha", 1e-4, 1e-2, log=True),
    }

    model = LightFM(**params)

    model.fit(interactions=train_interactions,
              sample_weight=train_weights,
              user_features=user_meta,
              item_features=item_meta,
              epochs=5,
              verbose=True)

    test_mrr = reciprocal_rank(model, test_interactions=test_interactions, user_features=user_meta, item_features=item_meta).mean()
    test_auc = auc_score(model, test_interactions=test_interactions, item_features=item_meta, user_features=user_meta).mean()
    
    print("no_comp: {}, lrn_rate: {:.5f}, item_alpha: {:.5f}, user_alpha: {:.5f}, MRR: {:.8f}, auc_score: {:.8f}".format(
      params["no_components"], params["learning_rate"], params["item_alpha"], params["user_alpha"], test_mrr, test_auc))
    return test_auc

In [39]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print(study.best_trial.params)

[32m[I 2023-04-02 20:21:13,676][0m A new study created in memory with name: no-name-e8863fc1-dd9e-4275-9b28-a0786df0b133[0m
Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [03:43<00:00, 44.74s/it]
[32m[I 2023-04-02 20:28:39,507][0m Trial 0 finished with value: 0.9329132437705994 and parameters: {'no_components': 60, 'learning_rate': 0.00678062522880763, 'item_alpha': 0.0005683261079733912, 'user_alpha': 0.0006402837957743322}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 60, lrn_rate: 0.00678, item_alpha: 0.00057, user_alpha: 0.00064, MRR: 0.15505448, auc_score: 0.93291324


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [02:49<00:00, 33.84s/it]
[32m[I 2023-04-02 20:34:11,572][0m Trial 1 finished with value: 0.9294021129608154 and parameters: {'no_components': 40, 'learning_rate': 0.005068579804742459, 'item_alpha': 0.00038922929063842977, 'user_alpha': 0.00013421516243059163}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 40, lrn_rate: 0.00507, item_alpha: 0.00039, user_alpha: 0.00013, MRR: 0.14533150, auc_score: 0.92940211


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:21<00:00, 76.30s/it]
[32m[I 2023-04-02 20:45:32,520][0m Trial 2 finished with value: 0.9130638241767883 and parameters: {'no_components': 90, 'learning_rate': 0.00018630163787902894, 'item_alpha': 0.00030453547499434316, 'user_alpha': 0.002338221254811873}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 90, lrn_rate: 0.00019, item_alpha: 0.00030, user_alpha: 0.00234, MRR: 0.10406239, auc_score: 0.91306382


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [03:26<00:00, 41.22s/it]
[32m[I 2023-04-02 20:52:06,954][0m Trial 3 finished with value: 0.9289888143539429 and parameters: {'no_components': 50, 'learning_rate': 0.00493575851184934, 'item_alpha': 0.0003491996542223259, 'user_alpha': 0.0034171152692693304}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 50, lrn_rate: 0.00494, item_alpha: 0.00035, user_alpha: 0.00342, MRR: 0.14253208, auc_score: 0.92898881


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [02:44<00:00, 32.93s/it]
[32m[I 2023-04-02 20:57:25,440][0m Trial 4 finished with value: 0.9151421189308167 and parameters: {'no_components': 40, 'learning_rate': 0.0003612201650075271, 'item_alpha': 0.00029128420093905565, 'user_alpha': 0.00030532498092090363}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 40, lrn_rate: 0.00036, item_alpha: 0.00029, user_alpha: 0.00031, MRR: 0.10429528, auc_score: 0.91514212


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [03:21<00:00, 40.21s/it]
[32m[I 2023-04-02 21:03:56,288][0m Trial 5 finished with value: 0.9302500486373901 and parameters: {'no_components': 50, 'learning_rate': 0.005941490778797624, 'item_alpha': 0.00011491892552600027, 'user_alpha': 0.007967612935335907}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 50, lrn_rate: 0.00594, item_alpha: 0.00011, user_alpha: 0.00797, MRR: 0.14928898, auc_score: 0.93025005


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:37<00:00, 79.55s/it]
[32m[I 2023-04-02 21:16:35,811][0m Trial 6 finished with value: 0.9256355166435242 and parameters: {'no_components': 100, 'learning_rate': 0.002242654968023683, 'item_alpha': 0.00010639186637524714, 'user_alpha': 0.001012132344754737}. Best is trial 0 with value: 0.9329132437705994.[0m


no_comp: 100, lrn_rate: 0.00224, item_alpha: 0.00011, user_alpha: 0.00101, MRR: 0.13451031, auc_score: 0.92563552


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [02:02<00:00, 24.58s/it]
[32m[I 2023-04-02 21:20:57,593][0m Trial 7 finished with value: 0.9330645799636841 and parameters: {'no_components': 30, 'learning_rate': 0.009596136924069304, 'item_alpha': 0.00017225508600051282, 'user_alpha': 0.0032921006126597407}. Best is trial 7 with value: 0.9330645799636841.[0m


no_comp: 30, lrn_rate: 0.00960, item_alpha: 0.00017, user_alpha: 0.00329, MRR: 0.15494175, auc_score: 0.93306458


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:36<00:00, 55.21s/it]
[32m[I 2023-04-02 21:29:32,795][0m Trial 8 finished with value: 0.9169775247573853 and parameters: {'no_components': 60, 'learning_rate': 0.0006095829344930451, 'item_alpha': 0.0009318774226444935, 'user_alpha': 0.00017553021485336535}. Best is trial 7 with value: 0.9330645799636841.[0m


no_comp: 60, lrn_rate: 0.00061, item_alpha: 0.00093, user_alpha: 0.00018, MRR: 0.10604596, auc_score: 0.91697752


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:08<00:00, 49.69s/it]
[32m[I 2023-04-02 21:37:24,256][0m Trial 9 finished with value: 0.9320172071456909 and parameters: {'no_components': 60, 'learning_rate': 0.006168515700934664, 'item_alpha': 0.00045942896804523263, 'user_alpha': 0.0043603451673757745}. Best is trial 7 with value: 0.9330645799636841.[0m


no_comp: 60, lrn_rate: 0.00617, item_alpha: 0.00046, user_alpha: 0.00436, MRR: 0.15411931, auc_score: 0.93201721


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [02:08<00:00, 25.67s/it]
[32m[I 2023-04-02 21:41:39,632][0m Trial 10 finished with value: 0.9184748530387878 and parameters: {'no_components': 30, 'learning_rate': 0.0014479808354296365, 'item_alpha': 0.0030647829474133915, 'user_alpha': 0.008674557122991042}. Best is trial 7 with value: 0.9330645799636841.[0m


no_comp: 30, lrn_rate: 0.00145, item_alpha: 0.00306, user_alpha: 0.00867, MRR: 0.12002245, auc_score: 0.91847485


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:07<00:00, 61.44s/it]
[32m[I 2023-04-02 21:51:29,636][0m Trial 11 finished with value: 0.9359047412872314 and parameters: {'no_components': 80, 'learning_rate': 0.009090324814321975, 'item_alpha': 0.0018811434442359937, 'user_alpha': 0.0008623774305312679}. Best is trial 11 with value: 0.9359047412872314.[0m


no_comp: 80, lrn_rate: 0.00909, item_alpha: 0.00188, user_alpha: 0.00086, MRR: 0.16295059, auc_score: 0.93590474


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:59<00:00, 59.92s/it]
[32m[I 2023-04-02 22:01:10,863][0m Trial 12 finished with value: 0.9361629486083984 and parameters: {'no_components': 80, 'learning_rate': 0.009836220185325163, 'item_alpha': 0.0023713874072987147, 'user_alpha': 0.0014262540570272317}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 80, lrn_rate: 0.00984, item_alpha: 0.00237, user_alpha: 0.00143, MRR: 0.16469467, auc_score: 0.93616295


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:15<00:00, 63.09s/it]
[32m[I 2023-04-02 22:11:09,335][0m Trial 13 finished with value: 0.9264969825744629 and parameters: {'no_components': 80, 'learning_rate': 0.0027965077700669054, 'item_alpha': 0.0028113005744876953, 'user_alpha': 0.001326313856541654}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 80, lrn_rate: 0.00280, item_alpha: 0.00281, user_alpha: 0.00133, MRR: 0.13455562, auc_score: 0.92649698


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:52<00:00, 58.57s/it]
[32m[I 2023-04-02 22:20:37,821][0m Trial 14 finished with value: 0.9352853894233704 and parameters: {'no_components': 80, 'learning_rate': 0.009547702976402284, 'item_alpha': 0.006985579362292825, 'user_alpha': 0.0005530949537616392}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 80, lrn_rate: 0.00955, item_alpha: 0.00699, user_alpha: 0.00055, MRR: 0.16138050, auc_score: 0.93528539


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:05<00:00, 61.02s/it]
[32m[I 2023-04-02 22:30:19,847][0m Trial 15 finished with value: 0.9268054366111755 and parameters: {'no_components': 80, 'learning_rate': 0.0028649032873376635, 'item_alpha': 0.0017005209000585217, 'user_alpha': 0.0016226153292618086}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 80, lrn_rate: 0.00286, item_alpha: 0.00170, user_alpha: 0.00162, MRR: 0.13521609, auc_score: 0.92680544


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:16<00:00, 75.26s/it]
[32m[I 2023-04-02 22:43:03,994][0m Trial 16 finished with value: 0.9187120199203491 and parameters: {'no_components': 100, 'learning_rate': 0.0009983493484451327, 'item_alpha': 0.0013172035219929685, 'user_alpha': 0.0007456348036402391}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 100, lrn_rate: 0.00100, item_alpha: 0.00132, user_alpha: 0.00075, MRR: 0.12487864, auc_score: 0.91871202


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:19<00:00, 63.97s/it]
[32m[I 2023-04-02 22:53:13,870][0m Trial 17 finished with value: 0.9270442724227905 and parameters: {'no_components': 70, 'learning_rate': 0.0035552400999889896, 'item_alpha': 0.009259882333734173, 'user_alpha': 0.00044383872637924386}. Best is trial 12 with value: 0.9361629486083984.[0m


no_comp: 70, lrn_rate: 0.00356, item_alpha: 0.00926, user_alpha: 0.00044, MRR: 0.13525724, auc_score: 0.92704427


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:09<00:00, 73.97s/it]
[32m[I 2023-04-02 23:04:30,197][0m Trial 18 finished with value: 0.9362269639968872 and parameters: {'no_components': 90, 'learning_rate': 0.009631461229439779, 'item_alpha': 0.003866305614841751, 'user_alpha': 0.0014993483500464915}. Best is trial 18 with value: 0.9362269639968872.[0m


no_comp: 90, lrn_rate: 0.00963, item_alpha: 0.00387, user_alpha: 0.00150, MRR: 0.16463806, auc_score: 0.93622696


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:27<00:00, 77.41s/it]
[32m[I 2023-04-02 23:16:33,589][0m Trial 19 finished with value: 0.9236204624176025 and parameters: {'no_components': 90, 'learning_rate': 0.0018717151982849483, 'item_alpha': 0.005333240617593511, 'user_alpha': 0.001625569311570819}. Best is trial 18 with value: 0.9362269639968872.[0m


no_comp: 90, lrn_rate: 0.00187, item_alpha: 0.00533, user_alpha: 0.00163, MRR: 0.13295339, auc_score: 0.92362046


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:55<00:00, 71.11s/it]
[32m[I 2023-04-02 23:28:06,686][0m Trial 20 finished with value: 0.9299624562263489 and parameters: {'no_components': 90, 'learning_rate': 0.00422828421510483, 'item_alpha': 0.00355875603929301, 'user_alpha': 0.0011327764532981092}. Best is trial 18 with value: 0.9362269639968872.[0m


no_comp: 90, lrn_rate: 0.00423, item_alpha: 0.00356, user_alpha: 0.00113, MRR: 0.14328130, auc_score: 0.92996246


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:37<00:00, 55.59s/it]
[32m[I 2023-04-02 23:36:46,595][0m Trial 21 finished with value: 0.936339259147644 and parameters: {'no_components': 70, 'learning_rate': 0.009891877276517561, 'item_alpha': 0.001984646797898415, 'user_alpha': 0.000831004575438301}. Best is trial 21 with value: 0.936339259147644.[0m


no_comp: 70, lrn_rate: 0.00989, item_alpha: 0.00198, user_alpha: 0.00083, MRR: 0.16392823, auc_score: 0.93633926


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:33<00:00, 54.75s/it]
[32m[I 2023-04-02 23:45:30,133][0m Trial 22 finished with value: 0.928462028503418 and parameters: {'no_components': 70, 'learning_rate': 0.00387309415683631, 'item_alpha': 0.0022294383286992556, 'user_alpha': 0.0018442696344642634}. Best is trial 21 with value: 0.936339259147644.[0m


no_comp: 70, lrn_rate: 0.00387, item_alpha: 0.00223, user_alpha: 0.00184, MRR: 0.14145416, auc_score: 0.92846203


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:13<00:00, 50.62s/it]
[32m[I 2023-04-02 23:53:35,166][0m Trial 23 finished with value: 0.9329577088356018 and parameters: {'no_components': 70, 'learning_rate': 0.007014620210962102, 'item_alpha': 0.004506417344147944, 'user_alpha': 0.0009945113745215468}. Best is trial 21 with value: 0.936339259147644.[0m


no_comp: 70, lrn_rate: 0.00701, item_alpha: 0.00451, user_alpha: 0.00099, MRR: 0.16003096, auc_score: 0.93295771


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:09<00:00, 61.96s/it]
[32m[I 2023-04-03 00:03:34,819][0m Trial 24 finished with value: 0.9375075697898865 and parameters: {'no_components': 90, 'learning_rate': 0.009989585476310485, 'item_alpha': 0.0011060554737823125, 'user_alpha': 0.00039125844441558665}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 90, lrn_rate: 0.00999, item_alpha: 0.00111, user_alpha: 0.00039, MRR: 0.16705135, auc_score: 0.93750757


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:48<00:00, 69.68s/it]
[32m[I 2023-04-03 00:15:07,872][0m Trial 25 finished with value: 0.934459388256073 and parameters: {'no_components': 100, 'learning_rate': 0.0065250302209355, 'item_alpha': 0.0009413023729381945, 'user_alpha': 0.00037575471827829}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 100, lrn_rate: 0.00653, item_alpha: 0.00094, user_alpha: 0.00038, MRR: 0.16234662, auc_score: 0.93445939


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:29<00:00, 65.97s/it]
[32m[I 2023-04-03 00:26:10,359][0m Trial 26 finished with value: 0.9299412965774536 and parameters: {'no_components': 90, 'learning_rate': 0.004036621155111975, 'item_alpha': 0.0014062853396117707, 'user_alpha': 0.00030480105060731857}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 90, lrn_rate: 0.00404, item_alpha: 0.00141, user_alpha: 0.00030, MRR: 0.14403403, auc_score: 0.92994130


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:31<00:00, 66.33s/it]
[32m[I 2023-04-03 00:37:00,584][0m Trial 27 finished with value: 0.9341044425964355 and parameters: {'no_components': 90, 'learning_rate': 0.006963832588338723, 'item_alpha': 0.0038464404422212006, 'user_alpha': 0.000508148358393203}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 90, lrn_rate: 0.00696, item_alpha: 0.00385, user_alpha: 0.00051, MRR: 0.16185331, auc_score: 0.93410444


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:32<00:00, 54.55s/it]
[32m[I 2023-04-03 00:45:44,834][0m Trial 28 finished with value: 0.9315229058265686 and parameters: {'no_components': 70, 'learning_rate': 0.005214156344218886, 'item_alpha': 0.0007123629078545552, 'user_alpha': 0.0007515767632411423}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 70, lrn_rate: 0.00521, item_alpha: 0.00071, user_alpha: 0.00075, MRR: 0.15202473, auc_score: 0.93152291


Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [06:19<00:00, 75.93s/it]
[32m[I 2023-04-03 00:58:03,416][0m Trial 29 finished with value: 0.9349851608276367 and parameters: {'no_components': 100, 'learning_rate': 0.0070418603205294385, 'item_alpha': 0.0012045103952348882, 'user_alpha': 0.0007012082238262066}. Best is trial 24 with value: 0.9375075697898865.[0m


no_comp: 100, lrn_rate: 0.00704, item_alpha: 0.00120, user_alpha: 0.00070, MRR: 0.16108523, auc_score: 0.93498516
{'no_components': 90, 'learning_rate': 0.009989585476310485, 'item_alpha': 0.0011060554737823125, 'user_alpha': 0.00039125844441558665}


BEST no_comp: 120, lrn_rate: 0.01000, item_alpha: 0.05000, user_alpha: 0.01000, precision: 0.00468, recall: 0.02312, auc_score: 0.80381

In [40]:
print("Best Params : {}".format(study.best_params))
print()
print("Best Trials : {}".format(study.best_trials))
print()
print("Best Values : {}".format(study.best_value))

Best Params : {'no_components': 90, 'learning_rate': 0.009989585476310485, 'item_alpha': 0.0011060554737823125, 'user_alpha': 0.00039125844441558665}

Best Trials : [FrozenTrial(number=24, state=TrialState.COMPLETE, values=[0.9375075697898865], datetime_start=datetime.datetime(2023, 4, 2, 23, 53, 35, 167154), datetime_complete=datetime.datetime(2023, 4, 3, 0, 3, 34, 819516), params={'no_components': 90, 'learning_rate': 0.009989585476310485, 'item_alpha': 0.0011060554737823125, 'user_alpha': 0.00039125844441558665}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'no_components': IntDistribution(high=100, log=False, low=30, step=10), 'learning_rate': FloatDistribution(high=0.01, log=True, low=0.0001, step=None), 'item_alpha': FloatDistribution(high=0.01, log=True, low=0.0001, step=None), 'user_alpha': FloatDistribution(high=0.01, log=True, low=0.0001, step=None)}, trial_id=24, value=None)]

Best Values : 0.9375075697898865


### 시간까지 고려했을 때
---
no_comp: 70, lrn_rate: 0.00989, item_alpha: 0.00198, user_alpha: 0.00083, MRR: 0.16392823, auc_score: 0.93633926

In [41]:
model = LightFM(
    no_components=70, learning_rate=0.00989, item_alpha=0.00198, user_alpha=0.00083, learning_schedule='adagrad',loss="warp", random_state=42
)
%time model.fit(interactions=train_interactions, sample_weight=train_weights, item_features=item_meta, user_features=user_meta, epochs=5, verbose=True)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [04:20<00:00, 52.13s/it]

CPU times: total: 1min 46s
Wall time: 4min 20s





<lightfm.lightfm.LightFM at 0x22e4c9bb220>

In [42]:
%time test_rating_source = list(zip(test_rating['user_id'], test_rating['whisky_id'], test_rating['rating']))

CPU times: total: 31.2 ms
Wall time: 83 ms


In [43]:
test_interactions, _ = dataset.build_interactions(test_rating_source)

In [44]:
%time test_precision = precision_at_k(model, test_interactions, user_features=user_meta, item_features=item_meta, k=9).mean()
%time test_recall = recall_at_k(model, test_interactions,  user_features=user_meta, item_features=item_meta, k=9).mean()
%time test_auc = auc_score(model, test_interactions, user_features=user_meta, item_features=item_meta).mean()
%time test_rep = reciprocal_rank(model, test_interactions=test_interactions, user_features=user_meta, item_features=item_meta).mean()


print("Train precision: %.8f" % test_precision)
print("Test recall: %.8f" % test_recall)
print("Test AUC Score : %.8f" % test_auc)
print("Test MRR : %.8f" % test_rep)

CPU times: total: 52 s
Wall time: 2min 10s
CPU times: total: 47.9 s
Wall time: 2min 7s
CPU times: total: 44.2 s
Wall time: 2min 2s
CPU times: total: 42.7 s
Wall time: 2min 8s
Train precision: 0.04280206
Test recall: 0.18193766
Test AUC Score : 0.93622875
Test MRR : 0.16574501


CPU times: total: 52 s          Wall time: 2min 10s        Train precision: 0.04280206 <br>
CPU times: total: 47.9 s        Wall time: 2min 7s        Test precision: 0.18193766 <br>
CPU times: total: 44.2 s        Wall time: 2min 2s        Test AUC Score : 0.93622875 <br>
CPU times: total: 42.7 s        Wall time: 2min 8s        MRR 0.16574501

In [45]:
# save the model to a file
with open('model/whizzle_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [46]:
with open("model/whizzle_dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)