In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from tqdm.notebook import tqdm
import pickle

from scipy.sparse import coo_matrix, csr_matrix

from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k

from hyperopt import fmin, hp, tpe, Trials



In [2]:
item_features = pd.read_csv("dataset/item_features.csv", index_col=0, encoding="UTF-8")
user_features = pd.read_csv("dataset/user_features.csv", index_col=0, encoding="UTF-8")
rating = pd.read_csv("dataset/rating.csv", index_col=0, encoding="UTF-8")
whisky = pd.read_csv("dataset/whisky.csv", index_col=0, encoding="UTF-8")

In [3]:
item_features = csr_matrix(item_features)
user_features = csr_matrix(user_features)

In [8]:
print(user_features)

  (0, 0)	5
  (0, 1)	30
  (0, 2)	85
  (0, 3)	50
  (0, 4)	30
  (0, 5)	20
  (0, 6)	80
  (0, 7)	80
  (0, 8)	85
  (0, 9)	15
  (0, 10)	20
  (0, 11)	25
  (0, 12)	85
  (0, 13)	50
  (1, 0)	5
  (1, 1)	30
  (1, 3)	40
  (1, 4)	10
  (1, 5)	30
  (1, 6)	70
  (1, 7)	80
  (1, 8)	80
  (1, 9)	10
  (1, 10)	60
  (1, 11)	20
  :	:
  (119512, 7)	40
  (119512, 8)	100
  (119512, 10)	55
  (119512, 12)	30
  (119512, 13)	10
  (119513, 0)	1
  (119513, 1)	31
  (119513, 3)	40
  (119513, 4)	29
  (119513, 6)	30
  (119513, 7)	40
  (119513, 8)	100
  (119513, 10)	55
  (119513, 12)	30
  (119513, 13)	10
  (119514, 0)	1
  (119514, 1)	31
  (119514, 3)	40
  (119514, 4)	29
  (119514, 6)	30
  (119514, 7)	40
  (119514, 8)	100
  (119514, 10)	55
  (119514, 12)	30
  (119514, 13)	10


In [9]:
unique_user = rating["user_id"].unique()

In [10]:
unique_user

array([     0,      1,      2, ..., 119512, 119513, 119514], dtype=int64)

In [None]:
rating.groupby()

### make Interactions 

In [11]:
rating.shape

(908176, 3)

In [12]:
def create_user_item_interactions(rating, n_users, n_items):
    interactions = coo_matrix(
        (
            rating['rating'].values,
            (rating['user_id'].values, rating["whisky_id"].values)
        ),
        shape=(n_users, n_items)
    )
    return interactions

In [13]:
interactions = create_user_item_interactions(rating, rating["user_id"].nunique(), whisky["whisky_id"].nunique())

In [14]:
print(interactions)
interactions

  (0, 0)	9.0
  (1, 0)	7.0
  (2, 0)	9.0
  (3, 0)	10.0
  (4, 0)	9.0
  (5, 0)	7.0
  (6, 0)	10.0
  (7, 0)	7.0
  (8, 0)	10.0
  (9, 0)	10.0
  (10, 0)	9.0
  (11, 0)	9.0
  (12, 0)	8.0
  (13, 0)	8.0
  (14, 0)	10.0
  (15, 0)	10.0
  (16, 0)	9.0
  (17, 0)	9.0
  (18, 0)	8.0
  (19, 0)	9.0
  (20, 0)	9.0
  (21, 0)	10.0
  (22, 0)	9.0
  (23, 0)	9.0
  (24, 0)	9.0
  :	:
  (7636, 3531)	4.0
  (56292, 3531)	2.0
  (2670, 3531)	2.0
  (786, 3531)	2.0
  (5942, 3531)	6.0
  (15561, 3531)	2.0
  (25623, 3532)	6.0
  (2885, 3532)	6.0
  (7368, 3533)	2.0
  (69554, 3534)	10.0
  (4484, 3534)	2.0
  (85727, 3534)	5.0
  (43900, 3534)	4.0
  (16070, 3534)	8.0
  (29998, 3534)	8.0
  (7895, 3534)	4.0
  (104052, 3534)	5.0
  (31152, 3534)	2.0
  (119512, 3534)	6.0
  (2509, 3534)	4.0
  (4828, 3534)	4.0
  (119513, 3534)	2.0
  (119514, 3534)	8.0
  (3123, 3534)	4.0
  (95648, 3534)	2.0


<119515x3535 sparse matrix of type '<class 'numpy.float64'>'
	with 908176 stored elements in COOrdinate format>

#### Train_Test data split

In [15]:
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

In [13]:
train_interactions, valid_interactions = random_train_test_split(train_interactions, test_percentage=0.2, random_state=42)

### Hyper Parameter Optimization by using HyperOPT

In [10]:
trials = Trials()
space = [
    hp.choice('no_components', range(10,100,10)),
    hp.uniform('learning_rate', 0.01, 0.05),
    hp.uniform('item_alpha', 1e-05, 5e-05),
]

In [17]:
def objective(params):
    no_components, learning_rate, item_alpha = params

    model = LightFM(no_components=no_components,
                    learning_schedule='adagrad',
                    loss='warp',
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    random_state=0)

    model.fit(interactions=train_interactions,
              user_features=user_features,
              item_features=item_features,
              epochs=5,
              verbose=True)

    test_precision = precision_at_k(model, valid_interactions, k=10, item_features=item_features, user_features=user_features).mean()
    test_recall = recall_at_k(model, valid_interactions, k=10, item_features=item_features, user_features=user_features).mean()
    test_auc = auc_score(model, valid_interactions, item_features=item_features, user_features=user_features).mean()
    
    print("no_comp: {}, lrn_rate: {:.5f}, item_alpha: {:.5f}, precision: {:.5f}, recall: {:.5f}, auc_score: {:.5f}".format(
      no_components, learning_rate, item_alpha, test_precision, test_recall, test_auc))
    output = -test_auc


    return output

In [None]:
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

best_params

  0%|                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]

Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]
[A
Epoch:  20%|###############4                                                             | 1/5 [00:55<03:42, 55.65s/it]
[A
Epoch:  40%|##############################8                                              | 2/5 [01:49<02:44, 54.80s/it]
[A
Epoch:  60%|##############################################1                              | 3/5 [02:44<01:49, 54.56s/it]
[A
Epoch:  80%|#############################################################6               | 4/5 [03:38<00:54, 54.56s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [04:36<00:00, 55.74s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [04:36<00:00, 55.31s/it]


no_comp: 70, lrn_rate: 0.01352, item_alpha: 0.00002, precision: 0.00013, recall: 0.00038, auc_score: 0.36489           
 10%|████▌                                        | 1/10 [08:58<1:20:44, 538.23s/trial, best loss: -0.3648923337459564]

Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]
[A
Epoch:  20%|###############4                                                             | 1/5 [01:10<04:41, 70.27s/it]
[A
Epoch:  40%|##############################8                                              | 2/5 [02:22<03:33, 71.30s/it]
[A
Epoch:  60%|##############################################1                              | 3/5 [03:34<02:23, 71.77s/it]
[A
Epoch:  80%|#############################################################6               | 4/5 [04:49<01:12, 72.87s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [06:01<00:00, 72.57s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [06:01<00:00, 72.24s/it]


no_comp: 90, lrn_rate: 0.02640, item_alpha: 0.00003, precision: 0.00017, recall: 0.00045, auc_score: 0.37131           
 20%|█████████                                    | 2/10 [20:49<1:25:21, 640.23s/trial, best loss: -0.3713105320930481]

Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]
[A
Epoch:  20%|###############4                                                             | 1/5 [00:44<02:59, 44.76s/it]
[A
Epoch:  40%|##############################8                                              | 2/5 [01:26<02:09, 43.19s/it]
[A
Epoch:  60%|##############################################1                              | 3/5 [02:10<01:27, 43.55s/it]
[A
Epoch:  80%|#############################################################6               | 4/5 [02:54<00:43, 43.45s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [03:38<00:00, 43.73s/it]
[A
Epoch: 100%|#############################################################################| 5/5 [03:38<00:00, 43.67s/it]


In [16]:
model = LightFM(no_components=40,
                learning_schedule='adagrad',
                loss='warp',
                learning_rate=0.011370592645615374,
                item_alpha=1.975734039413079e-05,
                random_state=42)

In [21]:
user_features.shape

(119515, 14)

In [23]:
%time model.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=10, verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [06:30<00:00, 39.00s/it]

CPU times: total: 6min 27s
Wall time: 6min 30s





<lightfm.lightfm.LightFM at 0x245c12d6d60>

#### Evaluation

In [None]:
# Load the saved model
with open('origin_user_rating_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [26]:
print("Train precision: %.5f" % precision_at_k(model, train_interactions, user_features=user_features, item_features=item_features, k=9).mean())
print("Test precision: %.5f" % precision_at_k(model, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean())

Train precision: 0.00065
Test precision: 0.00027


In [27]:
test_auc = auc_score(model, test_interactions, user_features=user_features, item_features=item_features).mean()
print(test_auc)

0.44155654


#### save

In [None]:
# save the model to a file
with open('origin_user_rating_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [28]:
model2 = LightFM(no_components=20,
                learning_schedule='adagrad',
                loss='warp',
                learning_rate=0.011370592645615374,
                item_alpha=1.975734039413079e-05,
                random_state=42)

In [29]:
%time model2.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=10, verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [03:11<00:00, 19.14s/it]

CPU times: total: 3min 11s
Wall time: 3min 11s





<lightfm.lightfm.LightFM at 0x245ca0fdfa0>

In [30]:
print("Train precision: %.5f" % precision_at_k(model2, train_interactions, user_features=user_features, item_features=item_features, k=9).mean())
print("Test precision: %.5f" % precision_at_k(model2, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean())

Train precision: 0.00098
Test precision: 0.00043


In [31]:
test_auc = auc_score(model2, test_interactions, user_features=user_features, item_features=item_features).mean()
print(test_auc)

0.46701562


In [32]:
model3 = LightFM(no_components=10,
                learning_schedule='adagrad',
                loss='warp',
                learning_rate=0.011370592645615374,
                item_alpha=1.975734039413079e-05,
                random_state=42)

In [33]:
%time model3.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=10, verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [01:40<00:00, 10.02s/it]

CPU times: total: 1min 40s
Wall time: 1min 40s





<lightfm.lightfm.LightFM at 0x245ca0fd070>

In [34]:
print("Train precision: %.5f" % precision_at_k(model3, train_interactions, user_features=user_features, item_features=item_features, k=9).mean())
print("Test precision: %.5f" % precision_at_k(model3, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean())

Train precision: 0.00046
Test precision: 0.00022


In [35]:
test_auc = auc_score(model3, test_interactions, user_features=user_features, item_features=item_features).mean()
print(test_auc)

0.5863698
