# 기존 코드 이용
---
- Min-Max Normalization
- user_features : price_tier, flavor(13 columns)
- item_features : Categorical [ category, price_tier, abv, flavor(13 columns) ]
- interactions : 기본 값 [ rating.csv ] + new_user

---
## 재학습 최적화
---
- model.fit_partial이 있지만, 학습 시간은 fit과 별 차이가 없음을 확인했습니다.
- dataset.fit_partial 또한 기존 라이브러리로는 item_features, user_features 재학습에 어려움이 있어 매번 새로 불러와야 합니다.
- user_features의 경우 재학습 시 학습된 취향 정보가 들어가고, 이후 예측에서는 그 당시 입력받은 취향(입맛) 정보를 사용합니다.


In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from tqdm.notebook import tqdm
import pickle

from scipy.sparse import coo_matrix, csr_matrix

from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k

from hyperopt import fmin, hp, tpe, Trials



In [2]:
# Raw rating / whisky tables; the first CSV column becomes the index.
rating = pd.read_csv("../dataset/rating.csv", encoding="UTF-8", index_col=0)
whisky = pd.read_csv("../dataset/whisky.csv", encoding="UTF-8", index_col=0)
# Pre-computed feature tables.
# NOTE(review): these are read from "dataset/" while the raw tables come from
# "../dataset/" — confirm the two directories are intentional.
item_features = pd.read_csv("dataset/item_features.csv", encoding="UTF-8", index_col=0)
user_features = pd.read_csv("dataset/user_features.csv", encoding="UTF-8", index_col=0)

In [3]:
item_features

Unnamed: 0,category,price_tier,abv,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0,0.000,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.000000
1,1,4,0.260,0.30,0.85,0.50,0.30,0.20,0.8,0.80,0.85,0.166667,0.20,0.25,0.85,0.526316
2,12,3,0.260,0.40,0.30,0.40,0.20,0.40,0.7,0.80,0.70,0.444444,0.50,0.50,0.70,0.210526
3,5,4,0.542,0.15,0.00,0.20,0.00,0.15,0.8,0.90,0.85,0.055556,0.30,0.25,0.35,0.000000
4,5,3,0.569,0.40,0.00,0.65,0.50,0.20,0.6,0.60,0.45,0.000000,0.60,0.60,0.45,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3531,8,0,0.260,0.35,0.00,0.20,0.00,0.00,0.4,0.55,0.85,0.000000,0.25,0.10,0.10,0.000000
3532,2,0,0.200,0.00,0.00,0.40,0.20,0.00,0.3,0.00,1.00,0.000000,1.00,0.00,0.60,0.105263
3533,0,2,0.240,0.90,0.10,0.10,0.10,0.40,0.1,0.10,0.90,0.000000,0.50,0.20,0.30,0.105263
3534,9,1,0.290,0.00,0.00,0.10,1.00,0.80,0.0,0.00,0.40,0.000000,0.20,0.00,0.00,0.000000


In [4]:
# Cast only the price_tier column to an integer dtype; other columns are untouched.
user_features = user_features.astype({"price_tier": "int"})

In [5]:
user_features

Unnamed: 0,price_tier,smoky,peaty,spicy,herbal,oily,body,rich,sweet,salty,vanilla,tart,fruity,floral
0,0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.000000,0.00,0.00,0.00,0.000000
1,5,0.30,0.85,0.50,0.30,0.20,0.8,0.8,0.85,0.166667,0.20,0.25,0.85,0.526316
2,5,0.30,0.00,0.40,0.10,0.30,0.7,0.8,0.80,0.111111,0.60,0.20,0.90,0.105263
3,5,0.25,0.00,0.45,0.20,0.15,0.5,0.5,0.60,0.000000,0.30,0.55,0.70,0.157895
4,5,0.30,0.85,0.50,0.30,0.20,0.8,0.8,0.85,0.166667,0.20,0.25,0.85,0.526316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119511,5,0.00,0.00,0.40,0.20,0.00,0.3,0.0,1.00,0.000000,1.00,0.00,0.60,0.105263
119512,5,0.00,0.00,0.40,0.20,0.00,0.3,0.0,1.00,0.000000,1.00,0.00,0.60,0.105263
119513,5,0.31,0.00,0.40,0.29,0.00,0.3,0.4,1.00,0.000000,0.55,0.00,0.30,0.105263
119514,5,0.31,0.00,0.40,0.29,0.00,0.3,0.4,1.00,0.000000,0.55,0.00,0.30,0.105263


In [6]:
# Convert both feature tables to CSR sparse matrices (row i = id i), the form
# later passed to LightFM as user_features / item_features.
# NOTE(review): category / price_tier enter as raw numeric codes, not one-hot
# columns — confirm that is the intended encoding.
user_features = csr_matrix(user_features.to_numpy())
item_features = csr_matrix(item_features.to_numpy())

In [7]:
print(user_features)

  (1, 0)	5.0
  (1, 1)	0.3
  (1, 2)	0.85
  (1, 3)	0.5
  (1, 4)	0.3
  (1, 5)	0.2
  (1, 6)	0.8
  (1, 7)	0.8
  (1, 8)	0.85
  (1, 9)	0.1666666666666666
  (1, 10)	0.2
  (1, 11)	0.25
  (1, 12)	0.85
  (1, 13)	0.5263157894736842
  (2, 0)	5.0
  (2, 1)	0.3
  (2, 3)	0.4
  (2, 4)	0.1
  (2, 5)	0.3
  (2, 6)	0.7000000000000001
  (2, 7)	0.8
  (2, 8)	0.8
  (2, 9)	0.1111111111111111
  (2, 10)	0.6
  (2, 11)	0.2
  :	:
  (119513, 7)	0.4
  (119513, 8)	1.0
  (119513, 10)	0.55
  (119513, 12)	0.3
  (119513, 13)	0.1052631578947368
  (119514, 0)	5.0
  (119514, 1)	0.31
  (119514, 3)	0.4
  (119514, 4)	0.29
  (119514, 6)	0.3
  (119514, 7)	0.4
  (119514, 8)	1.0
  (119514, 10)	0.55
  (119514, 12)	0.3
  (119514, 13)	0.1052631578947368
  (119515, 0)	5.0
  (119515, 1)	0.31
  (119515, 3)	0.4
  (119515, 4)	0.29
  (119515, 6)	0.3
  (119515, 7)	0.4
  (119515, 8)	1.0
  (119515, 10)	0.55
  (119515, 12)	0.3
  (119515, 13)	0.1052631578947368


In [8]:
# Distinct user ids that appear in the ratings, in order of first appearance.
unique_user = pd.unique(rating["user_id"])

In [9]:
unique_user

array([     1,      2,      3, ..., 119513, 119514, 119515], dtype=int64)

### make Interactions 

In [10]:
rating

Unnamed: 0,user_id,whisky_id,rating
0,1,1,9.0
1,2,1,7.0
2,3,1,9.0
3,4,1,10.0
4,5,1,9.0
...,...,...,...
908171,4829,3535,4.0
908172,119514,3535,2.0
908173,119515,3535,8.0
908174,3124,3535,4.0


In [11]:
# Register ids 0..N with the LightFM Dataset, where N is the number of distinct
# ids, so id 0 is included in addition to the ids found in the tables.
# NOTE(review): this assumes ids are contiguous (1..N) — confirm before adding
# users/items with gaps in their ids.
dataset = Dataset()
dataset.fit(
    users=np.arange(rating["user_id"].nunique() + 1),
    items=np.arange(whisky["whisky_id"].nunique() + 1),
)

In [12]:
rating

Unnamed: 0,user_id,whisky_id,rating
0,1,1,9.0
1,2,1,7.0
2,3,1,9.0
3,4,1,10.0
4,5,1,9.0
...,...,...,...
908171,4829,3535,4.0
908172,119514,3535,2.0
908173,119515,3535,8.0
908174,3124,3535,4.0


In [13]:
%time interactions, weights = dataset.build_interactions([tuple(x) for x in rating.itertuples(index=False)])

CPU times: total: 2.02 s
Wall time: 2.02 s


In [14]:
print(interactions)

  (1, 1)	1
  (2, 1)	1
  (3, 1)	1
  (4, 1)	1
  (5, 1)	1
  (6, 1)	1
  (7, 1)	1
  (8, 1)	1
  (9, 1)	1
  (10, 1)	1
  (11, 1)	1
  (12, 1)	1
  (13, 1)	1
  (14, 1)	1
  (15, 1)	1
  (16, 1)	1
  (17, 1)	1
  (18, 1)	1
  (19, 1)	1
  (20, 1)	1
  (21, 1)	1
  (22, 1)	1
  (23, 1)	1
  (24, 1)	1
  (25, 1)	1
  :	:
  (7637, 3532)	1
  (56293, 3532)	1
  (2671, 3532)	1
  (787, 3532)	1
  (5943, 3532)	1
  (15562, 3532)	1
  (25624, 3533)	1
  (2886, 3533)	1
  (7369, 3534)	1
  (69555, 3535)	1
  (4485, 3535)	1
  (85728, 3535)	1
  (43901, 3535)	1
  (16071, 3535)	1
  (29999, 3535)	1
  (7896, 3535)	1
  (104053, 3535)	1
  (31153, 3535)	1
  (119513, 3535)	1
  (2510, 3535)	1
  (4829, 3535)	1
  (119514, 3535)	1
  (119515, 3535)	1
  (3124, 3535)	1
  (95649, 3535)	1


In [15]:
print(weights)
weights.toarray()

  (1, 1)	9.0
  (2, 1)	7.0
  (3, 1)	9.0
  (4, 1)	10.0
  (5, 1)	9.0
  (6, 1)	7.0
  (7, 1)	10.0
  (8, 1)	7.0
  (9, 1)	10.0
  (10, 1)	10.0
  (11, 1)	9.0
  (12, 1)	9.0
  (13, 1)	8.0
  (14, 1)	8.0
  (15, 1)	10.0
  (16, 1)	10.0
  (17, 1)	9.0
  (18, 1)	9.0
  (19, 1)	8.0
  (20, 1)	9.0
  (21, 1)	9.0
  (22, 1)	10.0
  (23, 1)	9.0
  (24, 1)	9.0
  (25, 1)	9.0
  :	:
  (7637, 3532)	4.0
  (56293, 3532)	2.0
  (2671, 3532)	2.0
  (787, 3532)	2.0
  (5943, 3532)	6.0
  (15562, 3532)	2.0
  (25624, 3533)	6.0
  (2886, 3533)	6.0
  (7369, 3534)	2.0
  (69555, 3535)	10.0
  (4485, 3535)	2.0
  (85728, 3535)	5.0
  (43901, 3535)	4.0
  (16071, 3535)	8.0
  (29999, 3535)	8.0
  (7896, 3535)	4.0
  (104053, 3535)	5.0
  (31153, 3535)	2.0
  (119513, 3535)	6.0
  (2510, 3535)	4.0
  (4829, 3535)	4.0
  (119514, 3535)	2.0
  (119515, 3535)	8.0
  (3124, 3535)	4.0
  (95649, 3535)	2.0


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 9., 0., ..., 0., 0., 0.],
       [0., 7., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 6.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 8.]], dtype=float32)

In [16]:
interactions

<119516x3536 sparse matrix of type '<class 'numpy.int32'>'
	with 908176 stored elements in COOrdinate format>

In [17]:
rating.shape

(908176, 3)

## Train_Test data split

In [18]:
# Hold out 20% as the test split. The same call (same seed) is applied to the
# binary interactions and to the rating weights — presumably so both matrices
# are partitioned the same way.
(train_interactions, test_interactions), (train_weights, test_weights) = (
    random_train_test_split(matrix, test_percentage=0.2, random_state=42)
    for matrix in (interactions, weights)
)

In [30]:
# Carve a validation split (20%) out of the current training split; the
# training matrices are rebound to the remaining 80%.
(train_interactions, valid_interactions), (train_weights, valid_weights) = (
    random_train_test_split(matrix, test_percentage=0.2, random_state=42)
    for matrix in (train_interactions, train_weights)
)

In [19]:
train_interactions.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [20]:
train_weights.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 9., 0., ..., 0., 0., 0.],
       [0., 7., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 8.]], dtype=float32)

## Optuna 사용한 HyperParameter 최적화
---
- learning_rate, alpha 등의 하이퍼 파라미터 값을 작게 설정했을 때보다 높게 설정했을 때 AUC Score가 높게 나오는 경향이 보였습니다.
- HyperOPT는 베이지안 최적화 접근 기반인데, 이 부분을 제대로 알지 못하기 때문에 전체를 돌려보는 Optuna 방식으로 변경하겠습니다.
- Optuna의 경우 시각화도 가능하고, GridSearchCV보다 빠르다는 장점이 있습니다.

In [None]:
import optuna

In [None]:
def objective(trial):
    """Train one LightFM model for an Optuna trial and return its validation AUC.

    Samples ``no_components``, ``learning_rate``, ``item_alpha`` and
    ``user_alpha`` from ``trial``, fits a WARP model for 5 epochs on the
    notebook-level ``train_interactions`` / ``train_weights`` with the
    notebook-level ``user_features`` / ``item_features``, and evaluates it on
    ``valid_interactions``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean AUC over the validation interactions.
    """
    # Hyperparameters to tune (the first three entries are fixed).
    params = {
        "learning_schedule": "adagrad",
        "loss": "warp",
        "random_state": 42,
        # `step` is keyword-only in current Optuna; passing it positionally
        # is deprecated.
        "no_components": trial.suggest_int("no_components", 30, 100, step=10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True),
        "item_alpha": trial.suggest_float("item_alpha", 1e-5, 1e-2, log=True),
        "user_alpha": trial.suggest_float("user_alpha", 1e-5, 1e-2, log=True),
    }

    model = LightFM(**params)

    model.fit(interactions=train_interactions,
              sample_weight=train_weights,
              user_features=user_features,
              item_features=item_features,
              epochs=5,
              verbose=True)

    # NOTE(review): training positives are not masked during evaluation (no
    # train_interactions argument) — confirm this is intended.
    feature_kwargs = {"item_features": item_features, "user_features": user_features}
    valid_precision = precision_at_k(model, valid_interactions, k=9, **feature_kwargs).mean()
    valid_recall = recall_at_k(model, valid_interactions, k=9, **feature_kwargs).mean()
    valid_auc = auc_score(model, valid_interactions, **feature_kwargs).mean()

    print("no_comp: {}, lrn_rate: {:.5f}, item_alpha: {:.5f}, user_alpha: {:.5f}, precision: {:.5f}, recall: {:.5f}, auc_score: {:.5f}".format(
      params["no_components"], params["learning_rate"], params["item_alpha"], params["user_alpha"], valid_precision, valid_recall, valid_auc))
    return valid_auc

In [None]:
# Run 30 trials, maximizing the value returned by `objective`, then show the
# hyperparameters of the best trial.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)
print(study.best_params)

BEST no_comp: 120, lrn_rate: 0.01000, item_alpha: 0.05000, user_alpha: 0.01000, precision: 0.00468, recall: 0.02312, auc_score: 0.80381

In [None]:
# Summarize the finished study; a blank line separates the three entries.
print(f"Best Params : {study.best_params}", end="\n\n")
print(f"Best Trials : {study.best_trials}", end="\n\n")
print(f"Best Values : {study.best_value}")

## Best Parameter fitting

In [None]:
model = LightFM(
    no_components=120, learning_rate=0.01000, item_alpha=0.0500, user_alpha=0.01000,learning_schedule='adagrad',loss="warp", random_state=42
)
%time model.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=10, verbose=True)

In [None]:
# Evaluate `model` on the held-out test interactions; each metric is the mean
# of the values returned by the LightFM evaluation helper (precision/recall at k=9).
test_precision = precision_at_k(model, test_interactions, user_features=user_features, item_features=item_features, k=9).mean()
test_recall = recall_at_k(model, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean()
test_auc = auc_score(model, test_interactions, user_features=user_features, item_features=item_features).mean()

# Each label names the metric and split that is actually printed.
print("Test precision: %.5f" % test_precision)
print("Test recall: %.5f" % test_recall)
print("Test AUC Score : %.5f" % test_auc)

In [None]:
model2 = LightFM(
    no_components=120, learning_rate=0.01000, item_alpha=0.0500, user_alpha=0.01000,learning_schedule='adagrad',loss="warp", random_state=42
)
%time model2.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=5, verbose=True)

In [None]:
# Evaluate `model2` on the held-out test interactions; each metric is the mean
# of the values returned by the LightFM evaluation helper (precision/recall at k=9).
test_precision = precision_at_k(model2, test_interactions, user_features=user_features, item_features=item_features, k=9).mean()
test_recall = recall_at_k(model2, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean()
test_auc = auc_score(model2, test_interactions, user_features=user_features, item_features=item_features).mean()

# Each label names the metric and split that is actually printed.
print("Test precision: %.5f" % test_precision)
print("Test recall: %.5f" % test_recall)
print("Test AUC Score : %.5f" % test_auc)

# 시간 고려한 최적 모델
---
- 시간까지 고려했을 때, 아래 hyper parameter가 좀 더 합리적이다

BEST no_comp: 60, lrn_rate: 0.01000, item_alpha: 0.05000, user_alpha: 0.01000, precision: 0.00430, recall: 0.02030, auc_score: 0.80244

In [21]:
model3 = LightFM(
    no_components=60, learning_rate=0.01000, item_alpha=0.0500, user_alpha=0.01000,learning_schedule='adagrad',loss="warp", random_state=42
)
%time model3.fit(interactions=train_interactions, user_features=user_features, item_features=item_features, epochs=5, verbose=True)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [05:44<00:00, 68.92s/it]

CPU times: total: 5min 42s
Wall time: 5min 44s





<lightfm.lightfm.LightFM at 0x21a3dcf9ac0>

In [22]:
# Evaluate `model3` on the held-out test interactions; each metric is the mean
# of the values returned by the LightFM evaluation helper (precision/recall at k=9).
test_precision = precision_at_k(model3, test_interactions, user_features=user_features, item_features=item_features, k=9).mean()
test_recall = recall_at_k(model3, test_interactions,  user_features=user_features, item_features=item_features, k=9).mean()
test_auc = auc_score(model3, test_interactions, user_features=user_features, item_features=item_features).mean()

# Each label names the metric and split that is actually printed.
print("Test precision: %.5f" % test_precision)
print("Test recall: %.5f" % test_recall)
print("Test AUC Score : %.5f" % test_auc)

Train precision: 0.00513
Test precision: 0.02267
Test AUC Score : 0.79337


In [23]:
## New User Predict
# Score all 3536 item ids for a user described only by a feature row.
# The one-row matrix holds 14 values (price_tier followed by 13 flavor values)
# and user_ids=0 refers to that single row.
scores = model3.predict(
    user_ids=0,
    item_ids=list(range(3536)),
    user_features=csr_matrix(
        [1, 0.25, 0.50, 0.45, 0.20, 0.15, 0.5, 0.5, 0.60, 0.000000, 0.30, 0.55, 0.70, 0.157895]
    ),
    item_features=item_features,
)

In [24]:
# Ascending sort of the raw scores. NOTE(review): the position -> item id
# correspondence is lost after this; use an argsort if item ids are needed.
scores = np.sort(scores)

In [25]:
scores

array([-167.91301, -167.16469, -167.12218, ..., -157.2761 , -157.2585 ,
       -156.91304], dtype=float32)

In [26]:
## Existing-user prediction
# Score all 3536 item ids for user 5168, using the full feature matrices the
# model was trained with.
scores = model3.predict(
    user_ids=5168,
    item_ids=list(range(3536)),
    user_features=user_features,
    item_features=item_features,
)

In [27]:
# Ascending sort of the raw scores. NOTE(review): the position -> item id
# correspondence is lost after this; use an argsort if item ids are needed.
scores = np.sort(scores)

In [28]:
scores

array([-298.1379 , -297.72812, -297.6871 , ..., -291.6734 , -291.65845,
       -291.44577], dtype=float32)

## Test Data Save
---
- 모델 재학습 시 필요한 테스트 데이터로 저장해둡니다.

In [29]:
# Dense user x item array of the held-out ratings (0 where there is no test
# rating). The element-wise product is taken while both operands are still
# sparse, so a single dense array is allocated instead of two dense operands
# plus their dense product.
test_mat = test_interactions.multiply(test_weights).toarray()

In [30]:
test_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 6.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# Wrap the dense test matrix in a DataFrame with default integer row/column
# labels, then display it.
test_df = pd.DataFrame(data=test_mat)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3526,3527,3528,3529,3530,3531,3532,3533,3534,3535
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
119512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
119514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [32]:
# (row positions, column positions) of every non-zero cell of test_df,
# in row-major order.
indices = np.nonzero(test_df.to_numpy())

In [33]:
indices

(array([     1,      1,      2, ..., 119511, 119513, 119514], dtype=int64),
 array([1770, 2452, 1094, ..., 3532, 3535, 3535], dtype=int64))

In [34]:
# Long-format table of the held-out ratings: one row per non-zero cell, with
# the row position stored as user_id and the column position as whisky_id.
test_dataset = pd.DataFrame(
    dict(
        user_id=indices[0],
        whisky_id=indices[1],
        rating=test_df.to_numpy()[indices],
    )
)
test_dataset

Unnamed: 0,user_id,whisky_id,rating
0,1,1770,8.0
1,1,2452,3.0
2,2,1094,9.0
3,3,95,7.0
4,3,344,7.0
...,...,...,...
178745,119507,3532,6.0
178746,119509,3532,3.0
178747,119511,3532,10.0
178748,119513,3535,6.0


In [35]:
# Write the test ratings (including the default integer index column) as UTF-8 CSV.
test_dataset.to_csv(path_or_buf="dataset/test_rating.csv", encoding="UTF-8")

## Train rating set save

In [36]:
# Dense user x item array of the training ratings (0 where there is no
# training rating). The element-wise product is taken while both operands are
# still sparse, so a single dense array is allocated instead of two dense
# operands plus their dense product.
train_mat = train_interactions.multiply(train_weights).toarray()

In [37]:
train_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 9., 0., ..., 0., 0., 0.],
       [0., 7., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 8.]], dtype=float32)

In [38]:
# Wrap the dense training matrix in a DataFrame with default integer
# row/column labels, then display it.
train_df = pd.DataFrame(data=train_mat)
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3526,3527,3528,3529,3530,3531,3532,3533,3534,3535
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0
119513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# (row positions, column positions) of every non-zero cell of train_df,
# in row-major order.
indices = np.nonzero(train_df.to_numpy())

In [40]:
indices

(array([     1,      1,      1, ..., 119510, 119512, 119515], dtype=int64),
 array([   1,  547,  812, ..., 3532, 3532, 3535], dtype=int64))

In [41]:
# Long-format table of the training ratings: one row per non-zero cell, with
# the row position stored as user_id and the column position as whisky_id.
train_dataset = pd.DataFrame(
    dict(
        user_id=indices[0],
        whisky_id=indices[1],
        rating=train_df.to_numpy()[indices],
    )
)
train_dataset

Unnamed: 0,user_id,whisky_id,rating
0,1,1,9.0
1,1,547,8.0
2,1,812,9.0
3,1,1371,8.0
4,1,1995,6.0
...,...,...,...
692441,119508,3532,8.0
692442,119509,3532,3.0
692443,119510,3532,3.0
692444,119512,3532,8.0


In [42]:
# Write the training ratings (including the default integer index column) as UTF-8 CSV.
train_dataset.to_csv(path_or_buf="dataset/train_rating.csv", encoding="UTF-8")

In [43]:
# Serialize the trained model3 to rec_model.pkl in the working directory.
# NOTE: only unpickle this file from a trusted source.
with open("rec_model.pkl", mode="wb") as model_file:
    pickle.dump(model3, model_file)