# recommend_system in 1 month(params grid)

### 前置作業(安裝導入套件)

In [None]:
!pip3 install surprise
!pip3 install ml_metrics

from google.colab import drive
import pandas as pd
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp,SVD
from surprise import accuracy
from surprise.model_selection import cross_validate,GridSearchCV,train_test_split
from collections import defaultdict
import numpy as np
import ml_metrics as metrics

def get_top_n(predictions, n=12):
    """Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 3.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630191 sha256=e085b6386e9d3607696dce5f22bce730baf613ee733d1f49d8eace563bf85452
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
Collecting ml_metrics
  Downloading ml_metrics-0.1.4.tar.gz (5.0 kB)
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for ml-metrics: fil

### 產出訓練資料及測試資料

In [None]:
## 匯入資料集
drive.mount('/content/drive')
train = pd.read_parquet('/content/drive/.shortcut-targets-by-id/1lE1HJEkfMFjVM2VxfpYTkckxqcNJxv8N/data/train_one_month.parquet')
test = pd.read_parquet('/content/drive/.shortcut-targets-by-id/1lE1HJEkfMFjVM2VxfpYTkckxqcNJxv8N/data/val_one_month.parquet')

print('training data customers counts:' + str(len(train['customer_id'].unique())))
print('training data articles counts:' + str(len(train['article_id'].unique())))
print('============================================================================')
print('testing data customers counts:'+ str(len(test['customer_id'].unique())))
print('testing data articles counts:' + str(len(test['article_id'].unique())))
print('============================================================================')
print(train.info())
print(test.info())

Mounted at /content/drive
training data customers counts:245554
training data articles counts:29507
testing data customers counts:75481
testing data articles counts:18684
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1123728 entries, 30398232 to 31521959
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   t_dat             1123728 non-null  object 
 1   customer_id       1123728 non-null  object 
 2   article_id        1123728 non-null  int64  
 3   price             1123728 non-null  float64
 4   sales_channel_id  1123728 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 51.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 266364 entries, 31521960 to 31788323
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   t_dat             266364 non-null  object 
 1   customer_id       266364 non-n

In [None]:
## 產出評分資料
train_rating = train[['customer_id','article_id','price']].groupby(['customer_id','article_id']).count().reset_index()
train_rating.columns = ['customer_id','article_id','rating']

test_rating = test[['customer_id','article_id','price']].groupby(['customer_id','article_id']).count().reset_index()
test_rating.columns = ['customer_id','article_id','rating']

print(train_rating.info())
print(test_rating.info())
print(train_rating.describe())
print(train_rating.head(5))
print(test_rating.head(5))
print(test_rating.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995845 entries, 0 to 995844
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_id  995845 non-null  object
 1   article_id   995845 non-null  int64 
 2   rating       995845 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 22.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236622 entries, 0 to 236621
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_id  236622 non-null  object
 1   article_id   236622 non-null  int64 
 2   rating       236622 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 5.4+ MB
None
         article_id         rating
count  9.958450e+05  995845.000000
mean   8.020474e+08       1.128417
std    1.311309e+08       0.461171
min    1.087750e+08       1.000000
25%    7.519940e+08       1.000000
50%    8.502440e+08       1.000000
75% 

In [None]:
## 讀取評分資料為surprise可以訓練的格式
reader = Reader(rating_scale=(1, 500))
trainset = Dataset.load_from_df(train_rating[['customer_id','article_id','rating']], reader)
testset = Dataset.load_from_df(test_rating[['customer_id','article_id','rating']], reader)
testset2 = [testset.df.loc[i].to_list() for i in range(len(testset.df))]

In [None]:
testset2[:10]

[['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
  794321007,
  1],
 ['00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793',
  624486001,
  1],
 ['0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55',
  827487003,
  1],
 ['00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c',
  875272011,
  1],
 ['00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c',
  875272012,
  1],
 ['000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9',
  640021019,
  1],
 ['000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9',
  757926001,
  1],
 ['000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9',
  788575004,
  1],
 ['000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565',
  874110016,
  1],
 ['000749135ee9aa3a24c2316ea5ae4f495b39c1653c5612bb5b239f1b2a182a2a',
  800691007,
  2]]

In [None]:
## map@k testing需要產的資料
#
test.loc[:,'rating']=0
test_processed = Dataset.load_from_df(test[['customer_id','article_id','rating']], reader) 
NA, test2 = train_test_split(test_processed, test_size=1.0) 

# ======= 消費者的實際購買清單 =======
test['article_id'] = test['article_id'].astype('str')

test_uni = test.drop_duplicates(subset=['customer_id', 'article_id'], keep='first')

buy_n = test_uni[['customer_id','article_id']].groupby('customer_id')['article_id'].apply(list).to_dict()

cust_actual_list = []
for uid, user_ratings in buy_n.items():
    cust_pred_tuple = (uid, [iid for iid in user_ratings])
    cust_actual_list.append(cust_pred_tuple)

print(cust_actual_list[0:10])

[('000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318', ['794321007']), ('00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793', ['624486001']), ('0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55', ['827487003']), ('00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c', ['875272011', '875272012']), ('000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9', ['757926001', '788575004', '640021019']), ('000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565', ['874110016']), ('000749135ee9aa3a24c2316ea5ae4f495b39c1653c5612bb5b239f1b2a182a2a', ['800691007', '800691008']), ('00077dbd5c4a4991e092e63893ccf29294a9d5c46e85010e95f2fc10bf9437a4', ['903762001', '879189005', '158340001', '867966009', '915529003', '932798002', '915529005', '486639003', '918171001', '936622001', '907149001', '935892001', '448509014', '799365027']), ('0008daf19b2a7cd6fa00836c717aa0f143c83d88c35e0269a28cbff53664205a', ['786304008']), ('000fb6e772c

### 模型訓練及驗證

In [None]:
## 模型訓練與驗證(svd)
scores = pd.DataFrame()
for factors in [25,50,100,150,200]:
    for iterations in [20,30,40,50]:
        for regularization in [0.01]:

            algo = SVD(n_factors = factors,
                       n_epochs=iterations,
                       reg_all=regularization,
                       random_state=42)

            # 訓練模型
            algo.fit(trainset.build_full_trainset())
            # step3 - testing(train_test_split way)
            
            ##### rmse #####
            predictions = algo.test(testset2)
            rmse = accuracy.rmse(predictions)

            ##### map@k #####
            predictions_map = algo.test(test2)
            est = [i.est for i in predictions_map] 

            ## ======= 消費者的預測清單 =======
            top_n = get_top_n(predictions_map, n=12)

            cust_pred_list = []
            for uid, user_ratings in top_n.items():
                cust_pred_tuple = (uid, [str(iid) for (iid, _) in user_ratings])
                cust_pred_list.append(cust_pred_tuple)
            
            final_list = list(zip(cust_actual_list, cust_pred_list))
            
            #map@k計算 
            mapk_list = []
            for i in range(len(final_list)):
              map_k = metrics.mapk([final_list[i][0][1]],[final_list[i][1][1]],12)
              mapk_list.append(map_k)

            def Average(lst):
                return sum(lst) / len(lst)

            map_k = Average(mapk_list)

            newRow = {
                        # =====填寫參數名稱===============
                        'factors':factors, 
                        'iterations':iterations, 
                        'regularization':regularization, 
                        # ===============================
                        'rmse':rmse,
                        'map@k':map_k
                        }
            print(newRow)
            newDF = pd.DataFrame([newRow])
            scores = pd.concat([scores, newDF], axis=0 ,ignore_index=True)

scores

RMSE: 0.4548
{'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.4548226270871537, 'map@k': 0.0006401611229032329}
RMSE: 0.4567
{'factors': 25, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.45673652012935334, 'map@k': 0.0006485415276085405}
RMSE: 0.4587
{'factors': 25, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4586807454781601, 'map@k': 0.0006583140702867152}
RMSE: 0.4607
{'factors': 25, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.46073788171593627, 'map@k': 0.000666535696912311}
RMSE: 0.4554
{'factors': 50, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.45544503589409663, 'map@k': 0.000650658184805852}
RMSE: 0.4573
{'factors': 50, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.45732918412556306, 'map@k': 0.000641405850829415}
RMSE: 0.4594
{'factors': 50, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4594372598242015, 'map@k': 0.0006391421579680883}
RMSE: 0.4615
{'factors': 50, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.4

Unnamed: 0,factors,iterations,regularization,rmse,map@k
0,25,20,0.01,0.454823,0.00064
1,25,30,0.01,0.456737,0.000649
2,25,40,0.01,0.458681,0.000658
3,25,50,0.01,0.460738,0.000667
4,50,20,0.01,0.455445,0.000651
5,50,30,0.01,0.457329,0.000641
6,50,40,0.01,0.459437,0.000639
7,50,50,0.01,0.461475,0.00065
8,100,20,0.01,0.457074,0.000637
9,100,30,0.01,0.459041,0.000669


In [None]:
## 模型訓練與驗證(svdpp)
scores = pd.DataFrame()
for factors in [25,50,100,150,200]:
    for iterations in [20,30,40,50]:
        for regularization in [0.01]:

            algo = SVDpp(n_factors = factors,
                       n_epochs=iterations,
                       reg_all=regularization,
                       random_state=42)

            # 訓練模型
            algo.fit(trainset.build_full_trainset())
            # step3 - testing(train_test_split way)
            
            ##### rmse #####
            predictions = algo.test(testset2)
            rmse = accuracy.rmse(predictions)

            ##### map@k #####
            predictions_map = algo.test(test2)
            est = [i.est for i in predictions_map] 

            ## ======= 消費者的預測清單 =======
            top_n = get_top_n(predictions_map, n=12)

            cust_pred_list = []
            for uid, user_ratings in top_n.items():
                cust_pred_tuple = (uid, [str(iid) for (iid, _) in user_ratings])
                cust_pred_list.append(cust_pred_tuple)
            
            final_list = list(zip(cust_actual_list, cust_pred_list))
            
            #map@k計算 
            mapk_list = []
            for i in range(len(final_list)):
              map_k = metrics.mapk([final_list[i][0][1]],[final_list[i][1][1]],12)
              mapk_list.append(map_k)

            def Average(lst):
                return sum(lst) / len(lst)

            map_k = Average(mapk_list)

            newRow = {
                        # =====填寫參數名稱===============
                        'factors':factors, 
                        'iterations':iterations, 
                        'regularization':regularization, 
                        # ===============================
                        'rmse':rmse,
                        'map@k':map_k
                        }
            print(newRow)
            newDF = pd.DataFrame([newRow])
            scores = pd.concat([scores, newDF], axis=0 ,ignore_index=True)

scores

RMSE: 0.4573
{'factors': 25, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.45727025437074675, 'map@k': 0.0006413898324098377}
RMSE: 0.4604
{'factors': 25, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.4604083691233089, 'map@k': 0.0006387123363186727}
RMSE: 0.4631
{'factors': 25, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4631426941395734, 'map@k': 0.0006456418143512538}
RMSE: 0.4653
{'factors': 25, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.4652862745415788, 'map@k': 0.0006443218025082533}
RMSE: 0.4582
{'factors': 50, 'iterations': 20, 'regularization': 0.01, 'rmse': 0.45818523589484517, 'map@k': 0.0006560901894377496}
RMSE: 0.4613
{'factors': 50, 'iterations': 30, 'regularization': 0.01, 'rmse': 0.46128653870598385, 'map@k': 0.0006496227936888508}
RMSE: 0.4636
{'factors': 50, 'iterations': 40, 'regularization': 0.01, 'rmse': 0.4636215873854068, 'map@k': 0.000653250585650793}
RMSE: 0.4653
{'factors': 50, 'iterations': 50, 'regularization': 0.01, 'rmse': 0.