### Практическая работа к уроку №5

#### Сделать grid search текущей модели: ориентируемся на метрику precision@5 и считаем её на тесте нашей функцией

In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# Put the parent directory on the import path (presumably where the local
# `metrics` and `utils` modules imported below live).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Функции из 1-ого вебинара
from metrics import precision_at_k as custom_precision, recall_at_k
from utils import prefilter_items

### Prepare data

In [2]:
# Raw purchase transactions plus item / household side information.
data = pd.read_csv('../data/retail_train.csv')

item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# column processing: lower-case the names and align the id columns with `data`
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

# First week number that belongs to the test part.
# NOTE(review): with integer week numbers the test window covers
# max-3 .. max inclusive, i.e. test_size_weeks + 1 distinct weeks.
split_week = data['week_no'].max() - test_size_weeks

# Take explicit copies: later steps add columns to these frames, and assigning
# to a plain boolean-mask slice of `data` raises SettingWithCopyWarning.
data_train = data[data['week_no'] < split_week].copy()
data_test = data[data['week_no'] >= split_week].copy()

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Ground truth for evaluation: one row per user holding the array of distinct
# items that user bought in the test period.
result = (
    data_test.groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [4]:
# Preview the first two rows of the item (product) feature table.
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [5]:
# Preview the first two rows of the user (household) feature table.
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [6]:
# Print the distinct values of every user feature column except the id.
# `columns=` is used instead of a positional axis argument: positional `axis`
# in DataFrame.drop was deprecated and is rejected by pandas >= 2.0.
for column in user_features.drop(columns='user_id').columns:
    print('#' * 30)
    print(column)
    print(user_features[column].unique())

##############################
age_desc
['65+' '45-54' '25-34' '35-44' '19-24' '55-64']
##############################
marital_status_code
['A' 'U' 'B']
##############################
income_desc
['35-49K' '50-74K' '25-34K' '75-99K' 'Under 15K' '100-124K' '15-24K'
 '125-149K' '150-174K' '250K+' '175-199K' '200-249K']
##############################
homeowner_desc
['Homeowner' 'Unknown' 'Renter' 'Probable Renter' 'Probable Owner']
##############################
hh_comp_desc
['2 Adults No Kids' '2 Adults Kids' 'Single Female' 'Unknown'
 'Single Male' '1 Adult Kids']
##############################
household_size_desc
['2' '3' '4' '1' '5+']
##############################
kid_category_desc
['None/Unknown' '1' '2' '3+']


In [43]:
class PrepareData:
    """Stateless helpers that turn the raw purchase log into model inputs."""

    @classmethod
    def prepare_train_data(cls, data_train, item_features):
        """Filter the training log and pivot it into a user-item matrix.

        Returns the pair ``(user_item_matrix, filtered_training_log)``.
        """
        filtered = cls.filter_data(data_train, item_features)
        return cls.get_user_item_matrix(filtered), filtered

    @classmethod
    def prepare_test_data(cls, data_test, data_train):
        """Pivot the test rows whose items also occur in ``data_train``."""
        known_items = data_train['item_id'].unique()
        seen_in_train = data_test['item_id'].isin(known_items)
        return cls.get_user_item_matrix(data_test[seen_in_train])

    @staticmethod
    def filter_data(data, item_features):
        """Apply ``prefilter_items`` and print how the number of items changed."""
        n_before = data['item_id'].nunique()
        filtered = prefilter_items(data, take_n_popular=5000,
                                   item_features=item_features)
        n_after = filtered['item_id'].nunique()
        print('Decreased # items from {} to {}'.format(n_before, n_after))

        return filtered

    @staticmethod
    def get_user_item_matrix(data):
        """Return a dense user x item frame counting rows per (user, item) pair."""
        counts = data.pivot_table(index='user_id',
                                  columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0)
        # float is the matrix dtype required by implicit
        return counts.astype(float)

    @staticmethod
    def get_sparse_matrix(matrix):
        """Convert ``matrix`` to a SciPy CSR sparse matrix."""
        return csr_matrix(matrix)

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Build the lookup dictionaries between raw ids and matrix positions.

        Returns ``(id_to_itemid, id_to_userid, itemid_to_id, userid_to_id)``,
        where "id" is the positional index along the matrix axis.
        """
        def two_way(labels):
            # position -> label and label -> position for one matrix axis
            positions = np.arange(len(labels))
            return dict(zip(positions, labels)), dict(zip(labels, positions))

        id_to_itemid, itemid_to_id = two_way(user_item_matrix.columns.values)
        id_to_userid, userid_to_id = two_way(user_item_matrix.index.values)

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def create_dummies_features(data):
        """One-hot encode every column of ``data``."""
        return pd.get_dummies(data, columns=list(data.columns))

    @classmethod
    def _encode_axis_features(cls, axis_labels, features, key):
        """Left-join ``features`` onto the axis labels and one-hot encode them."""
        joined = (pd.DataFrame(axis_labels)
                    .merge(features, on=key, how='left')
                    .set_index(key))
        # dummy-encoded form is what the model (lightfm) is fed with
        return cls.create_dummies_features(joined)

    @classmethod
    def prepare_user_features(cls, user_item_matrix, user_features):
        """Return one-hot user features, one row per row of the matrix."""
        return cls._encode_axis_features(user_item_matrix.index,
                                         user_features, 'user_id')

    @classmethod
    def prepare_item_features(cls, user_item_matrix, item_features):
        """Return one-hot item features, one row per column of the matrix."""
        return cls._encode_axis_features(user_item_matrix.columns,
                                         item_features, 'item_id')

In [44]:
# Helper object used by the cells below.
# NOTE(review): every PrepareData method is a classmethod/staticmethod, so the
# same calls would also work directly on the class.
Data = PrepareData()

In [45]:
# Filter the training log and pivot it into a dense user x item frame.
user_item_matrix, data_train_filtered = Data.prepare_train_data(data_train, item_features)

# CSR version of the same matrix.
sparse_user_item = Data.get_sparse_matrix(user_item_matrix)

user_item_matrix.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 86865 to 5001


item_id,117847,818981,819255,819308,819400,819487,819590,819594,819840,819845,...,15926775,15926844,15926886,15972074,15972298,15972565,15972790,16100266,16729299,16729415
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# test_user_item_matrix = Data.prepare_test_data(data_test, data_train)

In [21]:
# Lookups between raw user/item ids and their positions in user_item_matrix.
id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = Data.prepare_dicts(user_item_matrix)

In [22]:
# One-hot user features, one row per row of user_item_matrix.
user_feat = Data.prepare_user_features(user_item_matrix, user_features)
user_feat.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# One-hot item features, one row per column of user_item_matrix.
item_feat = Data.prepare_item_features(user_item_matrix, item_features)
item_feat.head(2)

Unnamed: 0_level_0,manufacturer_2.0,manufacturer_5.0,manufacturer_16.0,manufacturer_20.0,manufacturer_26.0,manufacturer_33.0,manufacturer_35.0,manufacturer_36.0,manufacturer_42.0,manufacturer_43.0,...,curr_size_of_product_L 16 OZ,curr_size_of_product_L 7.75 OZ,curr_size_of_product_L 13.25 OZ,curr_size_of_product_LB,curr_size_of_product_N 12 OZ,curr_size_of_product_N 40 OZ,curr_size_of_product_PINT,curr_size_of_product_PK,curr_size_of_product_PT,curr_size_of_product_QT
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
117847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
818981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
class Model:
    """Wrapper around a LightFM model: fits it and reports precision@k.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded unchanged to ``LightFM(**config)``.
    userid_to_id, itemid_to_id : dict
        Map raw user / item ids to the integer positions passed to the model.
    verbose : bool
        Print progress messages from this wrapper.
    """

    def __init__(self, config, userid_to_id, itemid_to_id, verbose=False):
        self.config = config
        self.userid_to_id = userid_to_id
        self.itemid_to_id = itemid_to_id
        self.verbose = verbose

        self.model = self.get_model()

    def get_model(self):
        """Create the underlying LightFM instance from ``self.config``."""
        if self.verbose:
            print('Creating model...')
        return LightFM(**self.config)

    @staticmethod
    def _as_csr(features):
        """Convert a feature DataFrame to the CSR matrix handed to LightFM."""
        return csr_matrix(features.values)

    def fit(self, user_item_matrix, sparse_user_item,
                  user_feat, item_feat, weight=False,
                  epochs=20, num_threads=20):
        """Fit the model on binarised interactions.

        Parameters
        ----------
        user_item_matrix : DataFrame
            Dense user x item matrix; used as per-interaction sample weights
            when ``weight`` is true.
        sparse_user_item : sparse matrix
            Sparse form of the same matrix; binarised before fitting.
        user_feat, item_feat : DataFrame
            Feature frames converted to CSR and passed to LightFM.
        weight : bool
            Whether to pass sample weights.
        epochs, num_threads : int
            Forwarded to ``LightFM.fit``; defaults keep the previous
            hard-coded values.
        """
        if self.verbose:
            print('Start fiting...')

        sample_weight = coo_matrix(user_item_matrix) if weight else None

        self.model.fit((sparse_user_item > 0) * 1,  # user-item matrix of 0s and 1s
                       sample_weight=sample_weight,
                       user_features=self._as_csr(user_feat),
                       item_features=self._as_csr(item_feat),
                       epochs=epochs,
                       num_threads=num_threads,
                       # The epoch progress bar is always shown, independently
                       # of self.verbose (unchanged from the original).
                       verbose=True)

    def train_precision(self, sparse_user_item, user_feat, item_feat, k=5):
        """Return mean precision@k on the training interactions (LightFM metric)."""
        if self.verbose:
            print('Compute train precision@{}'.format(k))
        precision = precision_at_k(self.model, sparse_user_item,
                                   user_features=self._as_csr(user_feat),
                                   item_features=self._as_csr(item_feat),
                                   k=k).mean()
        return precision

    def test_precision(self, result, data_train_filtered, user_feat, item_feat, k=5):
        """Return mean precision@k against ``result`` using the custom metric.

        Scores are predicted only for the (user, item) pairs present in
        ``data_train_filtered``; each user's items are ranked by score and
        compared with ``result['actual']``. ``data_train_filtered`` is not
        modified. Returns NaN when no user is shared between the two frames.
        """
        if self.verbose:
            print('Compute test precision@{}'.format(k))
        users_ids_row, items_ids_row = self.prepare_id_for_predict(data_train_filtered)

        scores = self.get_predictions(users_ids_row, items_ids_row,
                                      user_feat, item_feat)
        predict_result = self._rank_items(data_train_filtered, scores)

        df_result_for_metrics = result.merge(predict_result, on='user_id', how='inner')
        if df_result_for_metrics.empty:
            # Nothing to average; row-wise apply on an empty frame would not
            # produce a scalar.
            return float('nan')

        precision = df_result_for_metrics.apply(
            lambda row: custom_precision(row['item_id'], row['actual'], k=k),
            axis=1,
        ).mean()

        return precision

    @staticmethod
    def _rank_items(data_train_filtered, scores, excluded_item_id=999999):
        """Return one row per user with item ids ordered by descending score.

        Works on a projection of the input, so the caller's frame does not
        gain a ``score`` column.
        """
        scored = data_train_filtered[['user_id', 'item_id']].assign(score=scores)
        # 999999 is presumably the placeholder id given to filtered-out items
        # by prefilter_items - TODO confirm.
        scored = scored[scored['item_id'] != excluded_item_id]

        return (scored.drop_duplicates()
                      .sort_values(by=['user_id', 'score'], ascending=False)
                      .groupby('user_id')['item_id']
                      .unique()
                      .reset_index())

    def prepare_id_for_predict(self, data_train_filtered):
        """Map raw user/item ids to model ids, keeping the row order of the frame.

        Raises ``KeyError`` for an id that is missing from the mapping.
        """
        users_ids_row = data_train_filtered['user_id']\
                            .apply(lambda x: self.userid_to_id[x]).values.astype(int)
        items_ids_row = data_train_filtered['item_id']\
                            .apply(lambda x: self.itemid_to_id[x]).values.astype(int)

        return users_ids_row, items_ids_row

    def get_predictions(self, users_ids_row, items_ids_row,
                        user_feat, item_feat, num_threads=10):
        """Return the model score for every (user id, item id) pair."""
        return self.model.predict(user_ids=users_ids_row,
                                  item_ids=items_ids_row,
                                  user_features=self._as_csr(user_feat),
                                  item_features=self._as_csr(item_feat),
                                  num_threads=num_threads)

In [121]:
# Baseline LightFM constructor arguments; the grid search copies this dict and
# overrides individual entries.
# NOTE(review): per the LightFM docs `k`/`n` apply to k-OS WARP and
# `max_sampled` to WARP fitting, so they look unused with loss='bpr' - confirm.
template_config = dict(
    no_components=40,
    loss='bpr',
    learning_rate=0.01,
    item_alpha=0.4,
    user_alpha=0.1,
    random_state=42,
    k=5,
    n=15,
    max_sampled=100,
)

In [122]:
def train_lightfm_model(config, weight, verbose):
    """Fit one LightFM configuration and return ``(train_precision, test_precision)``.

    Reads the prepared notebook globals (id mappings, matrices, feature frames,
    ``result`` and ``data_train_filtered``).

    config  -- keyword arguments for the LightFM constructor
    weight  -- whether to pass sample weights when fitting
    verbose -- whether the Model wrapper prints its progress messages
    """
    model = Model(config, userid_to_id, itemid_to_id, verbose)
    model.fit(user_item_matrix, sparse_user_item,
              user_feat, item_feat, weight=weight)

    # Train metric first, then test metric, as before.
    on_train = model.train_precision(sparse_user_item, user_feat, item_feat)
    on_test = model.test_precision(result, data_train_filtered, user_feat, item_feat)

    return on_train, on_test

In [123]:
# Baseline run with the template configuration and sample weights enabled.
train_lightfm_model(template_config, weight=True, verbose=True)

Creating model...
Start fiting...


Epoch: 100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


Compute train precision@5
Compute test precision@5


(0.43852624, 0.14229885057471092)

In [124]:
import itertools

# NOTE: the logistic loss is left out of the grid. The model failed with:
#     Not all estimated parameters are finite, your model may have diverged.
#     Try decreasing the learning rate or normalising feature values and sample weights

weight_list = [True, False]
loss_list = ["bpr", "warp"]
no_components_list = [32, 64]

# Every (weight, loss, no_components) combination, in that positional order.
grid_search_params = list(
    itertools.product(weight_list, loss_list, no_components_list)
)
grid_search_params

[(True, 'bpr', 32),
 (True, 'bpr', 64),
 (True, 'warp', 32),
 (True, 'warp', 64),
 (False, 'bpr', 32),
 (False, 'bpr', 64),
 (False, 'warp', 32),
 (False, 'warp', 64)]

In [125]:
# Names of the LightFM arguments that follow the leading `weight` flag in
# each grid tuple.
names_params = ['loss', 'no_components']
dct_results = {}

for params in grid_search_params:
    # Run label, e.g. "True_bpr_32"; also the key in dct_results.
    iteration_name = "_".join(str(value) for value in params)
    print(iteration_name)

    weight, *model_values = params
    dict_params = dict(zip(names_params, model_values))

    # Start from the template and override only the searched arguments.
    config = {**template_config, **dict_params}

    train_precision, test_precision = train_lightfm_model(config, weight=weight, verbose=False)

    dct_results[iteration_name] = [train_precision, test_precision]

True_bpr_32


Epoch: 100%|██████████| 20/20 [00:26<00:00,  1.35s/it]


True_bpr_64


Epoch: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]


True_warp_32


Epoch: 100%|██████████| 20/20 [00:29<00:00,  1.45s/it]


True_warp_64


Epoch: 100%|██████████| 20/20 [00:51<00:00,  2.56s/it]
Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

False_bpr_32


Epoch: 100%|██████████| 20/20 [00:31<00:00,  1.59s/it]
Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

False_bpr_64


Epoch: 100%|██████████| 20/20 [00:55<00:00,  2.75s/it]
Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

False_warp_32


Epoch: 100%|██████████| 20/20 [00:30<00:00,  1.53s/it]
Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

False_warp_64


Epoch: 100%|██████████| 20/20 [00:51<00:00,  2.58s/it]


In [142]:
# Collect the grid-search scores into a table, best test precision first.
result_df = (
    pd.DataFrame.from_dict(dct_results, orient='index',
                           columns=['train precision@5', 'test precision@5'])
    .sort_values('test precision@5', ascending=False)
    .rename_axis('weight_loss_no_components')
)
result_df

Unnamed: 0_level_0,train precision@5,test precision@5
weight_loss_no_components,Unnamed: 1_level_1,Unnamed: 2_level_1
True_bpr_32,0.438526,0.144466
True_bpr_64,0.438526,0.143777
True_warp_64,0.294113,0.143777
False_warp_64,0.139447,0.140722
True_warp_32,0.294113,0.135402
False_bpr_64,0.111414,0.11225
False_warp_32,0.347217,0.103481
False_bpr_32,0.113817,0.077373
