In [105]:
import pandas as pd
import numpy as np
import pyodbc;
import random
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_validation import StratifiedKFold
from scipy.sparse.linalg import svds

from IPython.core.display import display,HTML
display(HTML("<style>.container{width:90% !important;}</style>"))

conn = pyodbc.connect(r'DRIVER={SQL Server Native Client 11.0};'r'SERVER=WIN-5G5AUOCEJPK;'r'DATABASE=Gold585;'r'Trusted_Connection=yes;')

In [125]:
sql_query = "SELECT Distinct MaterialID \
       ,[Name] contentId \
       ,[GroupID] \
       ,[CollectionID] \
       ,[WeightNet] \
       ,[WearType] \
       ,[Proba] \
       ,[PriceSegment] \
       ,[Metal] \
       ,[MetalColor] \
  FROM [Gold585].[dbo].[Purchase]"
articles_df = pd.read_sql(sql_query,conn)

In [163]:
sql_query = "SELECT [PurchaseDateTime] \
      ,[PartnerID] personId\
      ,[labels] \
      ,MaterialID contentId\
      ,GroupID \
      ,[Quantity] \
      ,[SummaAfterDiscount] eventStrength\
  FROM [Gold585].[dbo].[Purchase] \
  Where PartnerID <> 0 \
  and PartnerID IN (Select Distinct PartnerID from [Gold585].[dbo].[Purchase] \
  where [PurchaseDateTime] >= '20180101' and [PurchaseDateTime] < '20180301') \
  order by [PurchaseDateTime]"
interactions_df = pd.read_sql(sql_query,conn)

In [164]:
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 195410
# users with at least 5 interactions: 63700


In [165]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 1045553
# of interactions from users with at least 5 interactions: 739167


In [166]:
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum().reset_index() \
#                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 671860


Unnamed: 0,personId,contentId,eventStrength
0,4,13872,559.0
1,4,31486,359.0
2,4,32918,249.0
3,4,58583,379.0
4,4,105627,489.0
5,4,107756,1079.0
6,4,111566,269.0
7,4,132909,199.0
8,4,133695,1742.0
9,4,203294,1790.0


In [167]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['personId'], 
                                   test_size=0.25,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 503895
# interactions on Test set: 167965


In [168]:
interactions_full_indexed_df = interactions_full_df.set_index('personId')
interactions_train_indexed_df = interactions_train_df.set_index('personId')
interactions_test_indexed_df = interactions_test_df.set_index('personId')

In [169]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [170]:
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    

In [171]:
item_popularity_df = interactions_full_df.groupby('contentId')['eventStrength'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,contentId,eventStrength
0,139016,8680231.63
1,133458,6867354.66
2,124884,5697558.05
3,124399,4872443.19
4,134607,4548259.81
5,124886,3863326.97
6,137405,3059506.6
7,77328,2934369.53
8,87662,2767637.65
9,136889,2679515.11


In [172]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)] \
                               .sort_values('eventStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['Name', 'GroupID', 'WearType', 'Metal', 'PriceSegment']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, articles_df)

In [173]:
%%time
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
63699 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 1.0, 'recall@10': 1.0}
Wall time: 2h 48min 54s


In [176]:
pop_detailed_results_df.head(10)

Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
186,751187,166,166,166,1.0,1.0
6191,5319346,166,166,166,1.0,1.0
201,3098073,150,150,150,1.0,1.0
159,578359,136,136,136,1.0,1.0
717,994451,127,127,127,1.0,1.0
3503,2377429,121,121,121,1.0,1.0
180,757761,115,115,115,1.0,1.0
1631,5283999,108,108,108,1.0,1.0
246,459110,104,104,104,1.0,1.0
1455,743027,98,98,98,1.0,1.0


In [177]:
interactions_test_df.head()

Unnamed: 0,personId,contentId,eventStrength
535372,4927097,299765,40982.0
254894,2344243,121719,2272.6
131039,1489322,57749,1855.0
359335,3343224,42322,339.0
244632,2274837,282908,2217.0


In [183]:
popularity_model.recommend_items(interactions_test_df['personId'])

Unnamed: 0,contentId,eventStrength
0,139016,8680231.63
1,133458,6867354.66
2,124884,5697558.05
3,124399,4872443.19
4,134607,4548259.81
5,124886,3863326.97
6,137405,3059506.6
7,77328,2934369.53
8,87662,2767637.65
9,136889,2679515.11


In [182]:
popularity_model.recommend_items(interactions_test_df['personId'])

Unnamed: 0,contentId,eventStrength
0,139016,8680231.63
1,133458,6867354.66
2,124884,5697558.05
3,124399,4872443.19
4,134607,4548259.81
5,124886,3863326.97
6,137405,3059506.6
7,77328,2934369.53
8,87662,2767637.65
9,136889,2679515.11


In [157]:
users_items_pivot_matrix_df = data.pivot(index='PartnerID', 
                                                          columns='GroupID', 
                                                          values='Quantity').fillna(0)

users_items_pivot_matrix_df.head(10)

KeyError: 'PartnerID'

In [44]:
users_items_pivot_matrix_df.shape

(2728975, 311)

In [45]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix[:10]

  """Entry point for launching an IPython kernel.


array([[   72.,    40., 13480., ...,     0.,     0.,   343.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       ...,
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.],
       [    0.,     0.,     0., ...,     0.,     0.,     0.]])

In [46]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[0, 2, 4, 6, 9, 10, 12, 14, 15, 16]

In [47]:
users_items_pivot_matrix_df.shape

(2728975, 311)

In [48]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 50
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [49]:
U.shape

(2728975, 50)

In [50]:
Vt.shape

(50, 311)

In [51]:
sigma = np.diag(sigma)
sigma.shape

(50, 50)

In [52]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 7.20155676e+01,  4.00441014e+01,  1.34799950e+04, ...,
         1.00145453e-04,  3.24107038e-04,  3.42953227e+02],
       [ 4.56223406e-04, -1.12605168e-03, -5.67852600e-04, ...,
        -1.83534053e-05, -4.89479535e-08,  1.92628263e-03],
       [-1.12312380e-03,  7.77010698e-04,  1.90864934e-03, ...,
        -2.31447233e-05, -9.07505034e-06,  1.61789802e-03],
       ...,
       [ 1.77874931e-04,  4.03554720e-03, -2.91474090e-04, ...,
         1.07629201e-05, -7.03718503e-06,  9.73191802e-05],
       [ 5.62497704e-04, -1.89372253e-04, -4.91743119e-04, ...,
        -7.18770134e-07,  2.26744812e-06,  7.90836741e-04],
       [ 1.00231771e-03,  3.87567124e-03,  1.00145303e+00, ...,
        -2.83259978e-06, -1.90354917e-05,  7.71274316e-04]])

In [53]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,0,2,4,6,9,10,12,14,15,16,...,6582910,6582911,6582913,6582914,6582917,6582920,6582925,6582926,6582930,6582931
GroupID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,72.015568,0.000456,-0.001123,-4.2e-05,0.000156,-0.000383,0.000902,0.000498,0.001741,0.000152,...,0.000207,0.0006568402,5.3e-05,0.00077,0.000103,-4.2e-05,0.000871,0.000178,0.000562,0.001002
214,40.044101,-0.001126,0.000777,-0.000771,-0.001102,0.00075,0.000813,-0.001205,-0.003167,-0.000375,...,0.000803,0.0002262165,-7e-06,-0.000779,-0.002934,-0.000771,-0.000429,0.004036,-0.000189,0.003876
215,13479.994957,-0.000568,0.001909,-0.00044,-0.000331,0.000211,-0.001661,-0.001185,-0.003098,-0.000189,...,-0.000374,-0.0003285173,-0.000168,-0.000851,-0.000499,-0.00044,-0.0024,-0.000291,-0.000492,1.001453
216,11634.99852,-0.00027,0.0024,-0.000151,-8.3e-05,-0.000398,-0.000564,-0.000369,-0.000834,-9e-05,...,-0.000118,-0.0003672343,-0.00011,-0.000254,-0.00015,-0.000151,-0.00191,0.99984,-0.000131,0.003659
217,6880.000492,-0.000614,0.001802,-0.000331,-0.000227,0.000374,-0.001318,-0.000739,-0.001905,-0.000205,...,-0.000262,5.981815e-05,-0.000193,-0.000656,-0.000437,-0.000331,-0.002179,-0.00018,-0.000284,0.001016
218,4705.982541,-0.001829,0.001889,-0.001452,-0.001474,0.001467,-0.006014,-0.004622,0.985863,-0.00061,...,-0.001271,-0.0004508898,-0.000541,-0.003469,-0.002594,-0.001452,-0.015212,-0.000763,-0.002126,-0.002991
219,420.031907,-0.001502,0.001438,0.000856,0.002559,-0.000824,0.000452,0.004459,0.012543,-0.000501,...,0.000239,-0.000181613,-6.1e-05,-0.000139,0.004513,0.000856,0.00149,0.000626,0.001334,0.001774
221,3.003972,-0.000277,0.00067,-0.000121,-9.6e-05,0.000196,0.000154,-0.000182,-0.000229,-9.2e-05,...,0.00012,5.556629e-08,-3e-05,0.000319,-0.00026,-0.000121,7.2e-05,8.5e-05,-5e-06,0.00025
222,34788.999598,-0.000357,0.001452,-0.000252,-0.000176,-7.6e-05,0.999138,-0.000542,-0.00132,-0.000119,...,0.999806,-0.000100368,-0.000116,-0.000447,-0.000311,-0.000252,-0.000848,-0.000118,-0.000214,0.000604
223,27097.999049,-0.000449,0.001866,-0.000291,-0.000238,5.2e-05,0.99895,-0.000695,-0.001836,-0.00015,...,-0.000224,-3.188106e-05,-9.3e-05,1.999408,-0.000445,-0.000291,-0.00166,-0.000127,-0.000288,1.000634


In [54]:
len(cf_preds_df.columns)

2728975

In [55]:
cf_preds_df.shape

(311, 2728975)

In [56]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df)


In [91]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...


KeyError: 'contentId'