In [2]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

import pickle
import joblib

In [2]:
df_interactions = pd.read_csv("cars/interactions_personalize.csv")

In [3]:
df_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056960 entries, 0 to 1056959
Data columns (total 3 columns):
USER_ID      1056960 non-null int64
ITEM_ID      1056960 non-null int64
TIMESTAMP    1056960 non-null int64
dtypes: int64(3)
memory usage: 24.2 MB


## Remoción de outliers (usuarios con menos de 5 interacciones)

In [4]:
items_df = pd.read_csv("cars/car-items.csv")
items_df

Unnamed: 0,ITEM_ID,MARCA,MODELO,PUERTAS,TIPO,PRECIO,ANO,KILOMETRAJE,MARCA_MODELO
0,0,Chevrolet,Aveo,4,Sedan,20000000,2010,251215,Chevrolet Aveo
1,1,Chevrolet,Aveo,4,Sedan,20500000,2011,248475,Chevrolet Aveo
2,2,Chevrolet,Aveo,4,Sedan,21000000,2012,191022,Chevrolet Aveo
3,3,Chevrolet,Aveo,4,Sedan,21500000,2013,147422,Chevrolet Aveo
4,4,Chevrolet,Aveo,4,Sedan,22000000,2014,71111,Chevrolet Aveo
...,...,...,...,...,...,...,...,...,...
435,435,Nissan,Pathfinder,4,Camioneta,128000000,2016,66704,Nissan Pathfinder
436,436,Nissan,Pathfinder,4,Camioneta,128500000,2017,72730,Nissan Pathfinder
437,437,Nissan,Pathfinder,4,Camioneta,129000000,2018,46542,Nissan Pathfinder
438,438,Nissan,Pathfinder,4,Camioneta,129500000,2019,14990,Nissan Pathfinder


In [5]:
users_interactions_count = df_interactions.groupby(["USER_ID","ITEM_ID"]).size().groupby("USER_ID").size()
users_interactions_count

USER_ID
0        11
1        28
2        30
3        35
4        10
         ..
19995    19
19996    13
19997    12
19998    28
19999    37
Length: 20000, dtype: int64

In [6]:
users_filtered = users_interactions_count[users_interactions_count >= 5].reset_index()[["USER_ID"]]
users_filtered

Unnamed: 0,USER_ID
0,0
1,1
2,2
3,3
4,4
...,...
19490,19995
19491,19996
19492,19997
19493,19998


In [7]:
print('# of interactions: %d' % len(df_interactions))
interactions_from_selected_users_df = df_interactions.merge(users_filtered, 
               how = 'right',
               left_on = 'USER_ID',
               right_on = 'USER_ID')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 1056960
# of interactions from users with at least 5 interactions: 1055147


In [8]:
interactions_from_selected_users_df

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
0,0,62,1545008405
1,0,64,1545008450
2,0,61,1545008505
3,0,55,1545008530
4,0,55,1545008565
...,...,...,...
1055142,19999,345,1557172825
1055143,19999,60,1557172855
1055144,19999,387,1557172870
1055145,19999,49,1557172895


## Split dataset

In [9]:
interactions_train_df, interactions_test_df = train_test_split(interactions_from_selected_users_df,
                                   stratify=interactions_from_selected_users_df['USER_ID'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 844117
# interactions on Test set: 211030


INdexing

In [10]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_from_selected_users_df.set_index('USER_ID')
interactions_train_indexed_df = interactions_train_df.set_index('USER_ID')
interactions_test_indexed_df = interactions_test_df.set_index('USER_ID')

In [12]:
def get_items_interacted(user_id, interactions_df):
    interacted_items = interactions_df.loc[user_id]['ITEM_ID']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [13]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, user_id, sample_size, seed=42):
        interacted_items = get_items_interacted(user_id, interactions_full_indexed_df)
        all_items = set(items_df['ITEM_ID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, user_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[user_id]
        if type(interacted_values_testset['ITEM_ID']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['ITEM_ID'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['ITEM_ID'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(user_id, items_to_ignore=get_items_interacted(user_id, interactions_train_indexed_df), topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(user_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['ITEM_ID'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['ITEM_ID'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, user_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, user_id)  
            person_metrics['_user_id'] = user_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()   

### Popularity model

In [14]:
#Computes the most popular items
item_popularity_df = interactions_from_selected_users_df.groupby('ITEM_ID').size().sort_values(ascending=False).reset_index()
item_popularity_df.columns = ["ITEM_ID","POPULARITY_COUNT"]
item_popularity_df.head()
# .sort_values(ascending=False).reset_index()


Unnamed: 0,ITEM_ID,POPULARITY_COUNT
0,137,2646
1,197,2616
2,334,2591
3,189,2583
4,99,2580


In [15]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df["ITEM_ID"].isin(items_to_ignore)].sort_values("POPULARITY_COUNT",ascending=False).reset_index().head(topn)
        
#         recommendations_df = self.popularity_df[~self.popularity_df['ITEM_ID'].isin(items_to_ignore)] \
#                                .size().sort_values(ascending=False).reset_index() \
#                                .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'ITEM_ID', 
                                                          right_on = 'ITEM_ID')[['MARCA', 'MODELO', 'PUERTAS', 'TIPO', 'PRECIO','ANO','KILOMETRAJE']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, items_df)

In [16]:
items_to_ignore = get_items_interacted(user_id,interactions_train_indexed_df)
# popularity_model.recommend_items(user_id,get_items_interacted(user_id,interactions_train_indexed_df))

In [17]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...


KeyboardInterrupt: 

In [18]:
user_id = 0
popularity_model.recommend_items(user_id,get_items_interacted(user_id,interactions_train_indexed_df))

Unnamed: 0,index,ITEM_ID,POPULARITY_COUNT
0,0,137,2646
1,1,197,2616
2,2,334,2591
3,3,189,2583
4,4,99,2580
5,5,340,2572
6,6,141,2569
7,7,142,2569
8,8,139,2564
9,9,232,2562


In [19]:
get_items_interacted(user_id,interactions_train_indexed_df)

{55, 56, 59, 61, 62, 64, 65, 261, 361, 434}

## Matrix factorization

In [55]:
df_interactions = pd.read_csv("cars/interactions_personalize.csv",usecols=["USER_ID","ITEM_ID"])

In [56]:
df_interactions["CLICK"] = np.ones(df_interactions.values.shape[0])
df_interactions

Unnamed: 0,USER_ID,ITEM_ID,CLICK
0,0,62,1.0
1,0,64,1.0
2,0,61,1.0
3,0,55,1.0
4,0,55,1.0
...,...,...,...
1056955,19999,345,1.0
1056956,19999,60,1.0
1056957,19999,387,1.0
1056958,19999,49,1.0


Sampling

In [57]:
interactions_sample = df_interactions.sample(n=50000)
interactions_sample

Unnamed: 0,USER_ID,ITEM_ID,CLICK
444589,8408,362,1.0
953611,18063,37,1.0
633421,11974,225,1.0
938548,17784,303,1.0
558617,10572,57,1.0
...,...,...,...
209192,3956,400,1.0
518670,9809,320,1.0
588892,11148,209,1.0
247167,4650,11,1.0


Removing outliers

In [71]:
users_interactions_count = interactions_sample.groupby(["USER_ID","ITEM_ID"]).size().groupby("USER_ID").size()
users_interactions_count

USER_ID
0        2
1        4
2        4
3        2
5        2
        ..
19995    3
19996    2
19997    2
19998    5
19999    3
Length: 16507, dtype: int64

In [73]:
users_filtered = users_interactions_count[users_interactions_count >= 5].reset_index()[["USER_ID"]]
users_filtered.head(5)

Unnamed: 0,USER_ID
0,29
1,31
2,50
3,54
4,79


In [74]:
print('# of interactions: %d' % len(interactions_sample))
interactions_from_selected_users_df = interactions_sample.merge(users_filtered, 
               how = 'right',
               left_on = 'USER_ID',
               right_on = 'USER_ID')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 50000
# of interactions from users with at least 5 interactions: 16054


In [61]:
interactions_from_selected_users_df

Unnamed: 0,USER_ID,ITEM_ID,CLICK
0,2332,424,1.0
1,2332,425,1.0
2,2332,419,1.0
3,2332,20,1.0
4,2332,418,1.0
...,...,...,...
16049,7835,155,1.0
16050,7835,154,1.0
16051,7835,158,1.0
16052,7835,43,1.0


In [173]:
interactions_full_df = interactions_from_selected_users_df.groupby(['USER_ID', 'ITEM_ID'])['CLICK'].sum().reset_index()
interactions_full_df
# print('# of unique user/item interactions: %d' % len(interactions_full_df))
# interactions_full_df.head(10)

Unnamed: 0,USER_ID,ITEM_ID,CLICK
0,29,0,2.0
1,29,6,1.0
2,29,7,1.0
3,29,8,2.0
4,29,9,1.0
...,...,...,...
14578,19998,282,1.0
14579,19998,401,1.0
14580,19998,403,1.0
14581,19998,404,1.0


## Split dataset

In [82]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['USER_ID'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 11666
# interactions on Test set: 2917


INdexing

In [64]:
# #Indexing by personId to speed up the searches during evaluation
# interactions_full_indexed_df = interactions_from_selected_users_df.set_index('USER_ID')
# interactions_train_indexed_df = interactions_train_df.set_index('USER_ID')
# interactions_test_indexed_df = interactions_test_df.set_index('USER_ID')

In [83]:
def get_items_interacted(user_id, interactions_df):
    interacted_items = interactions_df.loc[user_id]['ITEM_ID']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

### Create user-item matrix

In [85]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(index='USER_ID', 
                                                          columns='ITEM_ID', 
                                                          values='CLICK').fillna(0)

users_items_pivot_matrix_df

ITEM_ID,0,1,2,3,4,5,6,7,8,9,...,430,431,432,433,434,435,436,437,438,439
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19986,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix[:10]

  """Entry point for launching an IPython kernel.


array([[2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [99]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[29, 31, 50, 54, 79, 85, 87, 91, 104, 121]

In [89]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<2544x440 sparse matrix of type '<class 'numpy.float64'>'
	with 11666 stored elements in Compressed Sparse Row format>

In [90]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [91]:
U.shape

(2544, 15)

In [92]:
Vt.shape

(15, 440)

In [93]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [94]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 6.65584175e-01,  3.69919494e-01,  2.79507701e-01, ...,
         6.22306618e-15,  4.43730671e-15,  3.48953409e-15],
       [ 2.59092722e-15,  1.23167815e-15,  1.22001016e-15, ...,
         2.81993966e-02,  2.16586699e-02,  1.95458420e-02],
       [ 4.60772470e-15,  2.17571128e-15,  2.18208383e-15, ...,
         4.65066677e-02,  3.44968643e-02,  2.87041899e-02],
       ...,
       [ 4.76821935e-15,  2.22058526e-15,  2.24941685e-15, ...,
         7.56057668e-03,  9.73982005e-03,  8.04500945e-03],
       [ 1.67451279e-15,  7.87836464e-16,  7.73740135e-16, ...,
        -2.60024436e-02, -7.89644342e-03, -5.16529339e-03],
       [ 6.82275207e-02,  4.43718992e-02,  2.94416139e-02, ...,
        -1.93222269e-16, -1.47663122e-16, -1.00585250e-16]])

In [95]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [97]:


#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df

Unnamed: 0_level_0,29,31,50,54,79,85,87,91,104,121,...,19956,19958,19969,19971,19978,19986,19989,19992,19994,19998
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.482174,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,...,0.168934,0.168934,0.168934,0.168934,0.196687,0.351192,0.168934,0.168934,0.168934,0.201043
1,0.343027,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,...,0.168934,0.168934,0.168934,0.168934,0.188799,0.271165,0.168934,0.168934,0.168934,0.189816
2,0.300477,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,...,0.168934,0.168934,0.168934,0.168934,0.180496,0.245152,0.168934,0.168934,0.168934,0.182790
3,0.370610,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,...,0.168934,0.168934,0.168934,0.168934,0.191767,0.286825,0.168934,0.168934,0.168934,0.193192
4,0.379518,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,0.168934,...,0.168934,0.168934,0.168934,0.168934,0.188763,0.291819,0.168934,0.168934,0.168934,0.191760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,0.168934,0.184086,0.193883,0.182396,0.204741,0.179305,0.162780,0.181791,0.182575,0.168934,...,0.168934,0.185770,0.168934,0.168934,0.168934,0.168934,0.181418,0.174357,0.157929,0.168934
436,0.168934,0.183905,0.192887,0.180847,0.178038,0.185840,0.159547,0.181417,0.181718,0.168934,...,0.168934,0.184510,0.168934,0.168934,0.168934,0.168934,0.180967,0.172084,0.153885,0.168934
437,0.168934,0.182205,0.190821,0.180490,0.177503,0.185321,0.161335,0.179896,0.180127,0.168934,...,0.168934,0.173351,0.168934,0.168934,0.168934,0.168934,0.179693,0.172492,0.156697,0.168934
438,0.168934,0.179127,0.185169,0.179447,0.179025,0.179848,0.169460,0.178076,0.178342,0.168934,...,0.168934,0.188403,0.168934,0.168934,0.168934,0.168934,0.177483,0.173518,0.165218,0.168934


In [191]:
def get_items_interacted(user_id, interactions_df):
    interacted_items = interactions_df.loc[user_id]['ITEM_ID']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [192]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['ITEM_ID'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'ITEM_ID', 
                                                          right_on = 'ITEM_ID')[['recStrength', 'MARCA', 'MODELO', 'PUERTAS', 'TIPO', 'PRECIO', 'ANO','KILOMETRAJE', 'MARCA_MODELO']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, items_df)

In [193]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
2543 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.40969162995594716, 'recall@10': 0.5494021397105098}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_user_id
389,2,2,3,0.666667,0.666667,6239
287,3,3,3,1.0,1.0,7248
1274,0,0,3,0.0,0.0,12707
643,0,1,3,0.0,0.333333,2971
501,2,2,3,0.666667,0.666667,11670
438,0,0,3,0.0,0.0,14499
364,2,2,3,0.666667,0.666667,3066
846,2,3,3,0.666667,1.0,15928
284,0,0,3,0.0,0.0,6684
1346,2,2,3,0.666667,0.666667,7366


In [194]:
def get_items_interacted(user_id,interactions_df):
    return set(interactions_full_df[interactions_full_df["USER_ID"] == user_id]["ITEM_ID"].values.tolist())

In [203]:
user_id = users_ids[9]
items_to_ignore = get_items_interacted(user_id,interactions_full_df)
print("Usuario {}".format(user_id))
cf_recommender_model.recommend_items(user_id,items_to_ignore,topn=10,verbose=True)

Usuario 121


Unnamed: 0,recStrength,MARCA,MODELO,PUERTAS,TIPO,PRECIO,ANO,KILOMETRAJE,MARCA_MODELO
0,0.204615,Kia,Rio,4,Hatchback,47500000,2015,104355,Kia Rio
1,0.203026,Renault,Twingo,2,Hatchback,18500000,2013,119386,Renault Twingo
2,0.199331,Kia,Rio,4,Hatchback,45500000,2011,234000,Kia Rio
3,0.198059,Kia,Rio,4,Hatchback,49500000,2019,27783,Kia Rio
4,0.197987,Renault,Sandero,4,Hatchback,37000000,2014,110210,Renault Sandero
5,0.196413,Renault,Twingo,2,Hatchback,17500000,2011,105144,Renault Twingo
6,0.195455,Kia,Picanto,4,Hatchback,31000000,2016,56262,Kia Picanto
7,0.195173,Volkswagen,Beetle,4,Hatchback,77000000,2014,161311,Volkswagen Beetle
8,0.194719,Renault,Twingo,2,Hatchback,22000000,2020,631,Renault Twingo
9,0.19453,Renault,Clio,4,Hatchback,22000000,2018,34984,Renault Clio


### Item to item

In [206]:
#Creating a sparse pivot table with users in rows and items in columns
pivot = interactions_train_df.pivot(index='ITEM_ID',columns='USER_ID',values='CLICK').fillna(0)

In [207]:
pivot_norm = pivot.apply(lambda x: x - np.nanmean(x), axis=1)
pivot_norm.head()

USER_ID,29,31,50,54,79,85,87,91,104,121,...,19956,19958,19969,19971,19978,19986,19989,19992,19994,19998
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.98467,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,...,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533,-0.01533
1,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,...,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006
2,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,...,-0.009041,-0.009041,-0.009041,-0.009041,-0.009041,0.990959,-0.009041,-0.009041,-0.009041,-0.009041
3,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,...,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972,-0.012972
4,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,...,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186,-0.012186


In [280]:
# convert into dataframe to make it easier
item_sim_df = pd.DataFrame(cosine_similarity(pivot_norm, pivot_norm), index=pivot_norm.index, columns=pivot_norm.index)
item_sim_df.head()

ITEM_ID,0,1,2,3,4,5,6,7,8,9,...,430,431,432,433,434,435,436,437,438,439
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.308451,0.240392,0.210786,0.217422,0.307375,0.169883,0.302812,0.290069,0.178587,...,-0.009518,-0.011919,-0.011249,-0.010543,-0.011699,-0.011919,-0.011785,-0.010225,-0.010371,-0.01097
1,0.308451,1.0,0.17662,0.048078,0.110456,0.264735,0.174187,0.175993,0.049681,0.215981,...,-0.008088,-0.010129,-0.009559,-0.008959,-0.009942,-0.010129,-0.010015,-0.008689,-0.008814,-0.009322
2,0.240392,0.17662,1.0,0.128212,0.096818,0.111688,0.279558,0.099742,0.200985,0.197484,...,-0.007834,-0.009811,-0.009259,-0.008678,-0.00963,-0.009811,-0.009701,-0.008416,-0.008537,-0.009029
3,0.210786,0.048078,0.128212,1.0,0.241624,0.084775,0.103605,0.219969,0.100007,0.104915,...,-0.008874,-0.011113,-0.010488,-0.00983,-0.010908,-0.011113,-0.010988,-0.009533,-0.00967,-0.010227
4,0.217422,0.110456,0.096818,0.241624,1.0,0.219355,0.048569,0.197047,0.245782,0.253846,...,-0.008567,-0.010729,-0.010126,-0.00949,-0.010531,-0.010729,-0.010609,-0.009204,-0.009336,-0.009874


In [303]:
def get_item(item_id,topn):
    if item_id not in pivot_norm.index:
        return None, None
    else:
        sim_items = item_sim_df.sort_values(by=item_id, ascending=False).index[7:topn+7].tolist()
        sim_score = item_sim_df.sort_values(by=item_id, ascending=False).loc[:, item_id].tolist()[7:topn+7]
        return sim_items, sim_score

In [304]:
def get_name(items_df,item_id):
    lista = items_df[items_df["ITEM_ID"] == item_id].values[0].tolist()[1:]
    lista = [str(i) for i in lista]
    return " ".join(lista)

In [305]:
item_id = 24
sim_items,sim_score = get_item(item_id,10)
item_names = [get_name(items_df,item) for item in sim_items]
item_names

['Chevrolet Optra 4 Sedan 26500000 2017 52203 Chevrolet Optra',
 'Chevrolet Aveo 4 Sedan 22500000 2015 96358 Chevrolet Aveo',
 'Chevrolet Optra 4 Sedan 26000000 2016 82874 Chevrolet Optra',
 'Chevrolet Optra 4 Sedan 24500000 2013 132891 Chevrolet Optra',
 'Toyota Corolla 4 Sedan 76500000 2019 29085 Toyota Corolla',
 'Kia Cerato 4 Sedan 65500000 2011 175407 Kia Cerato',
 'Mazda Seis 4 Sedan 108500000 2017 36868 Mazda Seis',
 'Chevrolet Optra 4 Sedan 23500000 2011 113878 Chevrolet Optra',
 'Toyota Corolla 4 Sedan 75000000 2016 85971 Toyota Corolla',
 'Volkswagen Jetta 4 Sedan 80000000 2010 191277 Volkswagen Jetta']

In [None]:
{
    "items": item_names,
    "scores": sim_score
}

In [6]:
import pandas as pd
import numpy as np


def get_item(item_id,topn):
    if item_id not in pivot_norm.index:
        return None, None
    else:
        sim_items = item_sim_df.sort_values(by=item_id, ascending=False).index[7:topn+7].tolist()
        sim_score = item_sim_df.sort_values(by=item_id, ascending=False).loc[:, item_id].tolist()[7:topn+7]
        return sim_items, sim_score

def get_name(items_df,item_id):
    lista = items_df[items_df["ITEM_ID"] == item_id].values[0].tolist()[1:]
    lista = [str(i) for i in lista]
    return " ".join(lista)

In [7]:

min_interactions = 5
sample = 50000

# data = request.get_json(silent=True, force=True)
item_id = 20

items_df = pd.read_csv("cars/car-items.csv")
interactions_df = pd.read_csv("cars/interactions_personalize.csv")
interactions_df["CLICK"] = np.ones(interactions_df.values.shape[0])
interactions_sample = interactions_df.sample(n=sample)

# Filter users with more than 5 interactions
users_interactions_count = interactions_sample.groupby(["USER_ID","ITEM_ID"]).size().groupby("USER_ID").size()
users_filtered = users_interactions_count[users_interactions_count >= min_interactions].reset_index()[["USER_ID"]]
interactions_from_selected_users_df = interactions_sample.merge(users_filtered,how = 'right',left_on = 'USER_ID',right_on = 'USER_ID')

interactions_full_df = interactions_from_selected_users_df.groupby(['USER_ID', 'ITEM_ID'])['CLICK'].sum().reset_index()

# Split 
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,stratify=interactions_full_df['USER_ID'], test_size=0.20,random_state=42)
# Matriz de usuarios-item ()
pivot_mat = interactions_train_df.pivot(index='ITEM_ID',columns='USER_ID',values='CLICK').fillna(0)
# pivot_norm = pivot_mat.apply(lambda x: x - np.nanmean(x), axis=1)
pivot_norm = pivot_mat.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

# matriz de similitud
item_sim_df = pd.DataFrame(cosine_similarity(pivot_norm, pivot_norm), index=pivot_norm.index, columns=pivot_norm.index)

topn = 20
sim_items,sim_score = get_item(item_id,topn)
item_names = [get_name(items_df,item) for item in sim_items]
response = {
    "items": item_names,
    "scores": sim_score
}

In [11]:
cosine_similarity(pivot_norm, pivot_norm).shape

(440, 440)

In [17]:
pickle_out = open("pivot-norm.pkl","wb")
pickle.dump(item_sim_df, pickle_out)
pickle_out.close()

In [15]:
pickle_out = open("sim-mat.pkl","wb")
pickle.dump(item_sim_df, pickle_out)
pickle_out.close()

In [12]:
filename = "sim-mat.joblib"
joblib.dump(item_sim_df,filename)

['sim-mat.joblib']

In [16]:
pickle_in = open("sim-mat.pkl","rb")
example = pickle.load(pickle_in)
example

ITEM_ID,0,1,2,3,4,5,6,7,8,9,...,430,431,432,433,434,435,436,437,438,439
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.105122,0.146097,0.215724,0.146643,0.038605,0.162976,0.215512,0.200379,0.251415,...,-0.010922,-0.010443,-0.009872,-0.009053,-0.010134,-0.009148,-0.008291,-0.009872,-0.009422,-0.009386
1,0.105122,1.000000,0.369842,0.243149,0.200876,0.045015,0.069551,0.148923,0.264789,0.137635,...,-0.009382,-0.008971,-0.008481,-0.007777,-0.008705,-0.007858,-0.007123,-0.008481,-0.008094,-0.008063
2,0.146097,0.369842,1.000000,0.263291,0.255009,0.133932,0.361845,0.294202,0.314000,0.115456,...,-0.013039,-0.012468,-0.011786,-0.010808,-0.012098,-0.010922,-0.009899,-0.011786,-0.011248,-0.011206
3,0.215724,0.243149,0.263291,1.000000,0.287918,0.036535,0.187668,0.258707,0.157560,0.177255,...,-0.010813,-0.010339,-0.009774,-0.008963,-0.010033,-0.009057,-0.008209,-0.009774,-0.009328,-0.009293
4,0.146643,0.200876,0.255009,0.287918,1.000000,0.120136,0.276673,0.068466,0.088011,0.141366,...,-0.011281,-0.010787,-0.010197,-0.009351,-0.010467,-0.009449,-0.008565,-0.010197,-0.009732,-0.009695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,-0.009148,-0.007858,-0.010922,-0.009057,-0.009449,-0.007699,-0.008224,-0.009703,-0.008737,-0.009542,...,0.225900,0.152633,0.125885,0.099832,0.087283,1.000000,0.247559,0.234832,0.107563,0.129648
436,-0.008291,-0.007123,-0.009899,-0.008209,-0.008565,-0.006978,-0.007454,-0.008795,-0.007919,-0.008649,...,0.096735,0.183796,0.184210,0.065394,0.122866,0.247559,1.000000,0.239508,0.140053,0.274226
437,-0.009872,-0.008481,-0.011786,-0.009774,-0.010197,-0.008308,-0.008875,-0.010472,-0.009429,-0.010298,...,0.192568,0.176481,0.059430,0.178491,0.157553,0.234832,0.239508,1.000000,0.166752,0.129880
438,-0.009422,-0.008094,-0.011248,-0.009328,-0.009732,-0.007929,-0.008470,-0.009994,-0.008999,-0.009828,...,0.184481,0.191608,0.141415,0.091927,0.140826,0.107563,0.140053,0.166752,1.000000,0.119629


In [14]:
joblib_model = joblib.load(filename)

ITEM_ID,0,1,2,3,4,5,6,7,8,9,...,430,431,432,433,434,435,436,437,438,439
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.105122,0.146097,0.215724,0.146643,0.038605,0.162976,0.215512,0.200379,0.251415,...,-0.010922,-0.010443,-0.009872,-0.009053,-0.010134,-0.009148,-0.008291,-0.009872,-0.009422,-0.009386
1,0.105122,1.000000,0.369842,0.243149,0.200876,0.045015,0.069551,0.148923,0.264789,0.137635,...,-0.009382,-0.008971,-0.008481,-0.007777,-0.008705,-0.007858,-0.007123,-0.008481,-0.008094,-0.008063
2,0.146097,0.369842,1.000000,0.263291,0.255009,0.133932,0.361845,0.294202,0.314000,0.115456,...,-0.013039,-0.012468,-0.011786,-0.010808,-0.012098,-0.010922,-0.009899,-0.011786,-0.011248,-0.011206
3,0.215724,0.243149,0.263291,1.000000,0.287918,0.036535,0.187668,0.258707,0.157560,0.177255,...,-0.010813,-0.010339,-0.009774,-0.008963,-0.010033,-0.009057,-0.008209,-0.009774,-0.009328,-0.009293
4,0.146643,0.200876,0.255009,0.287918,1.000000,0.120136,0.276673,0.068466,0.088011,0.141366,...,-0.011281,-0.010787,-0.010197,-0.009351,-0.010467,-0.009449,-0.008565,-0.010197,-0.009732,-0.009695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,-0.009148,-0.007858,-0.010922,-0.009057,-0.009449,-0.007699,-0.008224,-0.009703,-0.008737,-0.009542,...,0.225900,0.152633,0.125885,0.099832,0.087283,1.000000,0.247559,0.234832,0.107563,0.129648
436,-0.008291,-0.007123,-0.009899,-0.008209,-0.008565,-0.006978,-0.007454,-0.008795,-0.007919,-0.008649,...,0.096735,0.183796,0.184210,0.065394,0.122866,0.247559,1.000000,0.239508,0.140053,0.274226
437,-0.009872,-0.008481,-0.011786,-0.009774,-0.010197,-0.008308,-0.008875,-0.010472,-0.009429,-0.010298,...,0.192568,0.176481,0.059430,0.178491,0.157553,0.234832,0.239508,1.000000,0.166752,0.129880
438,-0.009422,-0.008094,-0.011248,-0.009328,-0.009732,-0.007929,-0.008470,-0.009994,-0.008999,-0.009828,...,0.184481,0.191608,0.141415,0.091927,0.140826,0.107563,0.140053,0.166752,1.000000,0.119629


In [39]:
response

{'items': ['Chevrolet Spark GT 4 Hatchback 30500000 2011 218475 Chevrolet Spark GT',
  'Chevrolet Spark GT 4 Hatchback 33000000 2016 55209 Chevrolet Spark GT',
  'Chevrolet Spark GT 4 Hatchback 35000000 2020 947 Chevrolet Spark GT',
  'Ford Fiesta 4 Hatchback 59500000 2019 24868 Ford Fiesta',
  'Nissan March 4 Hatchback 41500000 2013 149205 Nissan March',
  'Renault Twingo 2 Hatchback 19500000 2015 68078 Renault Twingo',
  'Volkswagen Golf 4 Hatchback 76000000 2018 34925 Volkswagen Golf',
  'Volkswagen Gol 4 Hatchback 32500000 2015 90543 Volkswagen Gol',
  'Volkswagen Golf 4 Hatchback 74000000 2014 146731 Volkswagen Golf',
  'Kia Rio 4 Hatchback 47500000 2015 104355 Kia Rio',
  'Nissan March 4 Hatchback 40000000 2010 312318 Nissan March',
  'Volkswagen Golf 4 Hatchback 72000000 2010 149012 Volkswagen Golf',
  'Volkswagen Gol 4 Hatchback 34500000 2019 20265 Volkswagen Gol',
  'Renault Twingo 2 Hatchback 21500000 2019 16572 Renault Twingo',
  'Ford Fiesta 4 Hatchback 57500000 2015 101496

In [31]:
items_df

Unnamed: 0,ITEM_ID,MARCA,MODELO,PUERTAS,TIPO,PRECIO,ANO,KILOMETRAJE,MARCA_MODELO
0,0,Chevrolet,Aveo,4,Sedan,20000000,2010,251215,Chevrolet Aveo
1,1,Chevrolet,Aveo,4,Sedan,20500000,2011,248475,Chevrolet Aveo
2,2,Chevrolet,Aveo,4,Sedan,21000000,2012,191022,Chevrolet Aveo
3,3,Chevrolet,Aveo,4,Sedan,21500000,2013,147422,Chevrolet Aveo
4,4,Chevrolet,Aveo,4,Sedan,22000000,2014,71111,Chevrolet Aveo
...,...,...,...,...,...,...,...,...,...
435,435,Nissan,Pathfinder,4,Camioneta,128000000,2016,66704,Nissan Pathfinder
436,436,Nissan,Pathfinder,4,Camioneta,128500000,2017,72730,Nissan Pathfinder
437,437,Nissan,Pathfinder,4,Camioneta,129000000,2018,46542,Nissan Pathfinder
438,438,Nissan,Pathfinder,4,Camioneta,129500000,2019,14990,Nissan Pathfinder


In [7]:
response

{'items': ['Chevrolet Aveo 4 Sedan 21000000 2012 191022 Chevrolet Aveo',
  'Chevrolet Aveo 4 Sedan 23000000 2016 96141 Chevrolet Aveo',
  'Nissan Sentra 4 Sedan 61500000 2013 103004 Nissan Sentra',
  'Mazda Tres 4 Sedan 70000000 2010 154690 Mazda Tres',
  'Nissan Sentra 4 Sedan 62000000 2014 168036 Nissan Sentra',
  'Mazda Tres 4 Sedan 74000000 2018 31970 Mazda Tres',
  'Mazda Tres 4 Sedan 73500000 2017 83644 Mazda Tres',
  'Toyota Corolla 4 Sedan 76500000 2019 29085 Toyota Corolla',
  'Chevrolet Sail 4 Sedan 31500000 2013 139919 Chevrolet Sail',
  'Toyota Corolla 4 Sedan 76000000 2018 36924 Toyota Corolla',
  'Chevrolet Sail 4 Sedan 30000000 2010 158056 Chevrolet Sail',
  'Chevrolet Aveo 4 Sedan 21500000 2013 147422 Chevrolet Aveo',
  'Toyota Corolla 4 Sedan 75000000 2016 85971 Toyota Corolla',
  'Volkswagen Jetta 4 Sedan 84500000 2019 20466 Volkswagen Jetta',
  'Chevrolet Sail 4 Sedan 34500000 2019 18669 Chevrolet Sail',
  'Nissan Sentra 4 Sedan 60500000 2011 281923 Nissan Sentra',
 

In [3]:
min_interactions = 5
sample = 50000

# data = request.get_json(silent=True, force=True)
item_id = 20

items_df = pd.read_csv("cars/car-items.csv")
interactions_df = pd.read_csv("cars/interactions_personalize.csv")
interactions_df["CLICK"] = np.ones(interactions_df.values.shape[0])
interactions_sample = interactions_df.sample(n=sample)

# Filter users with more than 5 interactions
users_interactions_count = interactions_sample.groupby(["USER_ID","ITEM_ID"]).size().groupby("USER_ID").size()
users_filtered = users_interactions_count[users_interactions_count >= min_interactions].reset_index()[["USER_ID"]]
interactions_from_selected_users_df = interactions_sample.merge(users_filtered,how = 'right',left_on = 'USER_ID',right_on = 'USER_ID')

interactions_full_df = interactions_from_selected_users_df.groupby(['USER_ID', 'ITEM_ID'])['CLICK'].sum().reset_index()

# Split 
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,stratify=interactions_full_df['USER_ID'], test_size=0.20,random_state=42)
# Matriz de usuarios-item ()
pivot_mat = interactions_train_df.pivot(index='ITEM_ID',columns='USER_ID',values='CLICK').fillna(0)

In [4]:
pivot_mat

USER_ID,1,6,9,20,23,24,36,40,65,72,...,19946,19948,19954,19956,19958,19969,19970,19972,19974,19984
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:


# Note: As we are subtracting the mean from each rating to standardize
# all users with only one rating or who had rated everything the same will be dropped

# Normalize the values
piv_norm = pivot_mat.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)


# Drop all columns containing only zeros representing users who did not rate
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]



In [43]:
piv_sparse = csr_matrix(piv_norm.values)

In [52]:


# item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)



In [53]:
# Inserting the similarity matricies into dataframe objects

# item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)