<a href="https://colab.research.google.com/github/parthgarg123/flipkart_personalized_prod_recommendation/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import hashlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import mean_squared_error

from scipy.sparse.linalg import svds # for sparse matrices

In [2]:
#Import the data set
df = pd.read_csv('/content/events.csv', header=None) #There are no headers in the data file

df.columns = ['event_time', 'event_type', 'prod_id', 'cat_id','cat_code','brand','price','user_id','user_sess'] #Adding column names

df = df.drop(['event_time','cat_code','user_sess','brand'], axis=1) #Dropping timestamp

df_copy = df.copy(deep=True) #Copying the data to another dataframe

In [44]:
def hash_to_float(input_str, min_value, max_value):
    # Use SHA-256 hash function
    sha256_hash = hashlib.sha256(repr(input_str).encode()).hexdigest()

    # Convert the hash to a floating-point value and map it to the desired range
    hash_float = int(sha256_hash, 16) / float(int('F'*64, 16))  # Convert hex hash to float
    mapped_float = min_value + (hash_float * (max_value - min_value))  # Map to the desired float range
    return mapped_float

# Define the range for float assignment
min_float = 1.5
max_float = 5.0

# Generate hash-based floats for each product
df['ratings'] = df['prod_id'].apply(lambda x: hash_to_float(x, min_float, max_float))

df.head()

#random_rating_len = len(df)

#random_rating = np.random.uniform(low=2, high=5, size=random_rating_len)

#df['ratings'] = random_rating*/

Unnamed: 0,event_type,prod_id,cat_id,price,user_id,ratings
0,event_type,product_id,category_id,price,user_id,4.03267
1,view,1996170,2144415922528452715,31.90,1515915625519388267,2.252158
2,view,139905,2144415926932472027,17.16,1515915625519380411,2.976226
3,view,215454,2144415927158964449,9.81,1515915625513238515,2.613014
4,view,635807,2144415923107266682,113.81,1515915625519014356,4.878066


In [30]:
rows, columns = df.shape
print("No of rows = ", rows)
print("No of columns = ", columns)

No of rows =  885130
No of columns =  6


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885130 entries, 0 to 885129
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   event_type  885130 non-null  object 
 1   prod_id     885130 non-null  object 
 2   cat_id      885130 non-null  object 
 3   price       885130 non-null  object 
 4   user_id     885130 non-null  object 
 5   ratings     885130 non-null  float64
dtypes: float64(1), object(5)
memory usage: 40.5+ MB


In [32]:
df.isna().sum()

event_type    0
prod_id       0
cat_id        0
price         0
user_id       0
ratings       0
dtype: int64

In [40]:
print('Number of unique USERS in Raw data = ', df['user_id'].nunique())
print('Number of unique ITEMS in Raw data = ', df['prod_id'].nunique())
print('Number of unique ratings in Raw data = ', df['ratings'].nunique())

Number of unique USERS in Raw data =  409025
Number of unique ITEMS in Raw data =  68566
Number of unique ratings in Raw data =  68566


In [34]:
most_active = df.groupby('user_id').size().sort_values(ascending=False)[:10]
most_active

user_id
1515915625554995474    572
1515915625527763086    424
1515915625591251010    363
1515915625591659523    339
1515915625537803839    329
1515915625568273951    312
1515915625599852988    288
1515915625598794428    260
1515915625536567608    259
1515915625593948004    246
dtype: int64

In [57]:
counts = df['user_id'].value_counts()
df_final = df[df['user_id'].isin(counts[counts >= 500].index)]

In [58]:
print('The number of observations in the final data =', len(df_final))
print('Number of unique USERS in the final data = ', df_final['user_id'].nunique())
print('Number of unique PRODUCTS in the final data = ', df_final['prod_id'].nunique())

The number of observations in the final data = 572
Number of unique USERS in the final data =  1
Number of unique PRODUCTS in the final data =  9


In [59]:
best_prod_cat = df_final.groupby('cat_id').size().sort_values(ascending=False)[:10]
best_prod_cat
print('Number of unique ITEMS in final data = ', df_final['prod_id'].nunique())

Number of unique ITEMS in final data =  9


In [60]:
duplicates = df_final[df_final.duplicated(['user_id', 'prod_id'], keep=False)]
# Display duplicates
print(duplicates)

       event_type  prod_id               cat_id   price              user_id  \
410323       view  3979136  2144415923744800906  297.19  1515915625554995474   
410324       view  3979025  2144415923744800906   350.6  1515915625554995474   
410325       view  3978887  2144415923744800906  369.17  1515915625554995474   
410395       view   483337  2144415923744800906  365.25  1515915625554995474   
410396       view   669304  2144415923744800906  478.59  1515915625554995474   
...           ...      ...                  ...     ...                  ...   
825627       view   247085  2144415923744800906  262.48  1515915625554995474   
825700       view   483337  2144415923744800906  365.25  1515915625554995474   
825701       view   847412  2144415923744800906   120.0  1515915625554995474   
825702       view    37774  2144415923744800906  167.62  1515915625554995474   
825704       view   247085  2144415923744800906  262.48  1515915625554995474   

         ratings  
410323  3.765014  
4

In [63]:
df_final_agg = df_final.groupby(['user_id', 'prod_id'], as_index=False)['ratings'].mean()  # Or 'sum()' instead of 'mean()'
df_final_agg

Unnamed: 0,user_id,prod_id,ratings
0,1515915625554995474,37774,3.125587
1,1515915625554995474,247085,3.477133
2,1515915625554995474,322353,2.016978
3,1515915625554995474,483337,4.927815
4,1515915625554995474,669304,2.292295
5,1515915625554995474,847412,1.761482
6,1515915625554995474,3978887,4.074295
7,1515915625554995474,3979025,4.417428
8,1515915625554995474,3979136,3.765014


In [65]:
#Creating the interaction matrix of products and users based on ratings and replacing NaN value with 0
final_ratings_matrix = df_final_agg.pivot(index = 'user_id', columns ='prod_id', values = 'ratings').fillna(0)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

#Finding the number of non-zero entries in the interaction matrix
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

#Finding the possible number of ratings as per the number of users and products
possible_num_of_ratings = final_ratings_matrix.shape[0] * final_ratings_matrix.shape[1]
print('possible_num_of_ratings = ', possible_num_of_ratings)

#Density of ratings
density = (given_num_of_ratings/possible_num_of_ratings)
density *= 100
print ('density: {:4.2f}%'.format(density))

final_ratings_matrix.head()

Shape of final_ratings_matrix:  (1, 9)
given_num_of_ratings =  9
possible_num_of_ratings =  9
density: 100.00%


prod_id,37774,247085,322353,483337,669304,847412,3978887,3979025,3979136
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1515915625554995474,3.125587,3.477133,2.016978,4.927815,2.292295,1.761482,4.074295,4.417428,3.765014
