In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import hashlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import mean_squared_error

from scipy.sparse.linalg import svds # for sparse matrices

In [None]:
# Load the raw e-commerce event log.
# The CSV carries its own header row. Reading it with header=None turned that row
# into a data record (index 0 held the literal strings 'event_type',
# 'product_id', ...) and forced every column to dtype object. header=0 consumes
# the file's header line, while `names` substitutes the shorter column names.
df = pd.read_csv(
    '/content/events (1).csv',
    header=0,
    names=['event_time', 'event_type', 'prod_id', 'cat_id', 'cat_code',
           'brand', 'price', 'user_id', 'user_sess'],
    # Keep identifiers as strings: category/user ids are 19-digit values and one
    # row has them missing, so numeric inference would fall back to float64 and
    # silently round the ids.
    dtype={'prod_id': str, 'cat_id': str, 'user_id': str},
)

# Drop the columns this analysis does not use (timestamp, category code, session id, brand)
df = df.drop(['event_time', 'cat_code', 'user_sess', 'brand'], axis=1)

df_copy = df.copy(deep=True)  # Independent copy of the trimmed frame

In [None]:
def hash_to_float(input_str, min_value, max_value):
    """Deterministically map a value to a float between min_value and max_value.

    The SHA-256 digest of ``repr(input_str)`` is read as a 256-bit unsigned
    integer, scaled to the unit interval by dividing by the largest possible
    digest value, and then stretched linearly onto the requested range. Equal
    inputs therefore always produce the same float.
    """
    digest = hashlib.sha256(repr(input_str).encode()).digest()

    # Largest 256-bit value (64 hex 'F' digits); dividing by it normalises to [0, 1].
    largest_digest = float((1 << 256) - 1)
    unit_fraction = int.from_bytes(digest, 'big') / largest_digest

    span = max_value - min_value
    return min_value + (unit_fraction * span)

# Bounds of the synthetic rating scale.
min_float = 1.5
max_float = 5.0

# The event log has no rating column, so every row gets a synthetic rating derived
# from a hash of its product id: the same product always receives the same value.
# (An earlier experiment drew uniformly random ratings per row instead.)
df['ratings'] = [hash_to_float(product, min_float, max_float) for product in df['prod_id']]

df.head()

Unnamed: 0,event_type,prod_id,cat_id,price,user_id,ratings
0,event_type,product_id,category_id,price,user_id,4.03267
1,view,1996170,2144415922528452715,31.90,1515915625519388267,2.252158
2,view,139905,2144415926932472027,17.16,1515915625519380411,2.976226
3,view,215454,2144415927158964449,9.81,1515915625513238515,2.613014
4,view,635807,2144415923107266682,113.81,1515915625519014356,4.878066


In [None]:
# Report the dimensions of the working frame.
rows = len(df.index)
columns = len(df.columns)
print("No of rows = ", rows)
print("No of columns = ", columns)

No of rows =  61979
No of columns =  6


In [None]:
# Summarise the column dtypes and non-null counts of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61979 entries, 0 to 61978
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   event_type  61979 non-null  object 
 1   prod_id     61979 non-null  object 
 2   cat_id      61978 non-null  object 
 3   price       61978 non-null  object 
 4   user_id     61978 non-null  object 
 5   ratings     61979 non-null  float64
dtypes: float64(1), object(5)
memory usage: 2.8+ MB


In [None]:
# Count the missing values in each column.
df.isna().sum()

event_type    0
prod_id       0
cat_id        1
price         1
user_id       1
ratings       0
dtype: int64

In [None]:
# Distinct users, products and rating values in the unfiltered data.
raw_uniques = df[['user_id', 'prod_id', 'ratings']].nunique()
print('Number of unique USERS in Raw data = ', raw_uniques['user_id'])
print('Number of unique ITEMS in Raw data = ', raw_uniques['prod_id'])
print('Number of unique ratings in Raw data = ', raw_uniques['ratings'])

Number of unique USERS in Raw data =  32683
Number of unique ITEMS in Raw data =  16486
Number of unique ratings in Raw data =  16486


In [None]:
# The ten users with the most logged events, busiest first.
events_per_user = df.groupby('user_id').size()
most_active = events_per_user.sort_values(ascending=False).head(10)
most_active

user_id
1515915625444050265    100
1515915625517787550    100
1515915625353514897     86
1515915625522463421     81
1515915625522779180     76
1515915625360775587     66
1515915625520712323     65
1515915625522082569     64
1515915625520191157     63
1515915625425059411     61
dtype: int64

In [None]:
# Keep only the heavy users: those with at least 65 logged events.
counts = df['user_id'].value_counts()
heavy_users = counts[counts >= 65].index

# .copy() gives df_final its own data. Later cells add columns to it, and assigning
# into a filtered slice of df triggers pandas' SettingWithCopyWarning (which the
# notebook-wide warnings filter hides).
df_final = df[df['user_id'].isin(heavy_users)].copy()

In [None]:
# Size of the filtered subset: event rows, distinct users, distinct products.
final_uniques = df_final[['user_id', 'prod_id']].nunique()
print('The number of observations in the final data =', df_final.shape[0])
print('Number of unique USERS in the final data = ', final_uniques['user_id'])
print('Number of unique PRODUCTS in the final data = ', final_uniques['prod_id'])

The number of observations in the final data = 574
Number of unique USERS in the final data =  7
Number of unique PRODUCTS in the final data =  213


In [None]:
# The ten categories with the most events in the filtered data, largest first.
best_prod_cat = df_final.groupby('cat_id').size().sort_values(ascending=False)[:10]
print('Number of unique ITEMS in final data = ', df_final['prod_id'].nunique())

# A bare expression is only rendered when it is the last statement of a cell.
# It used to sit above the print call, so the category ranking was never shown.
best_prod_cat

Number of unique ITEMS in final data =  213


In [None]:
# Per (user, product) interaction counts, broadcast onto every event row of the pair.
#
# The previous version computed event_count on the 'view' rows only and called
# .fillna(0) on that partial result *before* assigning it, where it had nothing to
# fill. Index alignment then left NaN on every non-view row, and the aggregation
# cell that groups on event_count dropped those rows (groupby discards NaN keys),
# losing any pair that had no view at all. Counting views across the whole pair
# gives every row of the pair the same value, 0 when the pair has no views.
pair_keys = [df_final['user_id'], df_final['prod_id']]
is_view = df_final['event_type'].eq('view').astype(int)

# assign() builds a new frame, so this never writes into a slice of df.
df_final = df_final.assign(
    # Total number of events the user logged for the product (counted on df_final
    # itself; the user filter keeps all rows of a user, so totals are unchanged).
    count=df_final.groupby(['user_id', 'prod_id'])['prod_id'].transform('count'),
    # Number of those events that were views.
    event_count=is_view.groupby(pair_keys).transform('sum'),
)


In [None]:
# Show every event row whose (user, product) pair occurs more than once.
is_repeated_pair = df_final.duplicated(subset=['user_id', 'prod_id'], keep=False)
duplicates = df_final.loc[is_repeated_pair]
print(duplicates)

      event_type  prod_id               cat_id   price              user_id  \
331         view  1507368  2144415922016747613  172.86  1515915625360775587   
389         view  1803694  2144415923694469257  240.98  1515915625360775587   
428         view  1803694  2144415923694469257  240.98  1515915625360775587   
458         view  1803694  2144415923694469257  240.98  1515915625360775587   
2840        view  3979136  2144415923744800906  297.19  1515915625353514897   
...          ...      ...                  ...     ...                  ...   
59218       cart  4079420  2144415922427789416  449.51  1515915625522779180   
59226       view  1821824  2144415922427789416  399.46  1515915625522779180   
59227       cart  1821824  2144415922427789416  399.46  1515915625522779180   
59257       view  3550606  2144415922427789416  509.54  1515915625522779180   
60118       view  4155385  2144415922427789416  539.57  1515915625520712323   

        ratings  count  event_count  
331    3.7821

In [None]:
# Collapse the event rows to one record per distinct combination of the grouping
# columns, averaging the rating within each group ('sum()' would be the alternative).
group_keys = ['user_id', 'prod_id', 'cat_id', 'price', 'count', 'event_count']
df_final_agg = df_final.groupby(group_keys, as_index=False)['ratings'].mean()
df_final_agg


Unnamed: 0,user_id,prod_id,cat_id,price,count,event_count,ratings
0,1515915625353514897,247085,2144415923744800906,262.48,12.0,12.0,1.671026
1,1515915625353514897,322353,2144415923744800906,575.73,6.0,6.0,4.864812
2,1515915625353514897,37774,2144415923744800906,167.62,12.0,12.0,1.962064
3,1515915625353514897,3978887,2144415923744800906,369.17,6.0,6.0,3.955855
4,1515915625353514897,3979025,2144415923744800906,350.60,7.0,7.0,4.123565
...,...,...,...,...,...,...,...
209,1515915625522779180,4102474,2144415926806642904,57.21,1.0,1.0,3.021860
210,1515915625522779180,4171147,2144415973346640379,73.24,4.0,2.0,4.326947
211,1515915625522779180,5913,2144415922167742561,49.57,4.0,2.0,2.644628
212,1515915625522779180,880499,2144415926806642904,47.32,5.0,3.0,3.238865


In [None]:
# Interaction strength per event type; event types missing from the mapping score 0.
mapping = {'view': 2, 'cart': 3, 'others': 1}

# The previous version assigned df['event_type'].map(mapping) straight into
# df_final_agg. pandas aligns such an assignment on the index, but df is indexed by
# raw event row while df_final_agg carries a fresh 0..n-1 index from the groupby.
# Aggregated row i therefore received the event type of raw row i, which has
# nothing to do with its own user/product pair.
# Encode the events of the filtered frame instead and attach one value per pair.
# NOTE(review): using the strongest (max) event of a pair is an assumption —
# confirm that this is the intended way to summarise several events.
event_weight = df_final['event_type'].map(mapping).fillna(0)
pair_weight = (
    event_weight
    .groupby([df_final['user_id'], df_final['prod_id']])
    .max()
    .rename('encoded_event')
    .reset_index()
)

df_final_agg = (
    df_final_agg
    .drop(columns='encoded_event', errors='ignore')  # keeps the cell re-runnable
    .merge(pair_weight, on=['user_id', 'prod_id'], how='left')
)
df_final_agg

Unnamed: 0,user_id,prod_id,cat_id,price,count,event_count,ratings,encoded_event
0,1515915625353514897,247085,2144415923744800906,262.48,12.0,12.0,1.671026,0.0
1,1515915625353514897,322353,2144415923744800906,575.73,6.0,6.0,4.864812,2.0
2,1515915625353514897,37774,2144415923744800906,167.62,12.0,12.0,1.962064,2.0
3,1515915625353514897,3978887,2144415923744800906,369.17,6.0,6.0,3.955855,2.0
4,1515915625353514897,3979025,2144415923744800906,350.60,7.0,7.0,4.123565,2.0
...,...,...,...,...,...,...,...,...
209,1515915625522779180,4102474,2144415926806642904,57.21,1.0,1.0,3.021860,2.0
210,1515915625522779180,4171147,2144415973346640379,73.24,4.0,2.0,4.326947,2.0
211,1515915625522779180,5913,2144415922167742561,49.57,4.0,2.0,2.644628,2.0
212,1515915625522779180,880499,2144415926806642904,47.32,5.0,3.0,3.238865,2.0


In [None]:
# Sanity check: list any (user, product) pair that still repeats after aggregation.
repeated_mask = df_final_agg.duplicated(subset=['user_id', 'prod_id'], keep=False)
duplicates = df_final_agg.loc[repeated_mask]
print(duplicates)

# Evaluated but not rendered, since it is not the cell's last expression.
df_final_agg['count'].sum()
df_final_agg

Empty DataFrame
Columns: [user_id, prod_id, cat_id, price, count, event_count, ratings, encoded_event]
Index: []


Unnamed: 0,user_id,prod_id,cat_id,price,count,event_count,ratings,encoded_event
0,1515915625353514897,247085,2144415923744800906,262.48,12.0,12.0,1.671026,0.0
1,1515915625353514897,322353,2144415923744800906,575.73,6.0,6.0,4.864812,2.0
2,1515915625353514897,37774,2144415923744800906,167.62,12.0,12.0,1.962064,2.0
3,1515915625353514897,3978887,2144415923744800906,369.17,6.0,6.0,3.955855,2.0
4,1515915625353514897,3979025,2144415923744800906,350.60,7.0,7.0,4.123565,2.0
...,...,...,...,...,...,...,...,...
209,1515915625522779180,4102474,2144415926806642904,57.21,1.0,1.0,3.021860,2.0
210,1515915625522779180,4171147,2144415973346640379,73.24,4.0,2.0,4.326947,2.0
211,1515915625522779180,5913,2144415922167742561,49.57,4.0,2.0,2.644628,2.0
212,1515915625522779180,880499,2144415926806642904,47.32,5.0,3.0,3.238865,2.0


In [None]:
# Build the user x product interaction matrix. The cell values are the per-pair
# 'event_count' figures (not the ratings); pairs without a record become 0.
final_ratings_matrix = (
    df_final_agg
    .pivot(index='user_id', columns='prod_id', values='event_count')
    .fillna(0)
)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

# Number of cells that hold a non-zero value
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

# Total number of cells: one per (user, product) combination
n_users, n_products = final_ratings_matrix.shape
possible_num_of_ratings = n_users * n_products
print('possible_num_of_ratings = ', possible_num_of_ratings)

# Share of non-zero cells, as a percentage
density = (given_num_of_ratings / possible_num_of_ratings) * 100
print('density: {:4.2f}%'.format(density))

final_ratings_matrix.head()

Shape of final_ratings_matrix:  (7, 213)
given_num_of_ratings =  214
possible_num_of_ratings =  1491
density: 14.35%


prod_id,1009727,1012952,1013041,1013042,1013047,1013048,1013051,1013052,1040135,1044429,...,877060,880499,889270,903858,919584,937789,938118,940654,947169,952888
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1515915625353514897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625360775587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1515915625444050265,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0
1515915625517787550,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625520712323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Per-product rating statistics over the filtered events.
# Select 'ratings' before aggregating: calling .mean() on the whole grouped frame
# also tries to average the string columns (event_type, cat_id, ...), which raises
# a TypeError on pandas >= 2.0 where numeric_only defaults to False.
ratings_by_product = df_final.groupby('prod_id')['ratings']

# Average rating for each product
average_rating = ratings_by_product.mean()

# Number of rated rows for each product
count_rating = ratings_by_product.count()

# Combine both statistics into one frame indexed by product id
final_rating = pd.DataFrame({'avg_rating': average_rating, 'rating_count': count_rating})

# Best average rating first
final_rating = final_rating.sort_values(by='avg_rating', ascending=False)

final_rating.head()

Unnamed: 0_level_0,avg_rating,rating_count
prod_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4013214,4.995559,6
3627528,4.982337,3
4102186,4.976405,2
3506278,4.970301,1
1354359,4.933766,1


In [None]:
# Rank-based recommender: best average rating among sufficiently rated products.
def top_n_products(final_rating, n, min_interaction):
    """Return the index labels of the ``n`` best-rated products.

    Parameters
    ----------
    final_rating : DataFrame with 'avg_rating' and 'rating_count' columns; its
        index labels are what gets returned.
    n : maximum number of labels to return.
    min_interaction : exclusive lower bound; only rows whose 'rating_count' is
        strictly greater than this value are considered.

    Returns
    -------
    pandas Index ordered by descending 'avg_rating'.
    """
    popular_enough = final_rating.loc[final_rating['rating_count'] > min_interaction]
    ranked = popular_enough.sort_values(by='avg_rating', ascending=False)
    return ranked.index[:n]

In [None]:
# Top 2 products by average rating among those with more than 50 ratings.
# NOTE(review): the saved output below lists five ids although n=2, so it does not
# match this call — re-run the cell to refresh it.
list(top_n_products(final_rating, 2, 50))

['483337', '1821775', '3829912', '1079510', '4155385']

In [None]:
# Top 5 products by average rating among those with more than 100 ratings.
list(top_n_products(final_rating, 5, 100))

[]

In [None]:
# Show the first rows of the user x product interaction matrix again.
final_ratings_matrix.head()

prod_id,1009727,1012952,1013041,1013042,1013047,1013048,1013051,1013052,1040135,1044429,...,877060,880499,889270,903858,919584,937789,938118,940654,947169,952888
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1515915625353514897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625360775587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1515915625444050265,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0
1515915625517787550,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515915625520712323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
