In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('data/users_corrected.csv')

In [3]:
df.head

<bound method NDFrame.head of     user_id username                   join_date  reputation
0         1    Aarav  2022-05-18 07:38:17.917924         371
1         2     Diya  2022-10-17 07:38:17.917954         342
2         3   Aditya  2023-08-02 07:38:17.917961          80
3         4     Pari  2022-09-28 07:38:17.917966         231
4         5    Aarav  2023-09-14 07:38:17.917970          19
..      ...      ...                         ...         ...
95       96     Sara  2024-03-12 07:38:17.918394         229
96       97  Krishna  2023-05-04 07:38:17.918399          12
97       98      Ira  2024-01-10 07:38:17.918403         261
98       99  Shanaya  2022-06-24 07:38:17.918407           2
99      100  Shanaya  2022-06-27 07:38:17.918412         451

[100 rows x 4 columns]>

In [4]:
df_1=pd.read_csv('data/items.csv')

In [5]:
df_2=pd.read_csv('data/interactions.csv')

In [6]:
df_1.head

<bound method NDFrame.head of     item_id item_name                           category  \
0       101   item101                      TIE-up sarees   
1       102   item102                         Nike shoes   
2       103   item103                          Black tie   
3       104   item104                          Black tie   
4       105   item105                Cocktail dress code   
5       106   item106                               Suit   
6       107   item107                         Nike shoes   
7       108   item108                      TIE-up sarees   
8       109   item109                          Chudidhar   
9       110   item110                Cocktail dress code   
10      111   item111                    Business casual   
11      112   item112                          Chudidhar   
12      113   item113                      TIE-up sarees   
13      114   item114                      TIE-up sarees   
14      115   item115                               Suit   
15      11

In [7]:
df_2.head

<bound method NDFrame.head of       user_id  item_id interaction_type                   timestamp
0         428      181             like  2022-03-05 11:33:33.879798
1         174      114             view  2022-07-11 11:33:33.879828
2          80      101          comment  2023-01-03 11:33:33.879834
3         114      103             view  2023-04-22 11:33:33.879839
4         358      126             like  2021-10-20 11:33:33.879843
...       ...      ...              ...                         ...
4995      484      191          comment  2023-04-07 11:33:33.898077
4996      452      183             view  2024-06-18 11:33:33.898078
4997      437      124          comment  2023-08-30 11:33:33.898080
4998       67      190             like  2024-05-15 11:33:33.898082
4999      308      187             like  2022-11-23 11:33:33.898084

[5000 rows x 4 columns]>

In [8]:
df.shape

(100, 4)

In [9]:
df_1.shape

(50, 4)

In [10]:
df_2.shape

(5000, 4)

In [11]:
final=df.merge(df_2,on="user_id")
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           1003 non-null   int64 
 1   username          1003 non-null   object
 2   join_date         1003 non-null   object
 3   reputation        1003 non-null   int64 
 4   item_id           1003 non-null   int64 
 5   interaction_type  1003 non-null   object
 6   timestamp         1003 non-null   object
dtypes: int64(3), object(4)
memory usage: 55.0+ KB


In [12]:
final.shape

(1003, 7)

In [13]:
recomm=final.merge(df_1,on="item_id")
recomm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           489 non-null    int64 
 1   username          489 non-null    object
 2   join_date         489 non-null    object
 3   reputation        489 non-null    int64 
 4   item_id           489 non-null    int64 
 5   interaction_type  489 non-null    object
 6   timestamp         489 non-null    object
 7   item_name         489 non-null    object
 8   category          489 non-null    object
 9   description       489 non-null    object
dtypes: int64(3), object(7)
memory usage: 38.3+ KB


In [14]:
#finding missing data, to drop them 'dropna' is used
recomm.isnull().sum()

user_id             0
username            0
join_date           0
reputation          0
item_id             0
interaction_type    0
timestamp           0
item_name           0
category            0
description         0
dtype: int64

In [15]:
recomm.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
484    False
485    False
486    False
487    False
488    False
Length: 489, dtype: bool

In [16]:
# Calculate implicit ratings
user_item_interactions = df_2.groupby(['user_id', 'item_id']).size().reset_index(name='total_interactions')
max_interactions_per_user = user_item_interactions.groupby('user_id')['total_interactions'].max().reset_index(name='max_interactions')
ratings = pd.merge(user_item_interactions, max_interactions_per_user, on='user_id')
ratings['implicit_rating'] = ratings['total_interactions'] / ratings['max_interactions']

# Pivot the ratings to create a user-item matrix
user_item_matrix = ratings.pivot(index='user_id', columns='item_id', values='implicit_rating').fillna(0)


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Convert the user-item matrix to a sparse matrix format
user_item_sparse_matrix = csr_matrix(user_item_matrix.values)

# Calculate the cosine similarity between users
user_similarity = cosine_similarity(user_item_sparse_matrix)

# Create a DataFrame for the user similarity
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


In [37]:
def get_user_recommendations(user_id, user_similarity_df, user_item_matrix, items_df, num_recommendations=5):
    # Get the user's similarity scores
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)

    # Get items interacted with by the similar users
    similar_users_interactions = user_item_matrix.loc[similar_users.index]
    
    # Calculate the weighted sum of ratings for each item
    item_recommendations = similar_users_interactions.T.dot(similar_users).div(similar_users.sum(), axis=0)
    
    # Exclude items the user has already interacted with
    user_interactions = user_item_matrix.loc[user_id]
    item_recommendations = item_recommendations[user_interactions == 0]
    
    # Get the top recommendations
    top_recommendations = item_recommendations.sort_values(ascending=False).head(num_recommendations)
    
    # Join with the items dataframe to get item names and categories
    recommended_items = top_recommendations.reset_index().merge(items_df, on='item_id', how='left')
    
    return recommended_items[['item_id', 'item_name', 'category']]

# Example: Get recommendations for AARAV
user_id = 1
recommendations = get_user_recommendations(user_id, user_similarity_df, user_item_matrix, df_1)
print(recommendations)


   item_id item_name             category
0      114   item114        TIE-up sarees
1      104   item104            Black tie
2      105   item105  Cocktail dress code
3      113   item113        TIE-up sarees
4      191       NaN                  NaN
