# 1. Modules and functions

In [1]:
import numpy as np
import pandas as pd

from itertools import islice, cycle, product

import warnings
warnings.filterwarnings('ignore')

## 1. 1. Helper functions to avoid copy paste

# 2. Data

## 2. 1. Load data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [3]:
# interactions data
interactions = pd.read_csv('interactions.csv', index_col = 0)
interactions.head()

Unnamed: 0,platform,utc_event_time,utc_event_date,user_id,event_type,price,quantity,item_id,main_category,sub_category,...,timestamp_event_time,year,month,day,hour,lag_event_timestamp,is_first_event,timestamp_first_event,session_id,session_duration
136385,Android,2023-07-31 12:00:05+00:00,2023-07-31,173259092511508332961805537433278356613,view_item,163800.0,1.0,260726654727418165900362987409768274857,144086449872839020031157148875692321280,173321378319243508149201199923244637487,...,2023-07-31 15:00:05,2023,7,31,15,2023-07-31 05:21:49,1,2023-07-31 15:00:05,25769803996,0
178110,Android,2023-07-31 16:50:47+00:00,2023-07-31,175531225445750135224629001292094625386,view_item,329000.0,1.0,62874459636212199856820592742465687422,154366424108017296963249370561478064618,60270833747106288434434595877071461453,...,2023-07-31 19:50:47,2023,7,31,19,,1,2023-07-31 19:50:47,5694,0
178097,Site,2023-07-31 18:51:29+00:00,2023-07-31,175456934034865273087601583309920541917,view_item,94700.0,1.0,301113677287200808879250957235197355461,13798607401255592813561468416979820649,266864194786171476250208354353832955366,...,2023-07-31 21:51:29,2023,7,31,21,,1,2023-07-31 21:51:29,3703,0
47270,Site,2023-07-31 06:35:50+00:00,2023-07-31,136129747179209875179444036984951419826,view_item,30500.0,1.0,146938954500183309632143980050512715097,45117301157169820192891250731273658842,138272492718411940153002045605569451150,...,2023-07-31 09:35:50,2023,7,31,9,,1,2023-07-31 09:35:50,25769805241,0
178078,Android,2023-07-31 10:19:04+00:00,2023-07-31,175191682770652346432502214192078716096,view_item,45500.0,1.0,303613093764450740807978528504906103484,226036533030553532728399561714766831,200033172283332864904098097188143308177,...,2023-07-31 13:19:04,2023,7,31,13,,1,2023-07-31 13:19:04,8589940079,0


## 2.2 Data preparation

The objective of this step is to identify movies from two datasets that were watched by users.

In [4]:
interactions.dtypes

platform                  object
utc_event_time            object
utc_event_date            object
user_id                   object
event_type                object
price                    float64
quantity                 float64
item_id                   object
main_category             object
sub_category              object
id                         int64
timestamp_event_time      object
year                       int64
month                      int64
day                        int64
hour                       int64
lag_event_timestamp       object
is_first_event             int64
timestamp_first_event     object
session_id                 int64
session_duration           int64
dtype: object

In [5]:
# create users input
users = interactions[['user_id']].drop_duplicates().reset_index(drop = True)

# 3. Model

Let's define our baseline popularity recommender BaselineRecommender - top items based on amount of purchases with possibility to get by any group(s)

The pipeline will be similar to most python ML modules -- it will have two methods in the end: fit() and recommend()
1. The logic of fit() as follow:
- Initiate recommendation based on total amount of purchases from all items rating from all items with purchases;
- Prepare list of interacted items by users (?)
- If we set groups - we get recommendations i.e. calculate movie ratings by groups:
    - If we get NaN, we fill with base recommendations 
    - If we get less than required number of candidates, we populate from base recommendations

2. The logic of recommend():
- Return base recommendations if users data is not set;
- In case of category wise requirement -- we get results of our fit

In [6]:
def compute_popularity(df: pd.DataFrame, item_id: str, max_candidates: int):
    """
    calculates amount of purchases to define popular items
    """
    popular_items = interactions[interactions['event_type'] == 'purchase'].groupby('item_id')\
    .agg({'quantity': sum})\
    .sort_values(['quantity'], ascending = False)\
    .head(max_candidates)\
    .index.values

    return popular_items

## 3.1. Fit

In [7]:
# first, we define how many candidates we want to get
MAX_CANDIDATES = 20
ITEM_COLUMN = 'item_id'
USER_COLUMN = 'user_id'

In [8]:
# then, we extract top 20 movies by aggregating movies and averaging rating column across all users
base_recommendations = compute_popularity(interactions, ITEM_COLUMN, MAX_CANDIDATES)
base_recommendations

array(['95588577945262351643987306457854212185',
       '101693639666891461420614362676113822951',
       '268867123658507796037972027312924025943',
       '316369578318160560517812220790535630342',
       '60426019840232385179488698169503208671',
       '191180173927754570603598163968691356636',
       '199976022065644129694791276458615697311',
       '1854017877261602621243941119775670879',
       '243944327735751792067441062316392777002',
       '23223659722259697606848103371319884224',
       '209694451399118209703987396195677911233',
       '223391820131290559319659201996644117551',
       '61118984235327609842558983124465174189',
       '103954496700216569216246542168969772444',
       '284779626044522946752291083811020083301',
       '74398489580630128190128657232740994345',
       '187865096833877580148648513620446719019',
       '59756841062034782769557521102575647456',
       '17513903966991871569120606022168857610',
       '273142829746466974559179499534975979937'], dtype=ob

Thus, we got 20 items with highest amount of purchases

Now, as we discussed earlier, in movies recommendations there is no need to recommend the same film which user has already watched. Let's implement it as well

In [9]:
# we get all interacted items for each user and save it in dictionary {'userId': [items list]}
known_items = interactions[interactions['event_type'] == 'purchase'].groupby(USER_COLUMN)[ITEM_COLUMN].apply(list).to_dict()
len(known_items)

3907

In [14]:
# let's check it for one userId = 1
# known_items['100035225509401999812828901832806148332']

In [15]:
def compute_popularity(df: pd.DataFrame, item_id: str, initial_candidates: int):
    """
    calculates amount of purchases to define popular items
    """
    popular_items = interactions[interactions['event_type'] == 'purchase'].groupby(item_id)\
    .agg({'quantity': sum})\
    .sort_values(['quantity'], ascending = False)\
    .head(initial_candidates)\
    .index.values

    return popular_items

In [28]:
def popular_without_purchases(userid: str, initial_candidates: list,
                              final_candidates: int, known_items: dict):
    if userid not in known_items.keys():
        ppular_items_corrected = base_recommendations
    else:
        ppular_items_corrected = [item for item in base_recommendations if item not in known_items[userid]]
    return ppular_items_corrected[:final_candidates]

In [17]:
base_recommendations = compute_popularity(interactions, ITEM_COLUMN, 1000)

In [18]:
users.head()

Unnamed: 0,user_id
0,173259092511508332961805537433278356613
1,175531225445750135224629001292094625386
2,175456934034865273087601583309920541917
3,136129747179209875179444036984951419826
4,175191682770652346432502214192078716096


In [19]:
recs = list(islice(cycle([base_recommendations]), len(users['user_id'])))
users['rekkos'] = recs
users.head()

Unnamed: 0,user_id,rekkos
0,173259092511508332961805537433278356613,"[95588577945262351643987306457854212185, 10169..."
1,175531225445750135224629001292094625386,"[95588577945262351643987306457854212185, 10169..."
2,175456934034865273087601583309920541917,"[95588577945262351643987306457854212185, 10169..."
3,136129747179209875179444036984951419826,"[95588577945262351643987306457854212185, 10169..."
4,175191682770652346432502214192078716096,"[95588577945262351643987306457854212185, 10169..."


In [20]:
# lets add artifical binary group to check BaselineRecommender
group = [np.random.random_integers(2) for x in range(len(users))]
users['group'] = group

In [21]:
users.groupby('group')['user_id'].count()

group
1    55359
2    55401
Name: user_id, dtype: int64

In [22]:
data = pd.merge(interactions, users, how = 'left', on = USER_COLUMN)
data.head()
# data.groupby('group').apply(lambda x: popular_without_purchases(x['user_id'], base_recommendations, 5, known_items), axis = 1)

Unnamed: 0,platform,utc_event_time,utc_event_date,user_id,event_type,price,quantity,item_id,main_category,sub_category,...,month,day,hour,lag_event_timestamp,is_first_event,timestamp_first_event,session_id,session_duration,rekkos,group
0,Android,2023-07-31 12:00:05+00:00,2023-07-31,173259092511508332961805537433278356613,view_item,163800.0,1.0,260726654727418165900362987409768274857,144086449872839020031157148875692321280,173321378319243508149201199923244637487,...,7,31,15,2023-07-31 05:21:49,1,2023-07-31 15:00:05,25769803996,0,"[95588577945262351643987306457854212185, 10169...",1
1,Android,2023-07-31 16:50:47+00:00,2023-07-31,175531225445750135224629001292094625386,view_item,329000.0,1.0,62874459636212199856820592742465687422,154366424108017296963249370561478064618,60270833747106288434434595877071461453,...,7,31,19,,1,2023-07-31 19:50:47,5694,0,"[95588577945262351643987306457854212185, 10169...",1
2,Site,2023-07-31 18:51:29+00:00,2023-07-31,175456934034865273087601583309920541917,view_item,94700.0,1.0,301113677287200808879250957235197355461,13798607401255592813561468416979820649,266864194786171476250208354353832955366,...,7,31,21,,1,2023-07-31 21:51:29,3703,0,"[95588577945262351643987306457854212185, 10169...",2
3,Site,2023-07-31 06:35:50+00:00,2023-07-31,136129747179209875179444036984951419826,view_item,30500.0,1.0,146938954500183309632143980050512715097,45117301157169820192891250731273658842,138272492718411940153002045605569451150,...,7,31,9,,1,2023-07-31 09:35:50,25769805241,0,"[95588577945262351643987306457854212185, 10169...",2
4,Android,2023-07-31 10:19:04+00:00,2023-07-31,175191682770652346432502214192078716096,view_item,45500.0,1.0,303613093764450740807978528504906103484,226036533030553532728399561714766831,200033172283332864904098097188143308177,...,7,31,13,,1,2023-07-31 13:19:04,8589940079,0,"[95588577945262351643987306457854212185, 10169...",2


Now we have all necessary components: base recommendations without groups with possibility to filter already watched items

Also, if we want to get recommendations based on some user groups we can easily do the same with groupby() method and same approach

In [23]:
# data = pd.merge(interactions_filtered, users, how='left', on = USER_COLUMN)
group_recommendations = data.groupby('group').apply(compute_popularity, ITEM_COLUMN, MAX_CANDIDATES)
group_recommendations.head()

group
1    [95588577945262351643987306457854212185, 10169...
2    [95588577945262351643987306457854212185, 10169...
dtype: object

In [24]:
group_recommendations.reset_index()

Unnamed: 0,group,0
0,1,"[95588577945262351643987306457854212185, 10169..."
1,2,"[95588577945262351643987306457854212185, 10169..."


In the output we have two rows with a list of film ids for each binary group 

Next, we have to implement recommned() method which will use 

## 3. 2. Recommend

In [25]:
# if we do not have groups, then it means we give the same recommendations for all users i.e. base_recommendations
# recs = list(islice(cycle([base_recommendations]), len(users['userId'])))
# users['rekkos'] = recs
users.head()

Unnamed: 0,user_id,rekkos,group
0,173259092511508332961805537433278356613,"[95588577945262351643987306457854212185, 10169...",1
1,175531225445750135224629001292094625386,"[95588577945262351643987306457854212185, 10169...",1
2,175456934034865273087601583309920541917,"[95588577945262351643987306457854212185, 10169...",2
3,136129747179209875179444036984951419826,"[95588577945262351643987306457854212185, 10169...",2
4,175191682770652346432502214192078716096,"[95588577945262351643987306457854212185, 10169...",2


In [26]:
# and let's have an example with groups we created earlier
group_recommendations = group_recommendations.reset_index()
group_rekkos = pd.merge(users, group_recommendations, how ='left', on = 'group')
group_rekkos.rename(columns = {0: 'grouped_rekkos'}, inplace = True)
group_rekkos.head()

Unnamed: 0,user_id,rekkos,group,grouped_rekkos
0,173259092511508332961805537433278356613,"[95588577945262351643987306457854212185, 10169...",1,"[95588577945262351643987306457854212185, 10169..."
1,175531225445750135224629001292094625386,"[95588577945262351643987306457854212185, 10169...",1,"[95588577945262351643987306457854212185, 10169..."
2,175456934034865273087601583309920541917,"[95588577945262351643987306457854212185, 10169...",2,"[95588577945262351643987306457854212185, 10169..."
3,136129747179209875179444036984951419826,"[95588577945262351643987306457854212185, 10169...",2,"[95588577945262351643987306457854212185, 10169..."
4,175191682770652346432502214192078716096,"[95588577945262351643987306457854212185, 10169...",2,"[95588577945262351643987306457854212185, 10169..."


We got our groupwise recommendations from 3.1. part and just joined them by group of users are assigned to

In [29]:
group_rekkos['person_rekkos'] = group_rekkos.apply(lambda x: popular_without_purchases(x['user_id'], x['grouped_rekkos'], 5, known_items) , axis = 1)

In [30]:
group_rekkos[['user_id', 'group', 'grouped_rekkos', 'person_rekkos']].head()

Unnamed: 0,user_id,group,grouped_rekkos,person_rekkos
0,173259092511508332961805537433278356613,1,"[95588577945262351643987306457854212185, 10169...","[95588577945262351643987306457854212185, 10169..."
1,175531225445750135224629001292094625386,1,"[95588577945262351643987306457854212185, 10169...","[95588577945262351643987306457854212185, 10169..."
2,175456934034865273087601583309920541917,2,"[95588577945262351643987306457854212185, 10169...","[95588577945262351643987306457854212185, 10169..."
3,136129747179209875179444036984951419826,2,"[95588577945262351643987306457854212185, 10169...","[95588577945262351643987306457854212185, 10169..."
4,175191682770652346432502214192078716096,2,"[95588577945262351643987306457854212185, 10169...","[95588577945262351643987306457854212185, 10169..."


## 3.3. Wrap everything into pretty functions

### 3.3.1 Fit part

In [23]:
def fit(
    data: pd.DataFrame,
    item_col: str, groups: list = None,
    max_candidates: int = 20
    ):
    """
    function runs all pipeline to generate recommendations based on given group
    :data: dataframe of interactions
    :item_col: item column name
    :groups: optional, list of groups column names to get recommendations
    :max_candidates: number of recommendations to return
    """
    
    if groups is not None:
        recommendations = data.groupby(groups).apply(compute_popularity, item_col, max_candidates)
    else:
        recommendations = compute_popularity(data, item_col, max_candidates)

    return recommendations

In [24]:
# check base
fit(data, item_col=ITEM_COLUMN)

array(['74727', '128846', '702', '127728', '65216', '43267', '8675',
       '80717', '86817', '8699', '872', '27724', '26791', '876', '64278',
       '301', '59392', '3021', '3112', '1933'], dtype=object)

In [25]:
# check group-wise
fit(data, item_col=ITEM_COLUMN, groups=['group'])

group
1    [850, 40226, 86817, 2984, 26326, 64499, 5765, ...
2    [702, 667, 91690, 3966, 2287, 1939, 1859, 4928...
dtype: object

### 3.3.2 Recommend part

In [26]:
def recommend(
    users: pd.DataFrame,
    recommendations: pd.DataFrame,
    groups: list = None,
    K: int = 10):
    """
    recommends items for a given list of users
    :users: series / list of users to recommend
    :recommendations: output of fit() function
    :groups: optional, list of groups column names to get recommendations
    :K: number of items to recommend (not always we want to show dozens of items instantly)
    """
    if groups is not None:
        output = pd.merge(users, recommendations.reset_index(), how = 'left', on = 'group')

    else:
        output = users.copy(deep = True)
        recs = list(islice(cycle([recommendations]), len(users['userId'])))
        output['rekkos'] = recs

    return output


In [27]:
# check
recs = fit(data, item_col=ITEM_COLUMN)
check_recs = recommend(users[['userId', 'group']], recs)
check_recs.head()

Unnamed: 0,userId,group,rekkos
0,1,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
1,2,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
2,3,2,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
3,4,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."
4,5,1,"[74727, 128846, 702, 127728, 65216, 43267, 867..."


In [28]:
# check group-wise
recs = fit(data, item_col=ITEM_COLUMN, groups = ['group'])
check_recs = recommend(users[['userId', 'group']], recs, ['group'])
check_recs.head()

Unnamed: 0,userId,group,0
0,1,1,"[850, 40226, 86817, 2984, 26326, 64499, 5765, ..."
1,2,2,"[702, 667, 91690, 3966, 2287, 1939, 1859, 4928..."
2,3,2,"[702, 667, 91690, 3966, 2287, 1939, 1859, 4928..."
3,4,1,"[850, 40226, 86817, 2984, 26326, 64499, 5765, ..."
4,5,1,"[850, 40226, 86817, 2984, 26326, 64499, 5765, ..."


Congrats! Your first basic recommender system is ready!!