# Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Info

In [2]:
# Load the transaction log, capped at the first 500,000 rows of the file
# (presumably to keep memory manageable -- confirm before scaling up).
df = pd.read_csv(
    '../data/00_raw/transactions_train.csv',
    nrows=500000,
)

In [3]:
# Article metadata table, read in full.
articles = pd.read_csv(filepath_or_buffer='../data/00_raw/articles.csv')

In [4]:
# Keep only the join key and the coarser product grouping used downstream.
articles = articles.loc[:, ['article_id', 'product_code']]

In [5]:
# Attach product_code to every transaction (left join keeps all transactions).
# - Dropping any pre-existing product_code first makes this cell idempotent:
#   without it, re-running the cell yields product_code_x / product_code_y.
# - validate='many_to_one' makes pandas raise MergeError if article_id is
#   duplicated in `articles`, instead of silently multiplying transaction rows.
df = (
    df
    .drop(columns='product_code', errors='ignore')
    .merge(articles, how='left', on='article_id', validate='many_to_one')
)

In [6]:
# Preview the first five merged transactions.
df.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,663713
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,541518
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,505221
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,685687
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,685687


In [7]:
# (rows, columns) of the merged frame -- a quick check that the left join
# did not add rows beyond the 500,000 that were loaded.
df.shape

(500000, 6)

In [None]:
# df.info()

In [None]:
# df.describe()

## Collaborative Filtering

### Building the customer × product-code purchase matrix

In [8]:
# One indicator column per distinct product_code, one row per transaction.
# NOTE(review): with the default sparse=False this frame is dense
# (n_transactions x n_product_codes), so it dominates memory use here.
ohe_article = pd.get_dummies(data=df['product_code'])

In [9]:
# Number of distinct customers in the loaded sample
# (dropna=False so a missing id would still be counted, as with .unique()).
df['customer_id'].nunique(dropna=False)

119904

In [10]:
# Column labels of the indicator frame, i.e. the distinct product codes.
article_names = ohe_article.columns.tolist()

In [11]:
# Put customer_id next to the per-transaction indicators (aligned on the row index).
cf_df = pd.concat(
    [df['customer_id'], ohe_article],
    axis='columns',
)

In [12]:
# Collapse to one row per customer: summing the indicators gives how many
# times each customer bought each product_code.
cf_df = (
    cf_df
    .groupby(by='customer_id')[article_names]
    .sum()
    .reset_index()
)

In [13]:
# Column count: customer_id plus one column per product_code.
cf_df.shape[1]

13139

In [14]:
# Preview the first five rows of the customer x product_code count matrix.
cf_df.head(n=5)

Unnamed: 0,customer_id,108775,110065,111565,111586,111593,111609,114428,116379,118458,...,725253,725676,727676,727754,728111,728146,728162,729931,740234,740237
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train/test split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# X holds the per-customer purchase counts; y is just the customer_id, carried
# through the split so rows can be mapped back to customers afterwards.
X = cf_df.drop('customer_id', axis=1)
y = cf_df.loc[:, 'customer_id']

In [17]:
# Hold out 10% of customers; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=0,
)

In [18]:
# (customers, product codes) in the 90% training portion of the split.
X_train.shape

(107913, 13138)

In [19]:
# (customers, product codes) in the 10% held-out portion of the split.
X_test.shape

(11991, 13138)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Similarity of every held-out customer (rows) to every training customer (columns).
# NOTE(review): dense_output=False only yields a sparse result when both inputs
# are sparse; X_test / X_train look like dense DataFrames here, so expect a
# dense (n_test x n_train) float array -- confirm it fits in memory.
rankings = cosine_similarity(
    X=X_test,
    Y=X_train,
    dense_output=False,
)

In [22]:
# Re-label both id Series 0..n-1 so their labels match the row/column
# positions of `rankings`.
y_train, y_test = (
    ids.reset_index(drop=True) for ids in (y_train, y_test)
)

In [None]:
# Build recommendations for every held-out customer from the purchases of
# their most similar training customers.
#
# Result: `recommendations` maps customer_id -> list of up to N_RECS article ids.
N_NEIGHBORS = 20  # similar training customers consulted per held-out customer
N_RECS = 12       # recommendations kept per customer

# Each customer's distinct articles, most frequently bought first, computed in
# a single pass instead of rescanning `df` once per neighbour inside the loop.
purchase_counts = (
    df.dropna(subset=['customer_id', 'article_id'])
    .groupby(['customer_id', 'article_id'])
    .size()
    .reset_index(name='n_bought')
    .sort_values(['customer_id', 'n_bought'], ascending=[True, False])
)
articles_by_customer = (
    purchase_counts['article_id'].astype('int')
    .groupby(purchase_counts['customer_id'], sort=False)
    .apply(list)
    .to_dict()
)

recommendations = {}
for i, pred in enumerate(rankings):
    sims = np.asarray(pred).ravel()

    # Clamp k so a training set smaller than N_NEIGHBORS does not make
    # argpartition raise.
    k = min(N_NEIGHBORS, sims.size)
    if k:
        nearest = np.argpartition(sims, -k)[-k:]
        # argpartition returns the top-k in arbitrary order; put the most
        # similar neighbour first so their purchases are ranked first.
        nearest = nearest[np.argsort(sims[nearest])[::-1]]
    else:
        nearest = []

    recs = []
    for idx in nearest:
        # Positional lookup: column j of `rankings` is row j of the training
        # split, regardless of what index labels y_train currently carries.
        cid = y_train.iloc[idx]
        recs.extend(articles_by_customer.get(cid, []))

    # De-duplicate while preserving rank order (a plain set() would scramble
    # it and make the top-N cut arbitrary), then keep the top N_RECS.
    recs = list(dict.fromkeys(recs))[:N_RECS]

    cur_customer = y_test.iloc[i]
    recommendations[cur_customer] = recs

In [None]:
# Score the collaborative-filtering recommendations.
# For each customer: (# recommended articles the customer actually bought)
#                    / (# recommended articles).
# NOTE: this is precision@k per customer, not a rank-aware average precision;
# the `mAP_list` name is kept because a later cell reads it.
# NOTE(review): the "actual" purchases come from the same `df` the similarity
# features were built from -- consider a time-based hold-out.

# Purchased-article sets for the evaluated customers, built in one pass
# instead of filtering `df` once per customer.
eval_rows = df.loc[
    df['customer_id'].isin(list(recommendations)),
    ['customer_id', 'article_id'],
].dropna()
bought_by_customer = (
    eval_rows['article_id'].astype('int')
    .groupby(eval_rows['customer_id'])
    .apply(set)
    .to_dict()
)

mAP_list = []
for customer, top_recs in recommendations.items():
    if not top_recs:
        # No recommendations means no hits; also avoids dividing by zero.
        mAP_list.append(0.0)
        continue
    actual_bought = bought_by_customer.get(customer, set())
    # Hits are the intersection. The previous set difference counted articles
    # that were bought but NOT recommended, i.e. the misses.
    correct = len(actual_bought & set(top_recs))
    ap = correct / len(top_recs)
    mAP_list.append(ap)

In [None]:
# Average per-customer score for the collaborative-filtering recommendations.
np.mean(mAP_list)

In [None]:
# Popularity baseline: recommend the same 12 best-selling articles to every
# evaluated customer and score it the same way as the collaborative filter
# (hits / number of recommendations).

# The top sellers do not depend on the customer, so compute them once rather
# than re-running the groupby on every loop iteration.
pop_recs = (
    df.groupby(['article_id'])['customer_id']
    .count()
    .sort_values(ascending=False)
    .index.values[:12]
)
pop_rec_set = set(pop_recs)

# Purchased-article sets for the evaluated customers, built in one pass.
eval_rows = df.loc[
    df['customer_id'].isin(list(recommendations)),
    ['customer_id', 'article_id'],
].dropna()
bought_by_customer = (
    eval_rows['article_id'].astype('int')
    .groupby(eval_rows['customer_id'])
    .apply(set)
    .to_dict()
)

pop_mAP_list = []
for customer in recommendations:
    actual_bought = bought_by_customer.get(customer, set())
    # Hits are the intersection; the previous set difference counted the
    # articles bought but NOT recommended.
    correct = len(actual_bought & pop_rec_set)
    # Guard against an empty catalogue so the division cannot fail.
    ap = correct / len(pop_recs) if len(pop_recs) else 0.0
    pop_mAP_list.append(ap)

In [None]:
# Average per-customer score for the popularity baseline.
np.mean(pop_mAP_list)