# Collaborative Filtering

In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'torch'

## Data Info

In [None]:
# Load only the first 100 rows of the transactions file (nrows=100), so every
# later cell that uses `df` works on this small sample.
df = pd.read_csv('../data/00_raw/transactions_train.csv', nrows=100)

In [None]:
# Load the full article table from articles.csv.
articles = pd.read_csv('../data/00_raw/articles.csv')

In [None]:
# Keep only the article id and its colour-group code/name columns.
articles = articles[['article_id', 'colour_group_code', 'colour_group_name']]

In [None]:
df.head()

In [None]:
# Count the transaction rows for every (customer, article) pair and expose the
# count as a regular column named 'total_bought'.
data = (
    df.groupby(['customer_id', 'article_id'])['customer_id']
    .count()
    .reset_index(name='total_bought')
)

In [None]:
# Attach the colour-group columns to each (customer, article) row; a left join
# keeps every row of `data` even when the article has no match.
data = pd.merge(data, articles, how='left', on='article_id')

In [None]:
# Map every customer_id to an integer label and store it alongside the
# original id; `encoder` stays fitted for later use.
encoder = LabelEncoder()
data['encoded_customer_id'] = encoder.fit_transform(data['customer_id'])

In [None]:
# Features: encoded customer, article id and colour-group code.
# Target: how many times the customer bought the article.
X = data[['encoded_customer_id', 'article_id', 'colour_group_code']]
y = data['total_bought']

# Hold out 20% of the rows as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
    """Wrap the training and validation splits in PyTorch DataLoaders.

    Features are converted to int64 tensors and targets to float32 tensors.

    Args:
        X_train: Training features (anything ``np.array`` accepts).
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.
        batch_size: Number of samples per minibatch for both loaders.

    Returns:
        A ``(trainloader, valloader)`` tuple. The training loader shuffles
        its samples; the validation loader keeps the original order.
    """
    def _as_dataset(features, targets):
        # Go through NumPy so DataFrames, Series and arrays are all accepted.
        feature_tensor = torch.from_numpy(np.array(features)).long()
        target_tensor = torch.from_numpy(np.array(targets)).float()
        return TensorDataset(feature_tensor, target_tensor)

    trainloader = DataLoader(
        _as_dataset(X_train, y_train), batch_size=batch_size, shuffle=True
    )
    valloader = DataLoader(
        _as_dataset(X_val, y_val), batch_size=batch_size, shuffle=False
    )
    return trainloader, valloader

# Build the minibatch loaders for the train/validation split, 64 samples each.
batchsize = 64
trainloader, valloader = prep_dataloaders(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    batch_size=batchsize,
)

## Collaborative Filtering

### Creating collab filter df

In [None]:
# One-hot encode 'product_code': one indicator column per distinct code,
# one row per row of `df`.
# NOTE(review): `df` is read from transactions_train.csv; confirm that file
# actually has a 'product_code' column (it may only exist in articles.csv),
# otherwise this lookup raises KeyError.
ohe_article = pd.get_dummies(df['product_code'])

In [None]:
len(df.customer_id.unique())

In [None]:
# Names of the one-hot columns, used later to select them for aggregation.
article_names = list(ohe_article.columns)

In [None]:
# Put customer_id side by side with the one-hot columns (column-wise concat,
# aligned on the shared row index).
cf_df = pd.concat([df['customer_id'], ohe_article], axis=1)

In [None]:
# Collapse to one row per customer by summing the indicator columns, then turn
# customer_id back into a regular column.
cf_df = cf_df.groupby(['customer_id'])[article_names].sum().reset_index()

In [None]:
len(cf_df.columns)

In [None]:
cf_df.head()

### Test train split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Here `y` is not a prediction target: it holds the customer id of each row,
# while `X` holds the remaining (summed one-hot) columns for that customer.
y = cf_df['customer_id']
X = cf_df.drop(columns='customer_id')

In [None]:
# Hold out 5% of the customers (rows) as the test group; random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity: entry (i, j) compares row i of X_test with
# row j of X_train, so each row ranks the training customers for one
# held-out customer.
# NOTE(review): dense_output=False is documented as only affecting sparse
# inputs; confirm X_test / X_train are dense DataFrames here.
rankings = cosine_similarity(X_test, X_train, dense_output=False)

In [None]:
# Discard the shuffled index left by train_test_split so that, in both id
# Series, label i is also position i.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# For every held-out customer, recommend up to `n_recs` articles taken from the
# purchases of their `n_neighbours` most similar training customers.
n_neighbours = 20
n_recs = 12

recommendations = {}
for i, pred in enumerate(rankings):
    # Similarity of held-out customer i to every training customer.
    sims = np.asarray(pred).ravel()
    # Most similar neighbours first. A full (stable) sort orders the
    # neighbours and also copes with fewer than `n_neighbours` training
    # customers, where np.argpartition(..., -20) would raise.
    neighbour_idxs = np.argsort(-sims, kind='stable')[:n_neighbours]

    recs = []
    seen = set()
    for idx in neighbour_idxs:
        # Columns of `rankings` are positions in the training split, so the
        # customer id is looked up by position.
        cid = y_train.iloc[idx]
        # This neighbour's articles, most frequently bought first.
        bought = (
            df[df['customer_id'] == cid]
            .groupby(['article_id'])['customer_id']
            .count()
            .sort_values(ascending=False)
            .index.values.astype('int')
        )
        # De-duplicate while preserving order: going through set() would
        # scramble the ranking before the list is truncated.
        for article in bought:
            if article not in seen:
                seen.add(article)
                recs.append(article)
        if len(recs) >= n_recs:
            break

    # Slicing never raises, so no try/except is needed for short lists.
    cur_customer = y_test.iloc[i]
    recommendations[cur_customer] = recs[:n_recs]

In [None]:
# Score the collaborative-filtering recommendations: for each held-out
# customer, the fraction of the distinct articles they bought that also appear
# in their recommendation list.
# NOTE: despite the names, this is an order-insensitive hit fraction, not a
# rank-aware average precision.
mAP_list = []
for customer in recommendations:
    top_recs = recommendations[customer]
    # Distinct articles this customer bought.
    actual_bought = list(
        df.loc[df['customer_id'] == customer, 'article_id'].unique().astype('int')
    )
    # Hits are articles both bought AND recommended, i.e. the intersection;
    # a set difference would count the misses and invert the score.
    correct = len(set(actual_bought) & set(top_recs))
    ap = correct / len(actual_bought)
    mAP_list.append(ap)

In [None]:
np.array(mAP_list).mean()

In [None]:
# Popularity baseline: recommend the same 12 most-purchased articles to every
# held-out customer and score them with the same hit fraction.
# The ranking does not depend on the customer, so it is computed once.
pop_recs = df.groupby(['article_id'])['customer_id'].count().sort_values(ascending=False).index.values[:12]
popular_articles = set(pop_recs)

pop_mAP_list = []
for customer in recommendations:
    # Distinct articles this customer bought.
    actual_bought = list(
        df.loc[df['customer_id'] == customer, 'article_id'].unique().astype('int')
    )
    # Hits are articles both bought AND recommended, i.e. the intersection;
    # a set difference would count the misses and invert the score.
    correct = len(set(actual_bought) & popular_articles)
    ap = correct / len(actual_bought)
    pop_mAP_list.append(ap)

In [None]:
np.array(pop_mAP_list).mean()