# Collaborative Filtering

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import time

## Data Info

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
print('yes' if cuda_ready else 'no')


yes


In [None]:
# Mount Google Drive so the CSV files read below are reachable under drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read only the first million transaction rows.
TRANSACTIONS_CSV = 'drive/MyDrive/Colab Notebooks/transactions_train.csv'
df = pd.read_csv(TRANSACTIONS_CSV, nrows=1_000_000)

In [None]:
# Article metadata (product name, colour, ...) keyed by article_id.
ARTICLES_CSV = 'drive/MyDrive/Colab Notebooks/articles.csv'
articles = pd.read_csv(ARTICLES_CSV)

In [None]:
# Keep only the join key and the two descriptive columns used later on.
article_columns = ['article_id', 'prod_name', 'colour_group_name']
articles = articles.loc[:, article_columns]

In [None]:
# Attach product name and colour to every transaction row (left join keeps all transactions).
df = pd.merge(df, articles, on='article_id', how='left')

In [None]:
# create unbought data
df2 = pd.DataFrame()
df2['customer_id'] = df['customer_id'].sample(n=len(df))
df2['prod_name'] = df['prod_name'].sample(n=len(df))
df2['total_bought'] = 0

In [None]:
data = df.groupby(['customer_id', 'prod_name'])['customer_id'].count().reset_index(name='total_bought')

In [None]:
data = pd.concat([data, df2]).reset_index(drop=True)

In [None]:
data['total_bought'] = np.where(data['total_bought'] > 2, 2, data['total_bought'])

In [None]:
color_join = df.drop_duplicates(subset=['customer_id', 'prod_name'])
data = data.merge(color_join, how='left', on=['customer_id', 'prod_name'])

In [None]:
# Map every customer_id to a dense integer index (0 .. n_customers-1).
encoder = LabelEncoder()
data['encoded_customer_id'] = encoder.fit_transform(data['customer_id'])

In [None]:
# Map every product name to a dense integer index (0 .. n_products-1).
encoder = LabelEncoder()
data['encoded_prod_name'] = encoder.fit_transform(data['prod_name'])

In [None]:
# Map every colour group to a dense integer index.
# `encoder` keeps pointing at this colour encoder afterwards; the prediction
# cells further down pass it along to translate colour names.
encoder = LabelEncoder()
data['encoded_colour_group_name'] = encoder.fit_transform(data['colour_group_name'])

In [None]:
# Model inputs are the three encoded id columns; the target is the capped purchase count.
feature_columns = ['encoded_customer_id', 'encoded_prod_name', 'encoded_colour_group_name']
X = data[feature_columns]
y = data['total_bought']

# Hold out 5% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=0)

In [None]:
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
    """Wrap the train/validation splits in PyTorch DataLoaders.

    Features are converted to int64 tensors and targets to float32 tensors.
    The training loader reshuffles on every pass; the validation loader keeps
    the row order.

    Returns:
        (trainloader, valloader)
    """
    def _as_dataset(features, targets):
        # Array-likes -> (long features, float targets) TensorDataset.
        feature_tensor = torch.from_numpy(np.array(features)).long()
        target_tensor = torch.from_numpy(np.array(targets)).float()
        return TensorDataset(feature_tensor, target_tensor)

    train_ds = _as_dataset(X_train, y_train)
    val_ds = _as_dataset(X_val, y_val)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    return train_dl, val_dl

# Mini-batch size shared by the training and validation loaders.
batchsize = 64
trainloader, valloader = prep_dataloaders(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    batch_size=batchsize,
)

In [None]:
class NNHybridFiltering(nn.Module):
    """Embedding-based rating predictor with one hidden layer.

    Three index columns (user, item, genre) are each looked up in their own
    embedding table; the embeddings are concatenated, passed through a
    ReLU-activated linear layer and a single-output linear layer, and the
    result is squashed into ``rating_range`` with a scaled sigmoid.
    """

    def __init__(self, n_users, n_items, n_genres, embdim_users, embdim_items, embdim_genres, n_activations, rating_range):
        super().__init__()
        # One lookup table per categorical input.
        self.user_embeddings = nn.Embedding(n_users, embdim_users)
        self.item_embeddings = nn.Embedding(n_items, embdim_items)
        self.genre_embeddings = nn.Embedding(n_genres, embdim_genres)
        # The hidden layer consumes the concatenation of the three embeddings.
        concat_width = embdim_users + embdim_items + embdim_genres
        self.fc1 = nn.Linear(concat_width, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        # Two-element sequence [low, high] bounding the predictions.
        self.rating_range = rating_range

    def forward(self, X):
        """Return predictions of shape (batch, 1) for index rows ``X`` of shape (batch, 3)."""
        user_idx, item_idx, genre_idx = X[:, 0], X[:, 1], X[:, 2]
        features = torch.cat(
            (
                self.user_embeddings(user_idx),
                self.item_embeddings(item_idx),
                self.genre_embeddings(genre_idx),
            ),
            dim=1,
        )
        hidden = F.relu(self.fc1(features))
        logits = self.fc2(hidden)
        # Sigmoid maps to (0, 1); stretch and shift into [low, high].
        low, high = self.rating_range[0], self.rating_range[1]
        return torch.sigmoid(logits) * (high - low) + low


In [None]:
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    """Train ``model`` and evaluate it on the validation loader after every epoch.

    Args:
        model: ``nn.Module`` to optimise; it is moved to ``device`` first.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor averaged over the batch. The reported metric is the
            square root of the epoch-mean of this loss, i.e. an RMSE when the
            criterion is a mean-reduced MSE.
        optimizer: Optimiser holding the model parameters.
        dataloaders: Dict with ``'train'`` and ``'val'`` DataLoaders yielding
            ``(inputs, labels)`` batches.
        device: Device the batches and the model are sent to.
        num_epochs: Number of passes over the training data.
        scheduler: Optional learning-rate scheduler, stepped once per epoch
            after the training phase.

    Returns:
        Dict ``{'train': [...], 'val': [...]}`` with one metric value per epoch.
    """
    model = model.to(device) # Send model to GPU if available
    since = time.time()

    costpaths = {'train':[],'val':[]}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            # Sum of (batch-mean loss * batch size) == sum of per-sample losses
            running_loss = 0.0

            for (inputs,labels) in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the weight gradients
                optimizer.zero_grad()

                # Track gradients only during the training phase
                with torch.set_grad_enabled(is_train):
                    # Call the module itself (not .forward) so registered hooks run
                    outputs = model(inputs).view(-1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * labels.size(0)

            # Step along learning rate scheduler when in train
            if is_train and (scheduler is not None):
                scheduler.step()

            # Take the square root once, over the whole epoch. Averaging the
            # square roots of the individual batches would under-report the
            # error whenever the batch losses differ.
            epoch_loss = np.sqrt(running_loss / len(dataloaders[phase].dataset))
            costpaths[phase].append(epoch_loss)
            print('{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths


In [None]:
# Train the model
dataloaders = {'train':trainloader, 'val':valloader}

# Embedding table sizes: encoded ids start at 0, so each table needs max + 1 rows.
# Cast to plain Python ints so the layer sizes are not numpy scalars.
n_users = int(X.loc[:,'encoded_customer_id'].max()) + 1
n_items = int(X.loc[:,'encoded_prod_name'].max()) + 1
n_genres = int(X.loc[:,'encoded_colour_group_name'].max()) + 1
model = NNHybridFiltering(n_users,
                       n_items,
                       n_genres,
                       embdim_users=50, 
                       embdim_items=50, 
                       embdim_genres=25,
                       n_activations = 100,
                       rating_range=[0., 2.])  # targets are capped to 0..2
criterion = nn.MSELoss()
lr=0.001
n_epochs=10
wd=1e-3
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

# Use the GPU when there is one, but keep the notebook runnable on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)


Epoch 0/9
----------
train loss: 0.6253
val loss: 0.6259
Epoch 1/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 2/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 3/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 4/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 5/9
----------
train loss: 0.6249
val loss: 0.6260
Epoch 6/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 7/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 8/9
----------
train loss: 0.6249
val loss: 0.6259
Epoch 9/9
----------
train loss: 0.6249
val loss: 0.6259
Training complete in 30m 3s


In [None]:
def predict_rating(model, userId, movieId, genre, encoder, device):
    """Predict the rating for a single (user, item, genre) combination.

    Args:
        model: Trained module taking an int64 tensor of shape (1, 3).
        userId: Encoded (integer) user index.
        movieId: Encoded (integer) item index.
        genre: Raw genre label; converted to its index with ``encoder``.
        encoder: Fitted encoder exposing ``transform`` for the genre labels.
        device: Device on which the prediction is computed.

    Returns:
        The model output for that single row (left on ``device``).
    """
    # Encode genre
    genre_idx = encoder.transform(np.array(genre).reshape(-1))

    model = model.to(device)
    model.eval()

    # Build the int64 tensor directly. Going through torch.Tensor(...) creates a
    # float32 tensor first, which cannot represent integers above 2**24 exactly
    # and would silently change large ids.
    X = torch.tensor(
        [[int(userId), int(movieId), int(genre_idx[0])]],
        dtype=torch.long,
        device=device,
    )
    with torch.no_grad():
        pred = model(X)
    return pred

# Spot-check the model on one hand-picked user/item/colour combination.
rating = predict_rating(
    model,
    userId=133785,
    movieId=2156,
    genre='Green',
    encoder=encoder,
    device=device,
)
rating_value = rating.detach().cpu().item()
print('Predicted rating is {:.1f}'.format(rating_value))

Predicted rating is 0.5


In [None]:
test_customers = list(X_val.encoded_customer_id.unique())[:100]
test_prod = list(data.encoded_prod_name.unique())[:1000]
colors = list(data.colour_group_name.unique())

# Number of products recommended per customer.
N_RECS = 12

# Every candidate product is scored with the colour 'Black'.
# `encoder` must be the colour encoder here.
black_idx = int(encoder.transform(np.array('Black').reshape(-1))[0])
prod_ids = torch.tensor([int(p) for p in test_prod], dtype=torch.long, device=device)
colour_ids = torch.full_like(prod_ids, black_idx)

model.to(device)
model.eval()

recommendations = {}
with torch.no_grad():
  for customer in test_customers:
    # Score all candidate products for this customer in one forward pass.
    customer_ids = torch.full_like(prod_ids, int(customer))
    batch = torch.stack([customer_ids, prod_ids, colour_ids], dim=1)
    scores = model(batch).view(-1)
    # Keep the products with the HIGHEST predicted rating, best first.
    top_positions = torch.topk(scores, k=min(N_RECS, scores.numel())).indices.tolist()
    recommendations[customer] = [test_prod[i] for i in top_positions]

In [None]:
# Mean average precision over the recommendation lists (MAP@k, k = list length).
# Only rows with total_bought > 0 count as purchases; rows labelled 0 are the
# sampled "unbought" pairs and must not be treated as hits.
# NOTE(review): purchases are taken from all of `data`, which also contains the
# rows used for training -- confirm this is the intended evaluation set.
purchased_rows = data[data['total_bought'] > 0]
bought_by_customer = purchased_rows.groupby('encoded_customer_id')['encoded_prod_name'].apply(set)

mAP_list = []
for customer, top_recs in recommendations.items():
    actual_bought = bought_by_customer.get(customer, set())
    if not actual_bought or not top_recs:
        # Nothing to score against; skipping also avoids a division by zero.
        continue
    hits = 0
    precision_sum = 0.0
    for rank, prod in enumerate(top_recs, start=1):
        if prod in actual_bought:
            hits += 1
            precision_sum += hits / rank  # precision at this cut-off
    ap = precision_sum / min(len(actual_bought), len(top_recs))
    mAP_list.append(ap)
np.array(mAP_list).mean()

0.009964285714285714