In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Only three columns are used downstream, so let the parser skip the others
# instead of materialising every column for up to 50 million rows and then
# discarding most of them.
transactions = pd.read_csv(
    'data/purchase/transactions.csv',
    usecols=['id', 'dept', 'category'],
    nrows=50000000,  # same row cap as before, written as the integer read_csv documents
)
# usecols keeps the file's column order; re-select to pin the order explicitly.
transactions = transactions[['id', 'dept', 'category']]

In [3]:
# Per-customer view of the transactions: one group per distinct `id`.
id_groups = transactions.groupby(by='id')

# Per-item view: an "item" here is a distinct (dept, category) pair.
item_groups = transactions.groupby(by=['dept', 'category'])

# The group keys enumerate every (dept, category) pair present in the data.
items = item_groups.groups.keys()

In [4]:
def binarize(df):
    """Build a 0/1 indicator Series over ``items`` for the rows of ``df``.

    Starts from an all-zero Series indexed by the module-level ``items`` (the
    keys of the ``(dept, category)`` groupby, kept as plain tuples rather than
    expanded into a MultiIndex) and sets the entry keyed by each row of ``df``
    to 1.

    NOTE(review): every row tuple produced by ``itertuples`` is used directly
    as the key, so this presumably expects ``df`` to carry only the ``dept``
    and ``category`` columns. Whether the grouping column ``id`` is also
    present depends on how ``groupby.apply`` passes the group in the installed
    pandas version -- confirm before relying on the result.

    Returns:
        pandas.Series: the indicator vector described above.
    """
    vals = pd.Series(0, index = pd.Index(items, tupleize_cols = False))
    # index=False leaves the row label out, so only column values form the key.
    for row in df.itertuples(index = False):
        vals[row] = 1
    return vals
# One indicator vector per customer id (see binarize).
df = id_groups.apply(binarize)
# Seed the split like every other stochastic step in this notebook (KMeans and
# torch both use 0); otherwise train/test membership changes on every run and
# the downstream clustering, models and AUC figures cannot be reproduced.
train, test = train_test_split(df.values, train_size = 8000, test_size = 2000, random_state = 0)

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
# Cluster the training rows at several granularities. Each purchase_<k> holds
# the cluster id assigned to every training row when KMeans is run with k
# clusters; the seed is fixed so the assignments are repeatable.
purchase_2, purchase_10, purchase_20, purchase_50, purchase_100 = [
    KMeans(n_clusters, random_state = 0).fit_predict(train)
    for n_clusters in (2, 10, 20, 50, 100)
]

In [10]:
def train_model(features, labels, epochs):
    """Train a fully connected classifier mapping feature rows to integer labels.

    The network is six linear layers (1024-1024-512-512-512-output) with ReLU
    between them and a log-softmax on the output. It is trained with Adam
    (default hyper-parameters) and negative log-likelihood loss, one sample per
    optimisation step. The torch RNG is seeded with 0 before the layers are
    created, so weight initialisation is repeatable.

    Args:
        features: 2-D array-like of shape (n_samples, n_features); converted to
            float32.
        labels: 1-D array-like of non-negative integer class ids, one per row
            of ``features``.
        epochs: number of full passes over the data.

    Returns:
        The trained ``nn.Module``, left on the module-level ``device``. Calling
        it yields log-probabilities over ``max(labels) + 1`` classes.

    Raises:
        ValueError: if ``features`` has no rows.
    """
    if features.shape[0] == 0:
        raise ValueError("train_model requires at least one training sample")

    input_dim = features.shape[1]
    # Size the output layer by the largest label id rather than by the number
    # of distinct labels: NLLLoss needs every target to be < output_dim, and
    # the ids are not guaranteed to be contiguous (for example when a
    # clustering leaves one of its clusters empty).
    output_dim = int(np.max(labels)) + 1

    # make model
    torch.manual_seed(0)
    class Net(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(Net, self).__init__()
            # an affine operation: y = Wx + b
            self.fc1 = nn.Linear(input_dim, 1024)
            self.fc2 = nn.Linear(1024, 1024)
            self.fc3 = nn.Linear(1024, 512)
            self.fc4 = nn.Linear(512, 512)
            self.fc5 = nn.Linear(512, 512)
            self.fc6 = nn.Linear(512, output_dim)
            self.softmax = nn.LogSoftmax(dim=-1)

        def forward(self, x):
            # Five ReLU-activated hidden layers, then log-probabilities.
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.relu(self.fc3(x))
            x = F.relu(self.fc4(x))
            x = F.relu(self.fc5(x))
            x = self.softmax(self.fc6(x))
            return x

    net = Net(input_dim, output_dim)
    net.to(device)

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters())

    # Convert and transfer the whole data set once, instead of rebuilding the
    # feature tensor every epoch and moving each row to the device separately.
    all_inputs = torch.as_tensor(features, dtype=torch.float32).to(device)
    all_targets = torch.as_tensor(labels, dtype=torch.long).to(device)

    # train model
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        processed = 0
        for processed, (inputs, label) in enumerate(zip(all_inputs, all_targets), 1):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize; NLLLoss expects a (batch, classes)
            # input and a (batch,) target, hence the reshapes for one sample.
            outputs = net(inputs)
            loss = criterion(outputs.reshape(1, -1), label.reshape(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, processed, running_loss / features.shape[0]))

    print('Finished Training')

    return net

In [11]:
# Fit one classifier per clustering granularity, 15 epochs each, in order of
# increasing cluster count.
net_2, net_10, net_20, net_50, net_100 = [
    train_model(train, cluster_labels, 15)
    for cluster_labels in (purchase_2, purchase_10, purchase_20, purchase_50, purchase_100)
]

[1,  8000] loss: 0.457
[2,  8000] loss: 0.236
[3,  8000] loss: 0.169
[4,  8000] loss: 0.141
[5,  8000] loss: 0.122
[6,  8000] loss: 0.116
[7,  8000] loss: 0.158
[8,  8000] loss: 0.070
[9,  8000] loss: 0.048
[10,  8000] loss: 0.147
[11,  8000] loss: 0.040
[12,  8000] loss: 0.041
[13,  8000] loss: 0.045
[14,  8000] loss: 0.061
[15,  8000] loss: 0.042
Finished Training
[1,  8000] loss: 1.498
[2,  8000] loss: 1.004
[3,  8000] loss: 0.710
[4,  8000] loss: 0.508
[5,  8000] loss: 0.443
[6,  8000] loss: 0.345
[7,  8000] loss: 0.288
[8,  8000] loss: 0.281
[9,  8000] loss: 0.242
[10,  8000] loss: 0.224
[11,  8000] loss: 0.197
[12,  8000] loss: 0.177
[13,  8000] loss: 0.207
[14,  8000] loss: 0.144
[15,  8000] loss: 0.155
Finished Training
[1,  8000] loss: 2.226
[2,  8000] loss: 1.947
[3,  8000] loss: 1.869
[4,  8000] loss: 1.798
[5,  8000] loss: 1.714
[6,  8000] loss: 1.645
[7,  8000] loss: 1.584
[8,  8000] loss: 1.537
[9,  8000] loss: 1.475
[10,  8000] loss: 1.483
[11,  8000] loss: 1.443
[12,  8

In [12]:
def auc(model, train, test, sample_size):
    """ROC AUC of "max predicted probability" as a train-vs-test discriminator.

    Draws ``sample_size`` rows from ``train`` and from ``test`` (via
    ``np.random.choice`` with its defaults, i.e. with replacement), scores each
    row by the largest class probability the model assigns to it, and measures
    how well that score separates train rows (label 1) from test rows
    (label 0).

    Args:
        model: a ``torch.nn.Module`` whose output is per-class
            log-probabilities, as produced by ``train_model``.
        train: 2-D array of rows the model was trained on.
        test: 2-D array of held-out rows with the same number of columns.
        sample_size: number of rows drawn from each of ``train`` and ``test``.

    Returns:
        The ROC AUC as a float; 0.5 means the score does not separate the two.
    """
    train_sample = train[np.random.choice(train.shape[0], size = sample_size)]
    test_sample = test[np.random.choice(test.shape[0], size = sample_size)]

    # The model may live on the GPU (train_model moves it to `device`), so the
    # inputs must be sent to wherever its parameters are; feeding CPU tensors
    # to a CUDA model raises a device-mismatch RuntimeError.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    def max_confidence(sample):
        # Highest class probability per row, returned on the CPU.
        batch = torch.as_tensor(sample, dtype=torch.float32).to(model_device)
        with torch.no_grad():  # inference only: do not build an autograd graph
            probabilities = torch.exp(model(batch))
        return torch.max(probabilities, dim=1)[0].cpu()

    train_confidence = max_confidence(train_sample)
    test_confidence = max_confidence(test_sample)

    # Hand plain NumPy arrays to scikit-learn rather than torch tensors.
    actual = np.concatenate([np.ones(sample_size), np.zeros(sample_size)])
    estimated = torch.cat([train_confidence, test_confidence]).numpy()
    return roc_auc_score(actual, estimated)

In [13]:
# Score every trained classifier with 1000 sampled rows per side, in the same
# order as before so the NumPy RNG is consumed identically.
auc_2, auc_10, auc_20, auc_50, auc_100 = [
    auc(classifier, train, test, 1000)
    for classifier in (net_2, net_10, net_20, net_50, net_100)
]

In [14]:
# One AUC per line, ordered by increasing number of clusters.
print(auc_2, auc_10, auc_20, auc_50, auc_100, sep='\n')

0.5497479999999999
0.6209025
0.5340335
0.5563750000000001
0.5000855


In [None]:
def hamming(s1, s2)