In [36]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# Read a capped number of rows (4e5 here; 5e7 is the alternative cap noted by the
# author) and keep only the customer id plus the two columns that identify an item.
transactions = pd.read_csv(
    'data/purchase/transactions.csv',
    nrows=4e5,
)[['id', 'dept', 'category']]

In [4]:
# Group the transactions two ways: by customer id, and by (dept, category) pair.
id_groups = transactions.groupby('id')
item_groups = transactions.groupby(by=['dept', 'category'])

# Keys view over the distinct (dept, category) pairs; `binarize` uses it as the
# index of every per-customer indicator vector.
items = item_groups.groups.keys()

In [40]:
def binarize(df):
    """Build a 0/1 purchase-indicator vector for one group of transactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions of a single group. Only the ``dept`` and ``category``
        columns are read; any other column (for example the grouping column
        ``id``, which ``groupby(...).apply`` may or may not pass through
        depending on the pandas version) is ignored.

    Returns
    -------
    pandas.Series
        int64 series indexed by every ``(dept, category)`` tuple in the
        module-level ``items``, holding 1 where that pair occurs in ``df``
        and 0 otherwise.
    """
    # Read the two item columns by name rather than using whole row tuples as
    # Series keys: a row tuple that also carries `id` never equals an index
    # label, and tuple keys on a non-MultiIndex Series are ambiguous.
    purchased = set(zip(df['dept'], df['category']))

    # Materialise the keys view once so index and values share the same order.
    catalogue = list(items)
    return pd.Series(
        [1 if item in purchased else 0 for item in catalogue],
        index=pd.Index(catalogue, tupleize_cols=False),
        dtype='int64',
    )
# One row per customer id, one 0/1 column per (dept, category) item.
df = id_groups.apply(binarize)

# 80/20 split of the indicator matrix. The seed is fixed (matching the
# random_state=0 / manual_seed(0) used by KMeans and the network) so the split,
# and every number derived from it, is reproducible on Restart & Run All.
train, test = train_test_split(df.values, train_size = 0.8, random_state = 0)

In [41]:
# Cluster the training rows at several granularities; each result is the array of
# cluster ids returned by fit_predict and serves as a label set for train_model.
purchase_2, purchase_10, purchase_20, purchase_50, purchase_100 = (
    KMeans(n_clusters, random_state = 0).fit_predict(train)
    for n_clusters in (2, 10, 20, 50, 100)
)

In [48]:
def train_model(features, labels, epochs):
    """Train a six-layer fully connected classifier, one sample per step.

    Parameters
    ----------
    features : array-like, 2-D
        One row per sample; converted to a float32 tensor.
    labels : array-like, 1-D
        One non-negative integer class id per row of ``features``.
    epochs : int
        Number of full passes over the data.

    Returns
    -------
    torch.nn.Module
        The trained network. Its forward pass returns log-probabilities
        (``LogSoftmax`` over the last dimension).

    Raises
    ------
    ValueError
        If ``features`` is not 2-D or has no rows, if ``labels`` does not have
        one entry per row, or if any label is negative.
    """
    # Convert once up front instead of rebuilding the feature tensor every epoch.
    feature_tensor = torch.as_tensor(np.asarray(features), dtype=torch.float32)
    label_tensor = torch.as_tensor(np.asarray(labels)).long().reshape(-1)

    if feature_tensor.dim() != 2:
        raise ValueError('features must be a 2-D array')
    n_samples, input_dim = feature_tensor.shape
    if n_samples == 0:
        raise ValueError('features must contain at least one sample')
    if label_tensor.shape[0] != n_samples:
        raise ValueError('features and labels must have the same length')
    if int(label_tensor.min()) < 0:
        raise ValueError('labels must be non-negative class ids')

    # Size the output layer from the largest class id rather than the number of
    # distinct ids: the two agree for ids 0..k-1, but with gaps in the ids the
    # distinct count is too small and NLLLoss rejects the target.
    output_dim = int(label_tensor.max()) + 1

    # make model
    torch.manual_seed(0)  # fixed seed so weight initialisation is repeatable

    class Net(nn.Module):
        """Five ReLU hidden layers followed by a log-softmax output layer."""

        def __init__(self, input_dim, output_dim):
            super(Net, self).__init__()
            # affine layers: y = Wx + b
            self.fc1 = nn.Linear(input_dim, 1024)
            self.fc2 = nn.Linear(1024, 1024)
            self.fc3 = nn.Linear(1024, 512)
            self.fc4 = nn.Linear(512, 512)
            self.fc5 = nn.Linear(512, 512)
            self.fc6 = nn.Linear(512, output_dim)
            self.softmax = nn.LogSoftmax(dim=-1)

        def forward(self, x):
            # ReLU after every hidden layer; the last layer yields log-probabilities.
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.relu(self.fc3(x))
            x = F.relu(self.fc4(x))
            x = F.relu(self.fc5(x))
            x = self.softmax(self.fc6(x))
            return x

    net = Net(input_dim, output_dim)

    # NLLLoss pairs with the LogSoftmax output of the network.
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters())

    # train model
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, label in zip(feature_tensor, label_tensor):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize (batch of exactly one sample)
            outputs = net(inputs)
            loss = criterion(outputs.reshape(1, -1), label.reshape(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the sample count and the mean per-sample loss for this epoch.
        print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, n_samples, running_loss / n_samples))

    print('Finished Training')

    return net

In [49]:
# Fit the classifier on the 2-cluster labels for 15 passes over the training rows.
net = train_model(features=train, labels=purchase_2, epochs=15)

[1,   321] loss: 0.658
[2,   321] loss: 0.620
[3,   321] loss: 0.607
[4,   321] loss: 0.339
[5,   321] loss: 0.412
[6,   321] loss: 0.320
[7,   321] loss: 0.170
[8,   321] loss: 0.250
[9,   321] loss: 0.151
[10,   321] loss: 0.302
[11,   321] loss: 0.042
[12,   321] loss: 0.034
[13,   321] loss: 0.011
[14,   321] loss: 0.000
[15,   321] loss: 0.000
Finished Training


In [126]:
def auc(model, train, test, sample_size, random_state=None):
    """ROC AUC of separating train rows from test rows by model confidence.

    ``sample_size`` rows are drawn with replacement from each of ``train`` and
    ``test``. Each row is scored with the model's largest class probability
    (``exp`` of its log-probability output, maximised over dim 1). Train rows
    are labelled 1 and test rows 0, and the scores are passed to
    ``roc_auc_score``.

    Parameters
    ----------
    model : callable
        Maps a float tensor of rows to per-class log-probabilities.
    train, test : numpy.ndarray, 2-D
        Feature rows the model was / was not trained on.
    sample_size : int
        Number of rows drawn from each split; must be positive.
    random_state : int or None, optional
        Seed for the row sampling. ``None`` (the default) uses NumPy's global
        random state.

    Returns
    -------
    float
        The value returned by ``roc_auc_score``.

    Raises
    ------
    ValueError
        If ``sample_size`` is not positive or either split has no rows.
    """
    if sample_size <= 0:
        raise ValueError('sample_size must be positive')
    if train.shape[0] == 0 or test.shape[0] == 0:
        raise ValueError('train and test must each contain at least one row')

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _confidence(rows):
        # Draw row indices with replacement, then score the sampled rows.
        picked = rows[rng.choice(rows.shape[0], size = sample_size)]
        # No gradients are needed for scoring, so skip building the autograd graph.
        with torch.no_grad():
            log_probs = model(torch.as_tensor(picked, dtype=torch.float32))
        return torch.exp(log_probs).max(dim=1).values.cpu().numpy()

    train_confidence = _confidence(train)
    test_confidence = _confidence(test)

    # Hand scikit-learn plain NumPy arrays: 1 marks a train row, 0 a test row.
    actual = np.concatenate([np.ones(sample_size), np.zeros(sample_size)])
    estimated = np.concatenate([train_confidence, test_confidence])
    return roc_auc_score(actual, estimated)

In [127]:
# Train-vs-test separability of the classifier's confidence, 100 rows per split.
auc(model=net, train=train, test=test, sample_size=100)

0.6643