In [1]:
from typing import NamedTuple, List
from collections import Counter

def read_basket_data(file):
    """Read baskets from a comma-separated text file, one basket per line.

    Every product id read from the file is shifted down by one, and only
    baskets holding more than one product are kept.

    Blank lines and empty fields (for example from a trailing comma) are
    skipped instead of crashing ``int()``.

    Args:
        file: Path of the file to read; passed straight to ``open``.

    Returns:
        A list of baskets, each a list of shifted integer product ids.

    Raises:
        ValueError: If a non-empty field cannot be parsed as an integer.
    """
    dataset = []
    with open(file) as f:
        # Iterate over the file lazily rather than materialising every line.
        for line in f:
            products = [int(p) - 1 for p in line.split(',') if p.strip()]
            if len(products) > 1:
                dataset.append(products)
    return dataset

def build_vocab(dataset: List[List[int]]):
    """Count how often each product occurs across all baskets.

    Returns a pair ``(vocab, counter)``: ``vocab`` is the list of the
    counter's keys (the distinct products) and ``counter`` maps each product
    to its number of occurrences.
    """
    product_counts = Counter(product for basket in dataset for product in basket)
    return list(product_counts), product_counts

In [33]:
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Tuple

def to_size(data: List[int], size: int):
    """Randomly resample ``data`` to exactly ``size`` elements.

    - More than ``size`` items: a random subset drawn without replacement.
    - Exactly ``size`` items: the same items in shuffled order, so nothing is
      dropped or duplicated.
    - Fewer than ``size`` items: items are drawn with replacement, so some
      repeat.

    Uses NumPy's global random state; seed it for reproducible output.

    Returns:
        A NumPy array of length ``size``.
    """
    # Only fall back to sampling with replacement when there are too few
    # items. The previous `>` comparison also resampled inputs of exactly
    # `size` items with replacement, needlessly losing/duplicating products.
    replace = len(data) < size
    return np.random.choice(data, size=size, replace=replace)

class BasketData(NamedTuple):
    """Immutable bundle holding a prepared basket dataset and its statistics.

    Fields:
        dataset: ``(inputs, labels)`` arrays from ``build_input_and_labels``.
        vocab: the distinct products found in the baskets.
        counter: occurrence count per product.
        vocab_size: ``len(vocab)``.
        max_basket_length: number of products in the longest basket.
    """
    dataset: Tuple[np.ndarray, np.ndarray]
    vocab: List[int]
    counter: Counter
    vocab_size: int
    max_basket_length: int

    @staticmethod
    def build(file: str):
        """Load baskets from ``file`` and assemble a ``BasketData``.

        Progress is reported with ``print`` after each step.
        """
        dataset = read_basket_data(file)
        print(f"Read {len(dataset)} baskets from {file}")

        vocab, counter = build_vocab(dataset)
        print(f"Number of distinct products {len(vocab)}")

        max_basket_length = max(map(len, dataset))
        print(f"Max basket size {max_basket_length}")

        arrays = BasketData.build_input_and_labels(dataset, max_basket_length)

        print(f"Done building dataset")
        return BasketData(
            dataset=arrays,
            vocab=vocab,
            counter=counter,
            vocab_size=len(vocab),
            max_basket_length=max_basket_length,
        )

    @staticmethod
    def build_input_and_labels(baskets: List[List[int]], max_length: int) -> Tuple[np.ndarray, np.ndarray]:
        """Turn each basket into a fixed-length input and a single label.

        The last product of a basket becomes its label; the remaining products
        are resampled to ``max_length`` entries with ``to_size``.
        """
        # One (label, resampled input) pair per basket, in basket order.
        examples = [(basket[-1], to_size(basket[:-1], max_length)) for basket in baskets]
        inputs = np.array([resampled for _, resampled in examples])
        labels = np.array([label for label, _ in examples])
        return inputs, labels

In [34]:
from pathlib import Path

# Seed NumPy's global RNG before building: to_size resamples baskets with
# np.random.choice, so without a seed the inputs differ on every run even
# though the train/test split below uses a fixed random_state.
np.random.seed(42)

data_path = Path("data/")

basket_data = BasketData.build(data_path / "1_100_100_100_apparel_regs.csv")

Read 8102 baskets from data/1_100_100_100_apparel_regs.csv
Number of distinct products 100
Max basket size 21
Done building dataset


In [35]:
from keras.layers import *
import keras.backend as K
from keras.models import Model

In [99]:
def BasketCNN(max_sequence_length, vocab_size, embedding_dim=100, num_filters=16, dropout_rate=0.25,
              kernel_sizes=(3, 5, 7)):
    """
    Build a multi-kernel 1D CNN that predicts a product from a basket.

    Product ids are embedded, convolved in parallel with one Conv1D branch per
    kernel size, max-pooled over the whole convolved sequence, concatenated,
    passed through dropout and mapped to a softmax over the vocabulary.

    Input:
        - max_sequence_length: length of the fixed-size input baskets
        - vocab_size: number of distinct products; sizes both the embedding
          table and the softmax output
        - embedding_dim: dimensionality of the product embeddings
        - num_filters: number of filters in each convolution branch
        - dropout_rate: dropout rate for flattened pooled outputs
        - kernel_sizes: convolution widths, one branch per entry
    Returns:
        - model: Model class created with specified inputs (not compiled)
    Raises:
        - ValueError: if kernel_sizes is empty, contains a size below 1, or
          contains a size larger than max_sequence_length
    """
    kernel_sizes = list(kernel_sizes)
    if not kernel_sizes:
        raise ValueError("kernel_sizes must contain at least one kernel size")
    if min(kernel_sizes) < 1:
        raise ValueError(f"kernel sizes must be positive, got {kernel_sizes}")
    # With 'valid' padding the convolved length is max_sequence_length - kernel + 1,
    # which is also used as the pool size below, so it has to stay positive.
    if max(kernel_sizes) > max_sequence_length:
        raise ValueError(
            f"kernel size {max(kernel_sizes)} exceeds max_sequence_length {max_sequence_length}"
        )

    x_input = Input(shape=(max_sequence_length,), dtype='int32')

    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                embeddings_initializer='uniform')

    x = embedding_layer(x_input)

    pooled = []

    for kernel in kernel_sizes:

        conv = Conv1D(filters=num_filters,
                      kernel_size=kernel,
                      padding='valid',
                      strides=1,
                      kernel_initializer='he_uniform',
                      activation='relu')(x)

        # Pool over the full convolved length: one max value per filter.
        pool = MaxPooling1D(pool_size=max_sequence_length - kernel + 1)(conv)

        pooled.append(pool)

    # Concatenate needs at least two inputs; a single branch is used directly.
    merged = pooled[0] if len(pooled) == 1 else Concatenate(axis=-1)(pooled)

    flatten = Flatten()(merged)

    drop = Dropout(rate=dropout_rate)(flatten)

    x_output = Dense(vocab_size, kernel_initializer='he_uniform', activation='softmax')(drop)

    return Model(inputs=x_input, outputs=x_output)

In [100]:
# Build the CNN with the BasketCNN factory defined in this notebook; the input
# length and output size come from the loaded dataset.
model = BasketCNN(basket_data.max_basket_length, basket_data.vocab_size, dropout_rate=0.25)

In [101]:
from functools import partial
from keras.metrics import *

# Labels are integer product ids, hence the sparse loss and sparse metrics.
tracked_metrics = ['sparse_categorical_accuracy', sparse_top_k_categorical_accuracy]
model.compile(optimizer='Adam',
              loss='sparse_categorical_crossentropy',
              metrics=tracked_metrics)

In [102]:
# Hold out 20% of the examples with a fixed seed and validate on them while training.
X, y = basket_data.dataset
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(x=X_train,
          y=y_train,
          validation_data=(X_test, y_test),
          epochs=10,
          batch_size=128,
          verbose=1)

Train on 6481 samples, validate on 1621 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13ece96d8>

In [103]:
# Most frequent product across all baskets, together with its occurrence count.
top_products = basket_data.counter.most_common(1)
most_common_product, max_count = top_products[0]

In [104]:
# Baseline that predicts the most common product for every test example.
# Build an actual array shaped like y_test; previously a zero array was
# allocated and then immediately discarded by rebinding the name to a scalar.
naive_labels = np.full_like(y_test, most_common_product)

In [105]:
# Accuracy of the naive baseline: fraction of test labels it matches.
np.mean(naive_labels == y_test)

0.23442319555829735