In [2]:
import math
from typing import List, Tuple, Union

import numpy as np
import torch
import tqdm
from scipy.sparse import csr_matrix
from torch import nn, optim
from torch.nn import Linear, ReLU
from torch.utils.data import DataLoader, Dataset


In [3]:
class DNN(nn.Module):
    def __init__(self, layer_sizes: List[int]):
        """
        The first layer size is the input dimension, and the last layer size is the output dimension.
        :param layer_sizes:
        """
        super(DNN, self).__init__()

        self.layers = nn.ModuleList()

        for i in range(0, len(layer_sizes) - 1):
            self.layers.append(Linear(in_features=layer_sizes[i],
                                      out_features=layer_sizes[i + 1]))
            self.layers.append(ReLU())

    def forward(self, x: torch.Tensor):
        for layer in self.layers:
            # print(f'Layer {layer} with input size {x.size()}')
            x = layer(x)
        return x

In [4]:
import pickle

with open('x_train_transformed.pkl', 'rb') as f:
    X_train = pickle.load(f)

with open('x_test_transformed.pkl', 'rb') as f:
    X_test = pickle.load(f)

with open('y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)

with open('y_test.pkl', 'rb') as f:
    Y_test = pickle.load(f)

In [5]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [6]:
X_train.getrow(0)

<1x755851 sparse matrix of type '<class 'numpy.float64'>'
	with 52 stored elements in Compressed Sparse Row format>

In [7]:
class SparseDataset(Dataset):
    """
    Custom Dataset class for scipy sparse matrix
    """

    def __init__(self, data: Union[np.ndarray, csr_matrix],
                 targets: Union[np.ndarray, csr_matrix]):
        # Transform data coo_matrix to csr_matrix for indexing
        # if type(data) == coo_matrix:
        #     self.data = data.tocsr()
        # else:
        self.data = data
        self.datatype = type(data)

        # Transform targets coo_matrix to csr_matrix for indexing
        # if type(targets) == coo_matrix:
        #     self.targets = targets.tocsr()
        # else:
        self.targets = targets
        self.targettype = type(targets)

    def __getitem__(self, index: int):
        # return self.data[index], self.targets[index]
        x = self.data.getrow(index) if self.datatype == csr_matrix else self.data[index]
        y = self.targets.getrow(index) if self.targettype == csr_matrix else self.targets[index]
        return x, y

    def __len__(self):
        return self.data.shape[0]

In [9]:
dataloader = DataLoader(
    SparseDataset(data=X_train, targets=Y_train),
    batch_size=128,
    shuffle=True
)

In [8]:
X_train.shape[1]

755851

In [12]:
def train_model(model, train: Tuple[csr_matrix, np.ndarray], validation: Tuple[csr_matrix, np.ndarray],
                batch_size: int, epochs: int):
    optimizer = optim.Adam(model.parameters())

    X_train, Y_train = train
    n_samples = X_train.shape[0]
    n_batches = math.ceil(n_samples * 1.0 / batch_size)
    display = tqdm.trange(epochs)
    validation_losses = []
    for _ in display:
        permutation = np.random.permutation(n_samples)

        # train in this epoch
        for batch_n in range(n_batches):
            batch_indices = permutation[batch_n * batch_size:
                                        min((batch_n + 1) * batch_size, len(permutation))]
            batch_x = torch.as_tensor(X_train[batch_indices].todense(), dtype=torch.float32)
            batch_y = torch.as_tensor(Y_train[batch_indices], dtype=torch.float32)

            predictions = model(batch_x)
            loss = nn.MSELoss()(predictions, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        validation_loss = evaluate_model(model=model, dataset=validation)
        validation_losses.append(validation_loss)
        display.set_description(f'Validation loss: {validation_loss:.5}')

        # stats, accuracy, loss_stat = evaluate_model(model=model, dataset=train)
        # train_stats.append((stats, accuracy))
        # train_losses.append(loss_stat)

    # return validation_stats, validation_losses, train_stats, train_losses
    return validation_losses


def evaluate_model(model, dataset: Tuple[csr_matrix, np.ndarray]) -> float:
    X, Y = dataset
    X_tens, Y_tens = torch.as_tensor(X.todense(), dtype=torch.float32), torch.as_tensor(Y, dtype=torch.float32)

    with torch.no_grad():
        raw_pred = model(X_tens)
        predictions = np.array(torch.argmax(raw_pred, dim=1))

    loss = torch.nn.MSELoss()(predictions, Y_tens)
    return loss.item()

In [None]:
model = DNN(layer_sizes=[256, 256, 1])
train_model(model=model, train=(X_train, Y_train    ), validation=(X_test, Y_test), batch_size=128, epochs=1)

  0%|          | 0/1 [00:00<?, ?it/s]

In [1]:
print('omg')

omg


In [8]:
n_epochs = 100
for i in range(n_epochs):
    print(f'i: {dataloader[i]}')

TypeError: 'DataLoader' object is not subscriptable

In [10]:
from torch.utils.data import DataLoader, TensorDataset

data = DataLoader(
    TensorDataset(torch.as_tensor(X_train.todense()), torch.as_tensor(Y_train)),
    batch_size=128,
    shuffle=True
)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [4]:
coo = X_train.tocoo()

In [5]:
% % time
values = X_train.data

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 11.9 µs


In [6]:
% % time
indices = np.vstack((coo.row, coo.col))

CPU times: user 215 ms, sys: 261 ms, total: 477 ms
Wall time: 557 ms


In [7]:
% % time
# i = torch.LongTensor(indices)
i = torch.as_tensor(indices, dtype=torch.int)

CPU times: user 242 µs, sys: 672 µs, total: 914 µs
Wall time: 932 µs


In [None]:
% % time
# v = torch.FloatTensor(values)
v = torch.as_tensor(values)

In [None]:
coo = X_train.tocoo()
values = X_train.data

indices = np.vstack((coo.row, coo.col))

i = torch.as_tensor(indices, dtype=torch.int)
v = torch.as_tensor(values)
shape = coo.shape

X_train_tens = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()


In [None]:
X_train.row

In [1]:
print('omg')

omg
