In [1]:
import pickle
from typing import List, Tuple, Union

import numpy as np
import torch
import tqdm
from scipy.sparse import csr_matrix
from torch import nn, optim
from torch.nn import Linear, ReLU
from torch.utils.data import Dataset


In [2]:
class DNN(nn.Module):
    def __init__(self, layer_sizes: List[int]):
        """
        Fully connected network built from consecutive pairs of ``layer_sizes``.

        Each adjacent pair ``(layer_sizes[i], layer_sizes[i + 1])`` contributes one Linear layer
        immediately followed by a ReLU, so a ReLU is also applied after the final Linear layer.
        :param layer_sizes: widths of the network; the first entry is the input dimension and the
            last entry is the output dimension.
        """
        super().__init__()

        stack = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            stack.append(Linear(in_features=fan_in, out_features=fan_out))
            stack.append(ReLU())

        # Registered as a ModuleList so the Linear parameters are visible to optimizers.
        self.layers = nn.ModuleList(stack)

    def forward(self, x: torch.Tensor):
        """
        Feed ``x`` through every registered layer in order and return the final activation.
        """
        out = x
        for module in self.layers:
            out = module(out)
        return out

In [10]:
# Load the four pickled objects used by the rest of the notebook from the current working
# directory; judging by the names these are the train/test features and targets.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so these files must
# come from a trusted source.
with open('x_train_transformed.pkl', 'rb') as f:
    X_train = pickle.load(f)

with open('x_test_transformed.pkl', 'rb') as f:
    X_test = pickle.load(f)

with open('y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)

with open('y_test.pkl', 'rb') as f:
    Y_test = pickle.load(f)

In [11]:
# Show the dimensions of the training matrix (rows are the samples that train_model batches over).
X_train.shape

(1186028, 755851)

In [None]:
%%time

# NOTE(review): this import sits in the middle of the notebook; consider moving it to the
# import cell at the top so a fresh kernel exposes all dependencies up front.
from sklearn.decomposition import TruncatedSVD

# Fit a truncated SVD on the training matrix only, then project both the training and the test
# matrix with that same fitted transform.
# NOTE(review): n_components is half the column count of X_train. With the shape recorded
# earlier in this notebook, (1186028, 755851), that is 377925 components, so the dense result of
# fit_transform would hold roughly 4.5e11 values — presumably far more than fits in memory.
# Confirm the intended number of components before running this cell.
tsvd = TruncatedSVD(n_components=X_train.shape[1] // 2)
X_train_transformed = tsvd.fit_transform(X_train)
X_test_transformed = tsvd.transform(X_test)

In [1]:
# NOTE(review): the recorded execution count of this cell is 1 and its output is a NameError, so it
# appears to have been run on a fresh kernel before X_train was loaded; re-run in order or remove it.
X_train

NameError: name 'X_train' is not defined

In [6]:
# Inspect the explained-variance ratio reported by the fitted SVD, one entry per component.
# NOTE(review): the recorded output has only two entries, which does not match the
# X_train.shape[1] // 2 components configured where tsvd is created; the output looks stale.
tsvd.explained_variance_ratio_

array([0.01034079, 0.08227368])

In [6]:
# Sanity check: indexing the training matrix with an integer index array selects those rows and,
# per the recorded repr, returns a sparse matrix — the same row selection train_model performs
# for every mini-batch.
indices = np.array([1, 2, 3])
X_train[indices]

<3x755851 sparse matrix of type '<class 'numpy.float64'>'
	with 268 stored elements in Compressed Sparse Row format>

In [5]:
class SparseDataset(Dataset):
    """
    Dataset that pairs a feature container with a target container.

    Either container may be a numpy array or a scipy ``csr_matrix``. The concrete type of each
    container is recorded once at construction time and used to decide how a single item is read.
    """

    def __init__(self, data: Union[np.ndarray, csr_matrix],
                 targets: Union[np.ndarray, csr_matrix]):
        self.data, self.targets = data, targets
        self.datatype, self.targettype = type(data), type(targets)

    @staticmethod
    def _fetch(container, container_type, index):
        # Containers whose type is exactly csr_matrix are read with getrow(); everything else
        # goes through plain indexing.
        if container_type == csr_matrix:
            return container.getrow(index)
        return container[index]

    def __getitem__(self, index: int):
        """
        Return the ``(features, target)`` pair stored at position ``index``.
        """
        features = self._fetch(self.data, self.datatype, index)
        target = self._fetch(self.targets, self.targettype, index)
        return features, target

    def __len__(self):
        """
        Number of items, taken from the first dimension of the feature container.
        """
        n_items = self.data.shape[0]
        return n_items

In [None]:
def train_model(model, train: Tuple[csr_matrix, np.ndarray], validation: Tuple[csr_matrix, np.ndarray],
                batch_size: int, epochs: int):
    """
    Train ``model`` with Adam on mini-batches, minimising the mean squared error.

    Every epoch draws a fresh random permutation of the sample indices and walks through it in
    slices of ``batch_size``. Only the rows of the current batch are converted to a dense tensor.

    :param model: module whose ``parameters()`` are optimised in place.
    :param train: ``(X_train, Y_train)``; ``X_train`` must support row selection with an integer
        index array followed by ``.toarray()``, ``Y_train`` must support the same row selection.
    :param validation: currently unused, because the per-epoch validation step is disabled.
    :param batch_size: number of samples per mini-batch; must be positive.
    :param epochs: number of passes over the training data.
    :return: list of per-epoch validation losses; always empty while validation is disabled.
    :raises ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')

    optimizer = optim.Adam(model.parameters())
    # Built once; the loss module is stateless, so there is no need to recreate it per batch.
    loss_fn = nn.MSELoss()

    X_train, Y_train = train
    n_samples = X_train.shape[0]
    # Integer ceiling division. np.ceil returns a float, which range() rejects with a TypeError.
    n_batches = (n_samples + batch_size - 1) // batch_size
    display = tqdm.trange(epochs)
    validation_losses = []
    for _ in display:
        permutation = np.random.permutation(n_samples)

        # train in this epoch
        for batch_n in range(n_batches):
            start = batch_n * batch_size
            # Slicing clamps at the end of the permutation, so the last batch may be shorter.
            batch_indices = permutation[start:start + batch_size]
            # toarray() yields a plain ndarray (todense() would give np.matrix).
            batch_x = torch.as_tensor(X_train[batch_indices].toarray(), dtype=torch.float32)
            batch_y = torch.as_tensor(Y_train[batch_indices], dtype=torch.float32)

            predictions = model(batch_x)
            # NOTE(review): MSELoss broadcasts when the two shapes differ (e.g. (B, 1) vs (B,));
            # confirm that Y_train has the same trailing shape as the model output.
            loss = loss_fn(predictions, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation is disabled for now; re-enable once evaluate_model exists.
        # validation_loss = evaluate_model(model=model, dataset=validation)
        # validation_losses.append(validation_loss)
        # display.set_description(f'Validation loss: {validation_loss:.5}')

    return validation_losses


# def evaluate_model(model, dataset: Tuple[csr_matrix, np.ndarray]) -> float:
#     X, Y = dataset
#     X_tens, Y_tens = torch.as_tensor(X.todense(), dtype=torch.float32), torch.as_tensor(Y, dtype=torch.float32)
#
#     with torch.no_grad():
#         raw_pred = model(X_tens)
#         predictions = np.array(torch.argmax(raw_pred, dim=1))
#
#     loss = torch.nn.MSELoss()(predictions, Y_tens)
#     return loss.item()