In [None]:
import cudf
import numpy as np
import torch
from torch.autograd import Variable

from batchloader import TensorBatchDataset, BatchDataLoader
from preprocess import PreprocessDF

In [None]:
# Read the training and test CSV files into cuDF dataframes.
train_gdf = cudf.io.csv.read_csv('train_dataset.csv')
filename = 'test_dataset.csv'  # `filename` ends up naming the test file
test_gdf = cudf.io.csv.read_csv(filename)

# Alternative loaders, kept for reference only (never executed):
#
#   CSV, passing index_col='Unnamed: 0':
#     gdf = cudf.io.csv.read_csv(filename, index_col='Unnamed: 0')
#
#   Parquet, read one row group at a time and concatenated:
#     filename = 'dataset.parquet'
#     num_rows, num_row_groups, names = cudf.io.parquet.read_parquet_metadata(filename)
#     gdf = [cudf.read_parquet(fname, row_group=i) for i in range(row_groups)]
#     gdf = cudf.concat(gdf)
#   NOTE(review): `fname` and `row_groups` are not defined in that snippet;
#   presumably `filename` and `num_row_groups` were intended -- confirm before reviving it.

# Show the training frame's shape, contents, and column names (one per line).
print(train_gdf.shape, train_gdf, train_gdf.columns, sep='\n')

In [None]:
# Dataset dimensions: number of samples, categorical features,
# continuous features, and target classes.
n_samples = 1000
n_cat_features = 2
n_cont_features = 100
n_classes = 2

# Column names are generated as 'feature_cat_<i>' / 'feature_cont_<i>';
# the label column is called 'target'.
cat_names = [f'feature_cat_{idx}' for idx in range(n_cat_features)]
cont_names = [f'feature_cont_{idx}' for idx in range(n_cont_features)]
label_name = 'target'

# Build the preprocessor over those columns (median fill strategy, to_cpu disabled).
preprocessor = PreprocessDF(
    cat_names,
    cont_names,
    label_name,
    fill_strategy='median',
    to_cpu=False,
)

In [None]:
# preprocess data
# Each call returns ((categorical, continuous), labels) for one split.
# The test split must come from `test_gdf`: passing `train_gdf` here would make
# every later "test" metric an evaluation on the training data.
# NOTE(review): mode='train' presumably fits the fill/encoding statistics that
# mode='test' then reuses -- confirm against PreprocessDF.
(X_cat_train, X_cont_train), y_train = preprocessor.preproc_dataframe(train_gdf, mode='train')
(X_cat_test, X_cont_test), y_test = preprocessor.preproc_dataframe(test_gdf, mode='test')

In [None]:
# Cast every feature tensor to torch.FloatTensor and every label tensor to
# torch.LongTensor (the integer class-index type used with the loss below).
X_cat_train, X_cont_train, X_cat_test, X_cont_test = (
    features.type(torch.FloatTensor)
    for features in (X_cat_train, X_cont_train, X_cat_test, X_cont_test)
)
y_train, y_test = (labels.type(torch.LongTensor) for labels in (y_train, y_test))

In [None]:
# Sanity check: print the dtypes of each split, train on the first line, test on the second.
print(*(tensor.dtype for tensor in (X_cat_train, X_cont_train, y_train)))
print(*(tensor.dtype for tensor in (X_cat_test, X_cont_test, y_test)))

In [None]:
# Wrap each split's [categorical, continuous, label] tensors in a batch dataset.
# Both splits use the same batch size and leave pin_memory off.
batch_size = 100
train_dataset, test_dataset = (
    TensorBatchDataset(list(split_tensors), batch_size=batch_size, pin_memory=False)
    for split_tensors in (
        (X_cat_train, X_cont_train, y_train),
        (X_cat_test, X_cont_test, y_test),
    )
)

In [None]:
# Training hyperparameters.
epochs = 5
learning_rate = 0.001

# Input width is the number of categorical columns plus the number of
# continuous columns (dimension 1 of each feature tensor); two output logits.
n_inputs = X_cat_train.size(1) + X_cont_train.size(1)
n_outputs = 2

print(epochs, n_inputs, n_outputs)

In [None]:
# Build one loader per split with identical options: no shuffling, no pinned
# memory, keep the final partial batch, and target the 'cuda' device.
train_data_loader, test_data_loader = (
    BatchDataLoader(
        split_dataset,
        shuffle=False,
        pin_memory=False,
        drop_last=False,
        device='cuda',
    )
    for split_dataset in (train_dataset, test_dataset)
)

In [None]:
# define model
class LogisticRegression(torch.nn.Module):
    """Single linear layer mapping `n_inputs` features to `n_outputs` raw scores.

    No softmax/sigmoid is applied here; `forward` returns the linear layer's
    output unchanged.
    """

    def __init__(self, n_inputs, n_outputs):
        """Create the `linear` layer with `n_inputs` inputs and `n_outputs` outputs."""
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, x):
        """Return the linear layer applied to `x`."""
        return self.linear(x)

In [None]:
# instantiate model, loss, and optimizer
# Use the sizes computed in the training-settings cell (`n_inputs`, `n_outputs`);
# the previous `input_dim` / `output_dim` names are never defined in this
# notebook, so the cell raised NameError on a fresh kernel.
model = LogisticRegression(n_inputs, n_outputs)
model = model.cuda()
print(model)

# Cross-entropy over the model's raw scores, optimized with plain SGD.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
# train the model by feeding in batches of data
def evaluate(model, data_loader, criterion):
    """Return ``(mean_loss, accuracy_percent)`` of `model` over `data_loader`.

    Each batch must unpack as ``(X_cat, X_cont), y``, the same layout the
    training loop consumes. Both results are plain Python floats; both are
    ``nan`` when the loader yields no samples.

    The model is switched to eval mode for the pass and restored to its
    previous mode afterwards.
    """
    was_training = model.training
    model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    # no_grad: evaluation needs no gradients, and accumulating loss tensors with
    # autograd enabled would keep every batch's graph alive.
    with torch.no_grad():
        for (X_cat, X_cont), y in data_loader:
            scores = model(torch.cat((X_cat, X_cont), 1))
            n_batch = y.size(0)

            # Assumes `criterion` averages over the batch (the default for
            # CrossEntropyLoss), so weight by batch size to get a true
            # per-sample mean even when the last batch is smaller.
            loss_sum += criterion(scores, y).item() * n_batch

            # .item() turns the count into a Python int so the accuracy below
            # is an ordinary float, not an integer tensor division.
            _, predicted = torch.max(scores, 1)
            n_correct += (predicted == y).sum().item()
            n_seen += n_batch

    model.train(was_training)

    if n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_seen, 100.0 * n_correct / n_seen


eval_every = 100  # evaluate on the test dataset after this many training batches
batch_number = 0
for epoch in range(int(epochs)):
    for (X_cat_batch, X_cont_batch), y_batch in train_data_loader:
        # Concatenate categorical and continuous features along dim 1.
        # (torch.autograd.Variable wrappers are unnecessary since tensors and
        # Variables were merged in PyTorch 0.4.)
        X_batch = torch.cat((X_cat_batch, X_cont_batch), 1)

        # zero out gradients and use model to create outputs
        optimizer.zero_grad()
        outputs = model(X_batch)

        # calculate loss and backpropagate
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # Periodically evaluate on the test dataset. The helper keeps its own
        # local names, so the training batch variables are not overwritten.
        batch_number += 1
        if batch_number % eval_every == 0:
            test_loss, accuracy = evaluate(model, test_data_loader, criterion)
            print("Epoch: {}. Batch Number: {}. Loss: {}. Accuracy: {}.".format(epoch, batch_number, test_loss, accuracy))