In [1]:
import cudf
import numpy as np
import torch
from torch.autograd import Variable

from batchloader import TensorBatchDataset, BatchDataLoader
from preprocess import PreprocessDF

In [2]:
# load data
filename = 'train_dataset.csv'
train_gdf = cudf.io.csv.read_csv(filename)

filename = 'test_dataset.csv'
test_gdf = cudf.io.csv.read_csv(filename)
# gdf = cudf.io.csv.read_csv(filename, index_col='Unnamed: 0')

# filename = 'dataset.parquet'
# num_rows, num_row_groups, names = cudf.io.parquet.read_parquet_metadata(filename)
# gdf = [cudf.read_parquet(fname, row_group=i) for i in range(row_groups)]
# gdf = cudf.concat(gdf)

print(train_gdf.shape)
print(train_gdf)
print(train_gdf.columns)

(8000, 103)
      feature_cat_0  feature_cat_1  feature_cont_0  feature_cont_1  \
0               1.0            1.0       -1.059951        1.462076   
1               1.0            0.0       -0.895100       -0.028954   
2               1.0            1.0        2.064241        0.415540   
3               0.0            0.0       -0.121514       -0.314254   
4               1.0            0.0        1.369914        0.235226   
5               0.0            0.0       -1.264873        0.010237   
6               0.0            0.0       -1.405209       -0.252663   
7               0.0            0.0        0.752061        0.054361   
8               1.0            1.0       -1.559383        1.016467   
9               1.0            1.0        0.141752       -0.735109   
10              0.0            1.0        1.396514        0.011593   
11              0.0            0.0        0.596260        0.142328   
12              1.0            1.0        1.279563       -0.730809   
13      

In [3]:
# dataset settings
n_samples, n_cat_features, n_cont_features, n_classes = 1000, 2, 100, 2

# create column names and instantiate preprocessor
cat_names = ['feature_cat_{}'.format(i) for i in range(n_cat_features)]
cont_names = ['feature_cont_{}'.format(i) for i in range(n_cont_features)]
label_name = 'target'
preprocessor = PreprocessDF(cat_names, cont_names, label_name, fill_strategy='median', to_cpu=False)

In [4]:
# preprocess data
(X_cat_train, X_cont_train), y_train = preprocessor.preproc_dataframe(train_gdf, mode='train')
(X_cat_test, X_cont_test), y_test = preprocessor.preproc_dataframe(train_gdf, mode='test')

  return cpp_dlpack.to_dlpack(gdf_cols)


In [5]:
X_cat_train = X_cat_train.type(torch.FloatTensor)
X_cont_train = X_cont_train.type(torch.FloatTensor)
y_train = y_train.type(torch.LongTensor)
X_cat_test = X_cat_test.type(torch.FloatTensor)
X_cont_test = X_cont_test.type(torch.FloatTensor)
y_test = y_test.type(torch.LongTensor)

In [6]:
print(X_cat_train.dtype, X_cont_train.dtype, y_train.dtype)
print(X_cat_test.dtype, X_cont_test.dtype, y_test.dtype)

torch.float32 torch.float32 torch.int64
torch.float32 torch.float32 torch.int64


In [7]:
# create batch datasets
batch_size = 100
train_dataset = TensorBatchDataset([X_cat_train, X_cont_train, y_train], 
                                   batch_size=batch_size, pin_memory=False)
test_dataset = TensorBatchDataset([X_cat_test, X_cont_test, y_test], 
                                  batch_size=batch_size, pin_memory=False)

In [8]:
# model training settings
epochs = int(5)
n_inputs = int(X_cat_train.size(1) + X_cont_train.size(1))
n_outputs = int(2)
learning_rate = 0.001

print(epochs, n_inputs, n_outputs)

5 102 2


In [9]:
# create batch data loaders
train_data_loader = BatchDataLoader(train_dataset, shuffle=False,
                                    pin_memory=False, drop_last=False, device='cuda')
test_data_loader = BatchDataLoader(test_dataset, shuffle=False,
                                   pin_memory=False, drop_last=False, device='cuda')

In [10]:
# define model
class LogisticRegression(torch.nn.Module):
    def __init__(self, n_inputs, n_outputs):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, x):
        outputs = self.linear(x)
        return outputs

In [11]:
# instantiate model, loss, and optimizer
model = LogisticRegression(n_inputs, n_outputs)
model = model.cuda()
print(model)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

LogisticRegression(
  (linear): Linear(in_features=102, out_features=2, bias=True)
)


In [12]:
# train the model by feeding in batches of data
batch_number = 0
for epoch in range(int(epochs)):
    for i, batch in enumerate(train_data_loader):
        # unpack batch
        (X_cat_batch, X_cont_batch), y_batch = batch
        X_batch = torch.cat((X_cat_batch, X_cont_batch), 1)
        
        # create variables from inputs and outputs
        X_batch = Variable(X_batch)
        y_batch = Variable(y_batch)
        
        # zero out gradients and use model to create outputs
        optimizer.zero_grad()
        outputs = model(X_batch)
        
        # calculate loss and backpropogate
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        # every 100 batches, evaluate on test dataset
        batch_number += 1
        if batch_number % 100 == 0:
            correct = 0
            total = 0
            total_loss = 0
            for batch in test_data_loader:
                # unpack batch
                (X_cat_batch, X_cont_batch), y_batch = batch
                X_batch = torch.cat((X_cat_batch, X_cont_batch), 1)

                # create variables from inputs and outputs
                X_batch = Variable(X_batch)
                Y_batch = Variable(y_batch)
                
                # use model to create outputs
                outputs = model(X_batch)
                
                # calculate loss
                test_loss = criterion(outputs, y_batch)
                total_loss += test_loss
                
                # calculate accuracy
                _, predicted = torch.max(outputs.data, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum()
            accuracy = 100 * correct / total
            print("Epoch: {}. Batch Number: {}. Loss: {}. Accuracy: {}.".format(epoch, batch_number, total_loss.item() / total, accuracy))

Epoch: 1. Batch Number: 100. Loss: 0.007068986892700195. Accuracy: 55.
Epoch: 2. Batch Number: 200. Loss: 0.006630552291870117. Accuracy: 60.
Epoch: 3. Batch Number: 300. Loss: 0.0062426824569702145. Accuracy: 65.
Epoch: 4. Batch Number: 400. Loss: 0.005894451141357422. Accuracy: 69.
