In [None]:
from fastai.tabular.all import *
import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch
from autoencoder import TabularDataset, Autoencoder

In [None]:
# Load the training CSV into a DataFrame; the trailing expression displays
# its (rows, columns) shape as the cell output.
data = pd.read_csv('data/TrainingWiDS2021.csv')
data.shape

In [None]:
# Columns stored with object dtype (strings) are treated as categorical.
categorical = [field for field in data.columns if data[field].dtype == 'object']

# Flag numeric columns whose minimum equals their maximum, i.e. columns that
# hold a single distinct non-missing value.  Only numeric columns are
# inspected, so no exception has to be swallowed for columns that have no
# min/max, and the range is computed directly instead of through two
# describe() calls per column.
for field in data.select_dtypes(include='number').columns:
    column = data[field]
    if column.min() == column.max():
        print('*****************************************************************')
        print(field)
print(categorical)

`readmission_status` is constant (its only value is 0), so it carries no information; drop it.

In [None]:
# Remove the constant `readmission_status` column.  errors='ignore' keeps
# this cell safe to re-run once the column is already gone.
data = data.drop(columns='readmission_status', errors='ignore')

In [None]:
# Multiplier that converts a byte count to megabytes (1 MB = 10**6 bytes).
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df, deep=False):
    """Print the total memory footprint of ``df`` in megabytes.

    The per-column (and index) byte counts reported by
    ``DataFrame.memory_usage`` are summed, converted to MB and rounded to
    three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to measure.
    deep : bool, default False
        Forwarded to ``DataFrame.memory_usage``.  With the default, object
        columns are counted by their references only; pass ``True`` to also
        count the objects they point to (e.g. the strings themselves).

    Returns
    -------
    None
        The figure is printed, not returned.
    """
    total_bytes = df.memory_usage(deep=deep).sum()
    mem = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print("Memory usage is " + str(mem) + " MB")

In [None]:
# Baseline: memory footprint of the frame before one-hot encoding.
print_memory_usage_of_data_frame(data)

In [None]:
# Replace every column listed in `categorical` with indicator (dummy)
# columns; all other columns are carried over unchanged.
one_hot = pd.get_dummies(data, columns=categorical)

In [None]:
# Remove the leading column of the encoded frame.  get_dummies keeps the
# non-encoded columns first, in their original order, so that column is the
# first column of `data` that is not categorical.  Dropping it by name instead
# of by position means re-running this cell cannot strip one more column
# each time.
leading_column = next(col for col in data.columns if col not in categorical)
one_hot = one_hot.drop(columns=leading_column, errors='ignore')

In [None]:
# Display the encoded frame for a visual check.
one_hot

In [None]:
# Memory footprint after encoding, for comparison with the figure printed
# for `data`.
print_memory_usage_of_data_frame(one_hot)

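This is intriguing: one-hot encoding reduced the reported memory usage. That is plausible, because each object (string) column costs at least a pointer per row, while the indicator columns that replace it are presumably stored as compact one-byte values (and one leading column was dropped as well).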

In [None]:
# Set aside the `diabetes_mellitus` column as a separate Series.
# NOTE(review): `y` is not used in the cells visible here — confirm it is
# still needed.
y = data['diabetes_mellitus']

In [None]:
# Remove the `diabetes_mellitus` column from the feature frame.
# errors='ignore' keeps the cell re-runnable after the column has already
# been dropped.
one_hot = one_hot.drop(columns=['diabetes_mellitus'], errors='ignore')
# Replace every remaining missing value with the sentinel -1.
one_hot = one_hot.fillna(-1)

In [None]:
# Every column of the encoded frame that is not one of the original
# categorical names is treated as a continuous feature.  The list is built in
# the frame's column order rather than through a set difference: iteration
# order of a set of strings can differ between Python processes, which would
# change the feature order (and therefore the meaning of the saved model's
# input units) from one kernel session to the next.
continuous = [col for col in one_hot.columns if col not in categorical]

In [None]:
# Wrap the encoded frame in a fastai TabularPandas, declaring every name in
# `continuous` as a continuous column and applying the Normalize proc.
to = TabularPandas(one_hot, procs=[Normalize], cont_names=continuous)

In [None]:
# Sanity check: show the type of the object that was just built.
type(to)

In [None]:
# Shape of the processed feature table; its column count is what the
# autoencoder is constructed with further down.
to.xs.shape

### Now make a Dataset

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 64        # rows per batch handed to the DataLoader
LEARNING_RATE = 0.001  # learning rate passed to the Adam optimizer
NUM_EPOCHS = 80        # number of full passes over the DataLoader

In [None]:
# Wrap the processed features in TabularDataset, which is imported from the
# local `autoencoder` module.
dataset = TabularDataset(to.xs)

In [None]:
# Batch the dataset for training.
# NOTE(review): no shuffling is requested, so batches are presumably visited
# in the same order every epoch — confirm that this is intended.
dls = DataLoader(dataset, batch_size=BATCH_SIZE)

In [None]:
# Build the autoencoder (constructed with the number of feature columns) and
# train it to reproduce its own input, using mean-squared error as the
# reconstruction loss and Adam as the optimizer.
otto = Autoencoder(to.xs.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(otto.parameters(), lr=LEARNING_RATE)
otto.float()
otto.train()
train_loss = []  # one entry per epoch: the mean of that epoch's per-batch losses

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in dls:
        # batch is a list of three vectors: [y, cont_X, cat_X].  The
        # categorical columns were already converted to continuous ones, so
        # batch[1] serves as both the input and the reconstruction target.
        inputs = batch[1]
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        # forward ---------------------------------------------
        output = otto(inputs)
        loss = criterion(output, inputs)
        # backward --------------------------------------------
        loss.backward()
        optimizer.step()
        # keep a running total of loss for the batches in this epoch
        running_loss += loss.item()
    # log: report the last batch's loss and the epoch mean under explicit
    # labels, both read as plain Python floats
    lossb = running_loss / len(dls)
    train_loss.append(lossb)
    print('epoch [{}/{}], last batch loss:{:.4f}, mean loss:{:.4f}'.format(
        epoch + 1, NUM_EPOCHS, loss.item(), lossb))

otto.eval()

In [None]:
# Persist only the model's parameters (its state_dict), not the model object.
# NOTE(review): torch.save writes a binary file, so the '.txt' extension is
# misleading — consider renaming if nothing else depends on this path.
PATH = './autoencoder_state.txt'
torch.save(otto.state_dict(), PATH)

In [None]:
# Plot the loss recorded for each epoch.  The x values are derived from
# train_loss itself, so the plot still works when the list holds fewer (or
# more) entries than NUM_EPOCHS, e.g. after an interrupted training run.
plt.plot(range(len(train_loss)), train_loss)
plt.xlabel('epoch')
plt.ylabel('training loss');