# README

These files do following steps:

* 1. We need to create a object `RegressionColumnarDataset`: in example **Traing a classifier** on PyTorch home page tutorial, we download data using PyTorch functions and then load the data into dataloader directly. The downloaded data have already been wrapped in a class called `torch.utils.data.Dataset`. Here, we use a dataset from Kaggle so we need to create a class to wrap that dataset.

* 2. After creating `RegressionColumnarDataset`, use function `setup_dataloader` to put our dataset into data loader.

* 3. In this level, we use a simple way to write Neural Network structure and `training_loop`. The `training_loop` trains the model and print some basic information of the training process.

In level 2, we will write a Network structure in a different way and write a `training_loop_2` with ability to plot the tracking loss and save best model in the training process.

In [8]:
import os
os.chdir('../seminar_1')
import pandas as pd
from utils.helper_functions import save_pickle, load_pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
import time
import copy
import torch.utils.data as Data
import torch
import torch.nn as nn
import torch.nn.functional as F

class RegressionColumnarDataset(Data.Dataset):
    """
    Object Dataset in PyTorch to store dataset before loading into DataLoader
    """
    def __init__(self, df_transformed, target):
        self.data = df_transformed.copy().values.astype(np.float32)
        # Select target
        self.target = target.values.astype(np.float32)
        self.n_feature = self.data.shape[1]

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return [self.data[idx], self.target[idx]]

def setup_dataloader(df, mapper, batch_size=256, shuffle=True, num_workers=0):
    """
    Convert preprocessed DataFrame (with target column 'price') into DataLoader
    """
    df_transformed = mapper.transform(df)
    target = df['price'].copy() / 1e6
    df_dataset = RegressionColumnarDataset(df_transformed, target)
    dataloader = Data.DataLoader(df_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)
    return dataloader

def to_numpy(tensor):
    """
    Convert a Pytorch tensor to numpy
    """
    return tensor.cpu().data.numpy()

# Prepare data

In [9]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
mapper = load_pickle('model/mapper.pkl')

traindl = setup_dataloader(train, mapper)
testdl = setup_dataloader(test, mapper)

# Neural Network Level 1

In [10]:
# Weight initial
def customize_weight_init(x):
    """
    Use Xavier method to initialize weight
    """
    classname = x.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.xavier_normal_(x.weight)
        nn.init.constant_(x.bias, 0)
        
    if classname.find('BatchNorm') != -1:
        nn.init.constant_(x.weight, 1)
        nn.init.constant_(x.bias, 0)

# Define model
class NNet_model_1(nn.Module):
    def __init__(self, input_dim, layer1, layer2, n_output=1):
        '''
        A class that defines the neural network structure
        
        Params:
        input_dim: number of features from the dataset
        layer1 : num of neurons in layer 1
        layer2: num of neurons in layer 2
        
        output: 
        an object that holds the model structure (can be called as a function)
        '''
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, layer1),
            nn.ReLU(),
            nn.BatchNorm1d(layer1),
            nn.Dropout(0.3),
            
            nn.Linear(layer1, layer2),
            nn.ReLU(),
            nn.BatchNorm1d(layer2),
            nn.Dropout(0.3),
            
            nn.Linear(layer2, n_output)
        )
        
    def forward(self, x):
        return self.network(x)



In [11]:
model = NNet_model_1(traindl.dataset.n_feature, 128, 32)
model.apply(customize_weight_init)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01, weight_decay= 0.1)
loss_function = nn.MSELoss(reduce='mean')

In [12]:
def train_loop(traindl, testdl, model, optimizer, loss_function, 
               seed=0, epoches=5, save_folder='model', verbose=True):

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if verbose: print('Training on: ', device)
    torch.manual_seed(seed)
    
    # Some setup
    start_time = time.time()
    
    for epoch in range(epoches): 
        # Tracking loss
        train_loss = np.array([])
        test_loss = np.array([])
        
        # Train
        model.train()
        for x, y in iter(traindl):
            x, y = x.to(device), y.to(device)
            
            optimizer.zero_grad()
            yhat = model(x).to(device)
            loss = loss_function(yhat, y)
            loss.backward()
            optimizer.step()
            
            # Track train loss
            train_loss  = np.concatenate((train_loss, loss.item()), axis=None)
        
        # Evaluate
        model.eval()
        for x, y in iter(testdl):
            x, y = x.to(device), y.to(device)
            
            yhat = model(x).to(device)
            loss = loss_function(yhat, y)
            
            # Track train loss
            test_loss  = np.concatenate((test_loss, loss.item()), axis=None)
            
        if verbose:
            cur_lr = optimizer.param_groups[0]['lr']
            print(f'Epoch {epoch+1}: train_loss: {train_loss.mean():.4f} test_loss: {test_loss.mean():.4f}')

        checkpoint = {
            'model': model,
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'test_loss': test_loss.mean(),
            'train_loss': train_loss.mean()
        }
        path_checkpoint = save_folder + '/nnet_1.pth'
        torch.save(checkpoint, path_checkpoint)

    if verbose:
        print(f"Finished training in {time.time() - start_time:.4f} seconds")
        print(f'Model save to {path_checkpoint}')

In [13]:
train_loop(traindl, testdl, model, optimizer, loss_function)

Training on:  cpu
Epoch 1: train_loss: 0.4709 test_loss: 0.1475
Epoch 2: train_loss: 0.1481 test_loss: 0.1371
Epoch 3: train_loss: 0.1360 test_loss: 0.1425
Epoch 4: train_loss: 0.1363 test_loss: 0.1359
Epoch 5: train_loss: 0.1337 test_loss: 0.1364
Finished training in 4.4022 seconds
Model save to model/nnet_1.pth
