## nn configuration 

### dataset class for tabular data

In [1]:
from torch.utils.data import Dataset, DataLoader#, TensorDataset

In [52]:
class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        """
        Characterizes a Dataset for PyTorch

        Parameters
        ----------

        data: pandas data frame
          The data frame object for the input data. It must
          contain all the continuous, categorical and the
          output columns to be used.

        cat_cols: List of strings
          The names of the categorical columns in the data.
          These columns will be passed through the embedding
          layers in the model. These columns must be
          label encoded beforehand. 

        output_col: string
          The name of the output variable column in the data
          provided.
        """
        
        self.n, self.c = data.shape
        print(data.shape)
        
        if output_col:
            self.y = data[output_col]
        else:
            self.y = np.zeros(self.n, 1)
        
        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns if col not in self.cat_cols + [output_col]]
        
        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))
        
        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))
            
    def __len__(self):
        """
        Returns total number of samples.
        """
        return self.n
        
    def __getitem__(self, idx):
        """
        Generates one sample of data.
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

### neural net

In [116]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [129]:
class FeedForwardNN(nn.Module):
    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes, output_size, emb_dropout, lin_layer_dropouts, verb=False):
        """
        Parameters
        ----------

        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.

        no_of_cont: Integer
          The number of continuous features in the data.

        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number
          of linear layers in the network.

        output_size: Integer
          The size of the final output.

        emb_dropout: Float
          The dropout to be used after the embedding layers.

        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer.
        """

        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        
        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont
        
        if verb:
            print(self.emb_layers)
            print(f'Total number of embeddings: {self.no_of_embs}')
            print(f'Total number of cont. vars: {self.no_of_cont}')

        # Linear layers
        # First layer num of rows = total sum of embeddings (over all cat vars) + sum of cont vars
        first_lin_layer = nn.Linear(in_features=self.no_of_embs + self.no_of_cont, out_features=lin_layer_sizes[0])

        # All linear layers
        #other_lin_layers = nn.ModuleList[nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i+1]) for i in range(len(lin_layer_sizes) - 1)]
        #self.lin_layers = nn.ModuleList([first_lin_layer] + [other_lin_layers])

        self.lin_layers = nn.ModuleList([first_lin_layer] +
                                        [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i+1])
                                         for i in range(len(lin_layer_sizes) - 1)])

        # Initialze parameters
        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output layer
        self.output_layer = nn.Linear(in_features=lin_layer_sizes[-1], out_features=output_size)

        # Batch norm layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

        # Dropout layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts])
        
    def forward(self, cont_data, cat_data):
        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)
        
        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)
            
            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data
        
        for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.dropout_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)
        
        x = self.output_layer(x)
        
        return x
            

In [7]:
lin_layer_sizes = [50, 100]

In [8]:
no_of_cont = 4

In [9]:
first_lin_layer = nn.Linear(in_features=no_of_embs + no_of_cont, out_features=lin_layer_sizes[0])

NameError: name 'no_of_embs' is not defined

In [10]:
emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
emb_layers

NameError: name 'emb_dims' is not defined

In [11]:
no_of_embs = sum([y for x, y in emb_dims])
no_of_embs

NameError: name 'emb_dims' is not defined

In [12]:
sum([y for x, y in emb_dims])

NameError: name 'emb_dims' is not defined

## import and pre-process data 

In [13]:
import pandas as pd
import numpy as np

In [14]:
!ls ../data/houseprice/

data_description.txt  sample_submission.csv  train.csv
models		      test.csv		     train_tiny.csv


In [15]:
from pathlib import Path

In [16]:
path = Path('../data/houseprice/')

### TODO: implement fillmissing, auto-generate one-hot vector for NAs, mapping from train to val/test

### TODO: implement normalization, mapping from train to val/test

In [141]:
df = pd.read_csv(path/'train.csv', sep=',', usecols=['SalePrice', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 
                                                           'Street', 'YearBuilt', 'LotShape', '1stFlrSF',
                                                           '2ndFlrSF']).dropna().reset_index(drop=True)
df_test = pd.read_csv(path/'test.csv', sep=',', usecols=['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 
                                                           'Street', 'YearBuilt', 'LotShape', '1stFlrSF',
                                                           '2ndFlrSF']).dropna().reset_index(drop=True)

In [136]:
#tensor_train = torch.tensor(df_train.values, dtype=torch.float32)

In [92]:
df_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [93]:
#df_train.drop('Id', inplace=True, axis=1)

In [94]:
n,c = df_train.shape
n, c

(1201, 10)

In [95]:
categorical_features = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'YearBuilt']
output_feature = 'SalePrice'

In [96]:
from sklearn.preprocessing import LabelEncoder

In [97]:
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    df_train[cat_col] = label_encoders[cat_col].fit_transform(df_train[cat_col])

In [98]:
df_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,5,3,65.0,8450,1,3,104,856,854,208500
1,0,3,80.0,9600,1,3,77,1262,0,181500
2,5,3,68.0,11250,1,0,102,920,866,223500
3,6,3,60.0,9550,1,0,19,961,756,140000
4,5,3,84.0,14260,1,0,101,1145,1053,250000


In [99]:
df_train.iloc[0]

MSSubClass          5.0
MSZoning            3.0
LotFrontage        65.0
LotArea          8450.0
Street              1.0
LotShape            3.0
YearBuilt         104.0
1stFlrSF          856.0
2ndFlrSF          854.0
SalePrice      208500.0
Name: 0, dtype: float64

## define model parameters

In [110]:
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [101]:
categorical_features, output_feature

(['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'YearBuilt'], 'SalePrice')

In [128]:
cat_dims = [int(df_train[col].nunique()) for col in categorical_features]
emb_dims = [(card, min(50, (card+1)//2)) for card in cat_dims]
emb_dims

[(15, 8), (5, 3), (2, 1), (4, 2), (112, 50)]

In [109]:
lin_layer_sizes = [50, 100]

## train

In [130]:
dataset = TabularDataset(data=df_train, cat_cols=categorical_features, output_col=output_feature)

(1201, 10)


In [131]:
batchsize = 64
dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True)

In [132]:
no_of_epochs = 5
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [133]:
model = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=lin_layer_sizes, output_size=1, emb_dropout=0.04,
                      lin_layer_dropouts=[0.001, 0.01]).to(device)

In [134]:
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in dataloader:
        #cat_x = cat_x.to(device)
        cat_x = torch.tensor(cat_x, dtype=torch.long, device=device)         # long is needed for embeddings
        #cont_x = cont_x.to(device)
        cont_x = torch.tensor(cont_x, dtype=torch.float32, device=device)
        #y = y.to(device)
        y = torch.tensor(y, dtype=torch.float32, device=device)
        
        # Forward
        preds = model(cont_x, cat_x)                                         # this calls model.forward()
        loss = criterion(preds, y)
                
        # Backward pass and zero grad
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(loss)    

  after removing the cwd from sys.path.
  
  


tensor(3.6824e+10, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(4.1629e+10, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(3.2225e+10, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(4.0106e+10, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(4.2226e+10, device='cuda:0', grad_fn=<MseLossBackward>)


In [135]:
preds

tensor([[-0.4178],
        [-0.2862],
        [-0.1400],
        [ 0.2611],
        [-0.2501],
        [ 0.3144],
        [-0.0731],
        [-0.6758],
        [ 0.4927],
        [-0.5883],
        [-0.0403],
        [-0.1237],
        [-0.2576],
        [-0.9774],
        [ 0.1409],
        [-0.5651],
        [-0.3184],
        [-0.2968],
        [-0.6893],
        [ 0.7589],
        [ 0.4632],
        [-0.1995],
        [ 0.4237],
        [-0.2993],
        [-0.2863],
        [-0.6764],
        [ 0.2694],
        [-0.8707],
        [ 0.6386],
        [-0.4427],
        [ 0.3593],
        [ 1.0640],
        [-0.2333],
        [ 0.5831],
        [ 0.4855],
        [ 0.1318],
        [-0.3230],
        [-0.2893],
        [-0.7587],
        [-0.0176],
        [-0.5532],
        [ 0.1848],
        [-0.4049],
        [-0.0855],
        [-0.1403],
        [-0.4251],
        [ 0.1677],
        [-0.0505],
        [ 0.7031]], device='cuda:0', grad_fn=<AddmmBackward>)