In [1]:
from pathlib import Path

In [2]:
from fastai.tabular import *
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
PATH = Path('../data/houseprice/')

In [3]:
path = Path('C:/Users/michaeljeremias/Documents/PythonProjects/data/houseprice')

NameError: name 'Path' is not defined

In [5]:
df = pd.read_csv(PATH/'train.csv', sep=',')

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## fastai

In [7]:
cat_names = ['MSSubClass', 'MSZoning']
cont_names = ['LotFrontage']
dep_var = 'SalePrice'

In [8]:
procs = [Normalize, Categorify]

In [9]:
valid_idx = range(df.shape[0]-df.shape[0]//5, df.shape[0])

In [11]:
data = (TabularList.from_df(df, path=PATH, cat_names=cat_names, cont_names=cont_names, procs=procs)
                   .split_by_idx(valid_idx)
                   .label_from_df(cols=dep_var, label_cls=FloatList)
                   .databunch())

In [20]:
data.show_batch(num_workers=0)

BrokenPipeError: [Errno 32] Broken pipe

In [12]:
learn = tabular_learner(data, layers=[200,100], emb_szs={'MSSubClass': 10, 'MSZoning': 10}, metrics=rmse)

In [13]:
learn.layer_groups

[Sequential(
   (0): Embedding(16, 10)
   (1): Embedding(6, 10)
   (2): Dropout(p=0.0)
   (3): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (4): Linear(in_features=21, out_features=200, bias=True)
   (5): ReLU(inplace)
   (6): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (7): Linear(in_features=200, out_features=100, bias=True)
   (8): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (9): Linear(in_features=100, out_features=1, bias=True)
 )]

In [None]:
learn.fit_one_cycle(1, 1e-2)

##  pytorch

Source:

https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/

https://github.com/yashu-seth/pytorch-tabular/blob/master/pytorch_tabular.py

In [15]:
class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        """
        Characterizes a Dataset for PyTorch
        
        Parameters
        ----------
        data: pandas data frame
            The data frame object for the input data. It must
            contain all the continuous, categorical and the
            output columns to be used.
        
        cat_cols: List of strings
            The names of the categorical columns in the data.
            These columns will be passed through the embedding
            layers in the model. These columns must be
            label encoded beforehand. 
        
        output_col: string
            The name of the output variable column in the data
            provided.
        """
        self.n = data.shape[0]
        
        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))
        
        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns if col not in self.cat_cols + [output_col]]
        
        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))
        
        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))
    
    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of the data
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]


In [16]:
class FeedForwardNN(nn.Module):
    
    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
                output_size, emb_dropout, lin_layer_dropouts):
        
        """
        Parameters
        ----------

        emb_dims: List of two element tuples
          This list will contain a two element tuple for each
          categorical feature. The first element of a tuple will
          denote the number of unique values of the categorical
          feature. The second element will denote the embedding
          dimension to be used for that feature.

        no_of_cont: Integer
          The number of continuous features in the data.

        lin_layer_sizes: List of integers.
          The size of each linear layer. The length will be equal
          to the total number of linear layers in the network.

        output_size: Integer
          The size of the final output.

        emb_dropout: Float
          The dropout to be used after the embedding layers.

        lin_layer_dropouts: List of floats
          The dropouts to be used after each linear layer.
        """
    
        super().__init__()
    
        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                        for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear layers
        first_lin_layer = nn.Linear(self.no_of_embs + no_of_cont, lin_layer_sizes[0])

        self.lin_layers = nn.ModuleList([first_lin_layer] + [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i+1])
                                                            for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch norm layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

        # Dropout layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts])
    
    def forward(self, cont_data, cat_data):
        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)
        
        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)
            
            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data
        
        for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.dropout_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)
        
        x = self.output_layer(x)
        
        return x

In [17]:
df = pd.read_csv(PATH/'train.csv', sep=',', usecols=['SalePrice', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
                                                    'Street', 'YearBuilt', 'LotShape', '1stFlrSF', '2ndFlrSF']).dropna()

In [18]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [19]:
cat_names = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'YearBuilt']
dep_var = 'SalePrice'

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
label_encoders = {}
for cat_col in cat_names:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col])

In [22]:
data = TabularDataset(data=df, cat_cols=cat_names, output_col=dep_var)

In [23]:
bs = 64

In [24]:
dl = DataLoader(data, bs, shuffle=True, num_workers=0)

In [25]:
cat_dims = [int(df[col].nunique()) for col in cat_names]

In [26]:
cat_dims

[15, 5, 2, 4, 112]

In [27]:
emb_dims = [(x, min(50, (x+1)//2)) for x in cat_dims]

In [28]:
emb_dims

[(15, 8), (5, 3), (2, 1), (4, 2), (112, 50)]

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
model = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100], output_size=1, emb_dropout=0.04,
                      lin_layer_dropouts=[0.001, 0.01]).to(device)

In [31]:
no_of_epochs = 5
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [32]:
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in dl:
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y = y.to(device)
        
        # Forward pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [33]:
data[0]

[array([208500.], dtype=float32),
 array([  65., 8450.,  856.,  854.], dtype=float32),
 array([  5,   3,   1,   3, 104], dtype=int64)]

In [34]:
model

FeedForwardNN(
  (emb_layers): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
    (4): Embedding(112, 50)
  )
  (lin_layers): ModuleList(
    (0): Linear(in_features=68, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=100, bias=True)
  )
  (output_layer): Linear(in_features=100, out_features=1, bias=True)
  (first_bn_layer): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_layers): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (emb_dropout_layer): Dropout(p=0.04)
  (dropout_layers): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
)