In [1]:
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn_pandas import DataFrameMapper
warnings.filterwarnings('ignore', category=DataConversionWarning)

def transform_date(df, field_name, drop=True):
    field = df[field_name]
    if not np.issubdtype(field, np.datetime64):
        df[field_name] = field = pd.to_datetime(field, infer_datetime_format=True)
    target_pre = re.sub('[Dd]ate$', '', field_name)
    for i in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        df[target_pre + i] = getattr(field.dt, i.lower())
    df[target_pre + 'Elapsed'] = field.astype(np.int64) // 10**9
    if drop:
        df.drop(field_name, axis=1, inplace=True)


def create_category_fields(df, is_train=True, train_df=None):  
    if is_train:
        for col_name, data in df.items():
            if is_string_dtype(data):
                df[col_name] = data.astype('category').cat.as_ordered()
    else:
        for col_name, data in df.items():
            if (col_name in train_df.columns) and (train_df[col_name].dtype.name == 'category'):
                df[col_name] = pd.Categorical(data, categories=train_df[col_name].cat.categories, ordered=True)



def fix_missing(df, col, name, na_dict):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name + '_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict


def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and ( max_n_cat is None or col.nunique() > max_n_cat):
        df[name] = col.cat.codes + 1


def scale_vars(df, mapper):
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper


def preprocessing(df, y_fld, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, mapper=None):

    if not skip_flds: 
        skip_flds = []
    df = df.copy()
    if preproc_fn: 
        preproc_fn(df)

    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)

    if na_dict is None: 
        na_dict = {}
    for n,c in df.items(): 
        na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: 
        mapper = scale_vars(df, mapper)
    for n,c in df.items(): 
        numericalize(df, c, n, max_n_cat)

    res = [pd.get_dummies(df, dummy_na=True), y, na_dict]

    if do_scale: 
        res = res + [mapper]
    return res

def set_rf_samples(n):
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

In [2]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_numeric_dtype


dep = 'SalePrice'
PATH = "data/bulldozers/"
df_raw = pd.read_feather('tmp/bulldozers-raw')
keep_cols = list(np.load('tmp/keep_cols.npy'))

df_raw.loc[df_raw.YearMade<1950, 'YearMade'] = 1950
df_raw['age'] = df_raw.saleYear-df_raw.YearMade
df_raw = df_raw[keep_cols+['age', dep]].copy()
df_indep = df_raw.drop(dep,axis=1)

n_valid = 12000
n_trn = len(df_raw)-n_valid


cat_flds = [n for n in df_indep.columns if df_raw[n].nunique()<n_trn/50]
' '.join(cat_flds)

for o in ['saleElapsed', 'saleDayofyear', 'saleDay', 'age', 'YearMade']: cat_flds.remove(o)
[n for n in df_indep.drop(cat_flds,axis=1).columns if not is_numeric_dtype(df_raw[n])]


for n in cat_flds: df_raw[n] = df_raw[n].astype('category').cat.as_ordered()

cont_flds = [n for n in df_indep.columns if n not in cat_flds]
' '.join(cont_flds)

'YearMade saleElapsed SalesID MachineID saleDay saleDayofyear age'

In [3]:
df_raw = df_raw[cat_flds+cont_flds+[dep]]
df, y, nas, mapper = preprocessing(df_raw, 'SalePrice', do_scale=True)

val_idx = list(range(n_trn, len(df)))

In [4]:
def rmse(x,y): return np.sqrt(((x-y)**2).mean())
from sklearn.metrics import r2_score

emb_c = {n: len(c.cat.categories)+1 for n,c in df_raw[cat_flds].items()}

emb_szs = [(c, min(50, (c+1)//2)) for _,c in emb_c.items()]
metrics=[rmse]

In [125]:
from torch import nn
from torch.nn.init import kaiming_normal
import torch.nn.functional as F
import torch
from torch.optim import Adam, RMSprop
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import pandas as pd

def split_by_idx(idxs, *a):
    mask = np.zeros(len(a[0]),dtype=bool)
    mask[np.array(idxs)] = True
    return [(o[mask],o[~mask]) for o in a]


def init_embeddings(x):
    x = x.weight.data
    value = 2 / (x.size(1) + 1)
    x.uniform_(-value, value)
    
class StructuredData(object):
    def __init__(self, df, y, cat_flds, cont_flds, val_index=None, batch_size=32, shuffle=False, num_workers=1):
        self.val_index = val_index
        if val_index:
            ((val_df, df), (y_val, y)) = split_by_idx(val_index, df, y)

        train_dataset = StructuredDataSet(df[cat_flds], df[cont_flds], y)
        
        if shuffle:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        else:
            train_sampler = None
            
        self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=(train_sampler is None), 
                                       sampler=train_sampler, num_workers=num_workers)
        
        if val_index:
            validation_dataset = StructuredDataSet(val_df[cat_flds], val_df[cont_flds], y_val)
            self.validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, 
                                            num_workers=num_workers)
        else:
            self.validation_loader = None
            
    def get_data(self):  
        if self.val_index:
            return self.train_loader, self.validation_loader
        else:
            return self.train_loader  
    
class StructuredDataSet(Dataset):
    def __init__(self, cats, conts, y):
        self.cats = np.asarray(cats, dtype=np.int64)
        self.conts = np.asarray(conts, dtype=np.float32)
        self.N = len(y)
        y = np.zeros((n,1)) if y is None else y[:,None]
        self.y = np.asarray(y, dtype=np.float32)
            
    def __len__(self): 
        return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]
    

     
class MultiInputNN(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, sizes, drops, y_range=None, use_bn=False, f=F.relu):
        super().__init__()
        # embedding layers
        self.embeddings = nn.ModuleList([nn.Embedding(insize, outsize) for insize, outsize in emb_szs])
        for layer in self.embeddings:
            init_embeddings(layer)
        self.num_categorical = sum([layer.embedding_dim for layer in self.embeddings])

        self.num_numerical = n_cont
        # linear layers
        sizes = [self.num_categorical + self.num_numerical] + sizes
        self.linear = nn.ModuleList([nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(size) for size in sizes[1:]])
        for layer in self.linear:
            kaiming_normal(layer.weight.data)
        # dropout layers
        
        self.emb_drop = nn.Dropout(emb_drop)
        self.drop_out = [nn.Dropout(drop) for drop in drops]
        # output layer
        self.output = nn.Linear(sizes[-1], 1)
        kaiming_normal(self.output.weight.data)
        self.f = f
        self.bn = nn.BatchNorm1d(self.num_numerical)
        self.use_bn = use_bn
        self.y_range = y_range

    def forward(self, x_cat, x_cont):
        if self.num_categorical > 0:
            X = [emb_layer(x_cat[:,i]) for i, emb_layer in enumerate(self.embeddings)]
            X = torch.cat(X, dim=1)
            X = self.emb_drop(X)
        if self.num_numerical > 0:
            X2 = self.bn(x_cont)
            X = torch.cat([X, X2], dim=1) if self.num_categorical != 0 else X2
        for linear, drop, norm in zip(self.linear, self.drop_out, self.bns):
            X = self.f(linear(X))
            if self.use_bn: 
                X = norm(X)
            X = drop(X)
        X = self.output(X)
        if self.y_range:
            X = F.sigmoid(X)
            X = X * (self.y_range[1] - self.y_range[0])
            X = X + self.y_range[0]
        return X
    
    
    def fit(self, train_loader, learning_rate=1e-3, batch_size=64, epochs=1, val_loader=None, metrics=None, save=False, save_path='tmp/checkpoint.pth.tar', 
                    pre_saved=False):

        loss = nn.MSELoss()
        optimizer = RMSprop(self.parameters(), lr=learning_rate)
        n_batches = int(train_loader.dataset.N / train_loader.batch_size)

        if pre_saved:
            checkpoint = torch.load(save_path)
            start_epoch = checkpoint['epoch']
            self.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('...restoring model...')
            
        begin = True
        for epoch_ in range(epochs):        
            if pre_saved:      
                if begin:
                    epoch = start_epoch
                    begin = False
            else:
                epoch = epoch_
            epoch += 1

            epoch_loss = 0.
            # train phase 
            self.train()
            for i, (batch_cat, batch_cont, batch_y) in enumerate(train_loader):
                optimizer.zero_grad()
                batch_cat, batch_cont, batch_y = Variable(batch_cat), Variable(batch_cont), Variable(batch_y)

                y_hat = self.forward(batch_cat, batch_cont)
                l = loss(y_hat, batch_y)
                epoch_loss += l.data[0]
                
                l.backward()
                optimizer.step()
                
                if i != 0 and i % 2000 == 0:
                    print('iteration: {} of n_batches: {}'.format(i, n_batches))
                    
            train_loss = epoch_loss / n_batches
            print_output = [epoch, train_loss]
            if val_loader:
                val_loss = self.validate(val_loader, loss, metrics)
                for i in val_loss: print_output.append(i)

            print(print_output)
        if save:
            state = {
            'epoch': epoch,
            'state_dict': self.state_dict(),
            'optimizer': optimizer.state_dict()}
            self.save_checkpoint(state, filename=save_path)

    def validate(self, val_loader, loss, metrics=None):
        self.eval()
        n_batches = int(val_loader.dataset.N / val_loader.batch_size)
        total_loss = 0.
        metric_scores = {}
        if metrics:
            for metric in metrics:
                metric_scores[str(metric)] = []
                
        for i, (batch_cat, batch_cont, batch_y) in enumerate(val_loader):
            batch_cat, batch_cont, batch_y = Variable(batch_cat), Variable(batch_cont), Variable(batch_y)
            y_hat = self.forward(batch_cat, batch_cont)
            l = loss(y_hat, batch_y)
            total_loss += l.data[0]
            
            if metrics:
                for metric in metrics:
                    metric_scores[str(metric)].append(metric(batch_y.data.numpy(), y_hat.data.numpy()))
        if metrics:
            final_metrics = []
            for metric in metrics:
                final_metrics.append(np.sum(metric_scores[str(metric)]) / n_batches)
            return total_loss / n_batches, final_metrics
        else:
            return total_loss / n_batches
        
        
    def save_checkpoint(self, state, filename='checkpoint.pth.tar'):
        torch.save(state, filename)
        
    def predict(self, df, cat_flds, cont_flds):
        self.eval()
        cats = np.asarray(df[cat_flds], dtype=np.int64)
        conts = np.asarray(df[cont_flds], dtype=np.float32)
        x_cat = Variable(torch.from_numpy(cats))
        x_cont = Variable(torch.from_numpy(conts))
        pred = self.forward(x_cat, x_cont)
        return pred.data.numpy().flatten()

    
# TODO
# - learning rate restarts
# change PATH for saved model? 

In [126]:
df_sample, y_sample, nas, mapper = preprocessing(df_raw[:10000], 'SalePrice', do_scale=True)

n_valid = 1000
n_trn = len(df_sample)-n_valid
val_idx = list(range(n_trn, len(df_sample)))

train_loader, val_loader = StructuredData(df_sample, y_sample, cat_flds, cont_flds, batch_size=64, val_index=val_idx).get_data()

In [127]:
y_range=(0,np.max(y)*1.2)
my_model = MultiInputNN(emb_szs, len(cont_flds), 0.05, 1, [500,250], [0.5,0.05], use_bn=True, y_range=y_range)

In [128]:
my_model.fit(train_loader, epochs=2, save=True, val_loader=val_loader, metrics=[rmse, r2_score])

[1, 1.7231188039694514, 0.4903570353984833, [0.7195496877034505, 0.07632531918010448]]
[2, 0.3590959183871746, 0.27899246116479237, [0.5417129516601562, 0.5061897871685839]]


In [129]:
my_model.fit(train_loader, epochs=2, save=True, pre_saved=True)

...restoring model...
[3, 0.2335229481969561]
[4, 0.1690023023635149]


In [130]:
my_model.fit(train_loader, epochs=6, save=True, pre_saved=True, val_loader=val_loader, metrics=[rmse, r2_score])

...restoring model...
[5, 0.12643333577683993, 0.13297265817721685, [0.3751620610555013, 0.8006963493330007]]
[6, 0.1058651225907462, 0.12605141748984655, [0.36430749893188474, 0.8157269626822686]]
[7, 0.09485879724047014, 0.0968967025478681, [0.3184300740559896, 0.876090542939196]]
[8, 0.08323746100068093, 0.0922976424296697, [0.3111626942952474, 0.8833985765837554]]
[9, 0.07922754921019078, 0.09525569528341293, [0.31697184244791665, 0.8774915622266801]]
[10, 0.07112292892166547, 0.09411198894182841, [0.3144283930460612, 0.8804460592590257]]


In [132]:
preds = my_model.predict(df_sample, cat_flds, cont_flds)

In [134]:
rmse(y_sample, preds)

0.24940509011324333