In [71]:
import os
# Switch the working directory so the relative paths used below (e.g. 'data/...')
# resolve against the project folder.
# NOTE(review): presumably this is also what makes the local `utils` package
# importable on the next lines - confirm before moving any import above it.
os.chdir('../advanced_programming_teaching')
import pandas as pd
from utils.helper_functions import missing_values_table, process_datetime, save_pickle, load_pickle
# Allow up to 900 columns / rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_columns', 900, 'display.max_rows', 900)
import warnings
# Silences every warning for the rest of the session, including pandas'
# SettingWithCopyWarning and PyTorch usage warnings.
warnings.filterwarnings('ignore')

# Raw dataset; later cells refer to it through the module-level name `data`.
data = pd.read_csv('data/kc_house_data.csv')

# Train test split

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
train, test = train_test_split(
    data,
    random_state=0,
    test_size=0.3,
)

# Very basic preprocessing steps

In [64]:
# Summarise missing values per column of the training split (helper imported
# from utils.helper_functions); the cell output is the resulting report.
missing_values_table(train)

Selected dataframe has 21 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [65]:
def preprocess_df(df):
    '''
    Parse the `date` column and add the engineered datetime features.

    The work is done on a copy, so the frame passed in is never modified.
    This matters because the frames used in this notebook are slices produced
    by train_test_split: assigning a column on them directly mutates the
    caller's data and is what pandas' SettingWithCopyWarning warns about.

    Params:
    df: DataFrame with a `date` column that pd.to_datetime can parse

    output:
    whatever process_datetime(df, 'date') returns for the parsed copy
    (helper from utils.helper_functions; expected to be the frame with the
    derived `date_*` columns - TODO confirm against the helper)
    '''
    # Copy first so re-running the cell always starts from the same input.
    df = df.copy()

    # Convert to datetime dtype
    df['date'] = pd.to_datetime(df['date'])

    # Feature engineering of the datetime column
    return process_datetime(df, 'date')

In [66]:
# Apply the shared preprocessing to the training split.
train_preprocessed = preprocess_df(train)

# Remove the columns that are not used as features (the reason is given in a
# later part of the notebook). The drop happens in place on `train`.
# NOTE(review): whether `train_preprocessed` is affected depends on whether
# preprocess_df returned the same object - confirm which frame should lose
# these columns.
train.drop(columns=['id', 'date', 'zipcode'], inplace=True)

# Columns treated as categorical; every other remaining column of `train`
# is treated as numerical.
categorical_feat = ['waterfront', 'view', 'condition']
numerical_feat = train.columns.drop(categorical_feat).tolist()

# Mapper

In [69]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns that are standardised. Each column must appear exactly once:
# listing a column several times only adds identical copies of the same
# scaled feature (under the same output name) to the transformed frame.
# NOTE(review): if the dataset has further numeric columns that should be
# model inputs, add them here - columns not listed are dropped by the mapper.
scaled_columns = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

# Columns that are one-hot encoded. The `date_*` columns are presumably the
# ones produced by process_datetime in preprocess_df - verify against the helper.
onehot_columns = [
    'waterfront',
    'view',
    'condition',
    'date_month',
    'date_dow',
    'date_quarter',
    'date_isweeknd',
    'date_month_interval',
]

# One (columns, transformer) pair per column, each with its own transformer
# instance so that every column is fitted independently.
mapper = DataFrameMapper(
    [([col], StandardScaler()) for col in scaled_columns]
    + [([col], OneHotEncoder()) for col in onehot_columns],
    df_out=True,
)

train_tranformed = mapper.fit_transform(train_preprocessed)
# save_pickle('model/mapper.pkl', mapper)

# Neural Network 1

In [78]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data

# Weight initial
def customize_weight_init(x):
    '''
    Initialise the parameters of a single module; meant for `model.apply(...)`.

    Modules are matched by class name:
    - name contains 'Linear'    -> Xavier-normal weight, zero bias
    - name contains 'BatchNorm' -> weight set to 1, bias set to 0

    Parameters that do not exist (e.g. nn.Linear(..., bias=False) or
    BatchNorm with affine=False, where the attribute is None) are skipped
    instead of raising.

    Params:
    x: the module to initialise; modules of any other type are left untouched
    '''
    classname = x.__class__.__name__

    if 'Linear' in classname:
        nn.init.xavier_normal_(x.weight)
        if getattr(x, 'bias', None) is not None:
            nn.init.constant_(x.bias, 0)

    if 'BatchNorm' in classname:
        if getattr(x, 'weight', None) is not None:
            nn.init.constant_(x.weight, 1)
        if getattr(x, 'bias', None) is not None:
            nn.init.constant_(x.bias, 0)

# Define model
class NNet_model_1(nn.Module):
    def __init__(self,
                 input_dim,
                 hidden_1,
                 hidden_2,
                 hidden_3,
                 output=1,
                 dropouts=(0.5, 0.3, 0.2)):
        '''
        Fully connected network with three hidden blocks and a linear output layer.

        Every hidden block is Linear -> ReLU -> BatchNorm1d -> Dropout. The layers
        live in `self.network` (an nn.Sequential) in that order, so parameter
        names in the state dict are `network.0.*` ... `network.12.*`.

        Params:
        input_dim: number of features from the dataset
        hidden_1 : num of neurons in layer 1
        hidden_2: num of neurons in layer 2
        hidden_3: num of neurons in layer 3
        output: number of output units (default 1)
        dropouts: dropout probability after each of the three hidden blocks,
                  in order (default (0.5, 0.3, 0.2))

        Raises:
        ValueError if `dropouts` does not contain exactly three values
        '''
        super().__init__()
        if len(dropouts) != 3:
            raise ValueError(
                'dropouts must contain exactly 3 values, got {}'.format(len(dropouts)))

        self.input_dim = input_dim

        # Consecutive pairs of widths give the (in, out) size of each hidden block.
        widths = [input_dim, hidden_1, hidden_2, hidden_3]
        layers = []
        for n_in, n_out, drop_p in zip(widths[:-1], widths[1:], dropouts):
            layers.extend([
                nn.Linear(n_in, n_out),
                nn.ReLU(),
                nn.BatchNorm1d(n_out),
                nn.Dropout(drop_p),
            ])

        # output layer
        layers.append(nn.Linear(hidden_3, output))
        self.network = nn.Sequential(*layers)

    def forward(self, input):
        '''Run `input` through the sequential network and return its output.'''
        return self.network(input)

In [76]:
def train_loop(train, test, seed, learning_rate,
               weight_decay, epoches=3, save_folder='temp_model', verbose=True,
               hidden_sizes=(128, 64, 32)):
    '''
    Train an NNet_model_1 regressor with an MSE loss and checkpoint the epoch
    that reaches the lowest validation MSE.

    NOTE(review): the "Neural Network 3" cell further down defines another
    function that is also called `train_loop`; running that cell replaces
    this one in the kernel.

    Params:
    train: re-iterable of (features, target) batches used for training, e.g. a
           torch DataLoader. `features` must be 2-D (batch, n_features);
           `target` holds one value per row.
    test: re-iterable of (features, target) batches used for validation
    seed: value passed to torch.manual_seed
    learning_rate: Adam learning rate
    weight_decay: Adam weight decay
    epoches: number of passes over `train`
    save_folder: folder for the checkpoint file (created if missing)
    verbose: print progress when True
    hidden_sizes: widths of the three hidden layers of NNet_model_1

    output:
    the best (lowest) validation MSE, or None when `epoches` is 0

    Raises:
    ValueError if `train` or `test` yields no samples
    '''
    import os
    import time

    # Fall back to the CPU when CUDA is unavailable; nothing CUDA-specific is
    # touched before this check, so the loop also runs on CPU-only machines.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if verbose: print('Training on: ', device)
    torch.manual_seed(seed)

    # The input width is read from the first training batch.
    try:
        first_features, _ = next(iter(train))
    except StopIteration:
        raise ValueError('`train` yields no batches')
    input_dim = first_features.shape[1]
    hidden_1, hidden_2, hidden_3 = hidden_sizes

    model = NNet_model_1(input_dim, hidden_1, hidden_2, hidden_3).to(device)
    model.apply(customize_weight_init)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_fn = nn.MSELoss(reduction='mean')

    # Own file name, so this loop never overwrites the checkpoint written by
    # the embedding-classifier loop into the same folder.
    os.makedirs(save_folder, exist_ok=True)
    path_checkpoint = os.path.join(
        save_folder, 'nnet_model_1_mse_checkpoint_' + time.strftime('%Y%m%d') + '.pth')

    best_val_loss = None
    best_epoch = 0
    start_time = time.time()

    for epoch in range(epoches):
        # ---- training pass ----
        model.train()
        total_loss_train = 0.0
        num_batch = 0
        for features, target in train:
            features = features.to(device).float()
            # Shape the target as (batch, 1) like the model output; a flat
            # target would silently broadcast inside MSELoss.
            target = target.to(device).float().view(-1, 1)
            opt.zero_grad()
            loss = loss_fn(model(features), target)
            loss.backward()
            opt.step()
            total_loss_train += loss.item()
            num_batch += 1
        train_loss = total_loss_train / max(num_batch, 1)

        # ---- validation pass (no gradients needed) ----
        model.eval()
        squared_error = 0.0
        n_samples = 0
        with torch.no_grad():
            for features, target in test:
                features = features.to(device).float()
                target = target.to(device).float().view(-1, 1)
                squared_error += ((model(features) - target) ** 2).sum().item()
                n_samples += target.numel()
        if n_samples == 0:
            raise ValueError('`test` yields no samples')
        val_loss = squared_error / n_samples

        if verbose:
            cur_lr = opt.param_groups[0]['lr']
            print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} val_mse: {val_loss:.4f} current_lr: {cur_lr: .5f}')

        if best_val_loss is None or val_loss < best_val_loss:
            if verbose: print("Find better model!")
            best_val_loss = val_loss
            best_epoch = epoch + 1
            # torch.save serialises immediately, so the live state dicts can
            # be stored without copying the model first.
            checkpoint = {
                'model_params': {'input_dim': input_dim,
                                 'hidden_1': hidden_1,
                                 'hidden_2': hidden_2,
                                 'hidden_3': hidden_3},
                'epoch': best_epoch,
                'state_dict': model.state_dict(),
                'optimizer': opt.state_dict(),
                'best_val_loss': best_val_loss}
            torch.save(checkpoint, path_checkpoint)

    if verbose:
        print(f"Finished training in {time.time() - start_time:.4f} seconds")
        print('Need {} epoches to reach the best model'.format(best_epoch))
        if best_epoch:
            print(f'Model save to {path_checkpoint}')
    return best_val_loss

# Neural Network 3

In [None]:
def setup_dataloader(X, y, batch_size=256, shuffle=False, num_workers=4):
    '''
    Wrap features and target in a RegressionColumnarDataset and return a
    DataLoader over it.

    Params:
    X: feature frame; its categorical columns are taken from split_features(X)
    y: target values passed through to RegressionColumnarDataset
    batch_size: number of samples per batch
    shuffle: whether the loader reshuffles the data every epoch
    num_workers: number of worker processes used by the loader

    output:
    a torch.utils.data.DataLoader over the dataset
    '''
    catf, _ = split_features(X)
    X_dataset = RegressionColumnarDataset(X, catf, y)

    # `Data` is the alias torch.utils.data was imported under; the lowercase
    # name `data` is the raw DataFrame loaded at the top of the notebook and
    # has no DataLoader attribute.
    return Data.DataLoader(
        X_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

def setup_nn_params(df):
    '''
    Derive the embedding sizes and the number of continuous inputs for `df`.

    For every categorical column returned by split_features(df) the
    cardinality is taken as `max value + 1` and the embedding width as
    `min(50, (cardinality + 1) // 2)`.
    NOTE(review): this assumes the categorical columns hold non-negative
    integer codes - confirm against the encoding step.

    Params:
    df: feature frame

    output:
    dict with
      'emb_szs': list of (cardinality, embedding_width), one per categorical column
      'n_cont' : number of columns of `df` that are not categorical
    '''
    categorical_cols, _ = split_features(df)

    embedding_sizes = []
    for col in categorical_cols:
        cardinality = int(df[col].max() + 1)
        embedding_sizes.append((cardinality, min(50, (cardinality + 1) // 2)))

    return {
        'emb_szs': embedding_sizes,
        'n_cont': len(df.columns) - len(categorical_cols),
    }

def train_loop(train_dl, val_dl, seed, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
               learning_rate, step_size, gamma, weight_decay, class_weight_0 = 2.047, class_weight_1 = 8.867,
               epoches=3, save_folder='temp_model', verbose=True):
    '''
    Train an NNModel classifier with a class-weighted cross-entropy loss and
    checkpoint the epoch with the best validation F1.

    NOTE(review): NNModel is not defined in the visible part of this notebook;
    it has to be defined/imported before this function is called.

    Params:
    train_dl: DataLoader yielding (cat, cont, y) batches
    val_dl: validation DataLoader; its dataset must expose `.cats`, `.conts`, `.y`
    seed: value passed to torch.manual_seed
    emb_szs, n_cont, emb_drop, out_sz, szs, drops: forwarded unchanged to NNModel
             (two class weights are used, so out_sz is presumably 2 - confirm)
    learning_rate, weight_decay: Adam hyper-parameters
    step_size, gamma: StepLR hyper-parameters (step_size is cast to int)
    class_weight_0, class_weight_1: loss weights of class 0 and class 1
    epoches: number of passes over train_dl
    save_folder: folder for the checkpoint file (created if missing)
    verbose: print progress when True

    output:
    best validation F1 rounded to 2 decimals, or None when `epoches` is 0
    '''
    import os
    import time
    from sklearn.metrics import f1_score

    # No unconditional CUDA call here: touching the CUDA runtime first would
    # raise on CPU-only machines and defeat the fallback below.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if verbose: print('Training on: ', device)
    torch.manual_seed(seed)

    model_kwargs = dict(emb_szs=emb_szs,
                        n_cont=n_cont,
                        emb_drop=emb_drop,
                        out_sz=out_sz,
                        szs=szs,
                        drops=drops)
    model = NNModel(**model_kwargs).to(device)

    model.apply(customize_weight_init)
    class_weights = torch.FloatTensor([class_weight_0, class_weight_1]).to(device)
    opt = torch.optim.Adam(model.parameters(),lr = learning_rate, weight_decay=weight_decay)
    loss_fn = nn.CrossEntropyLoss(weight = class_weights, reduction='sum')
    scheduler=torch.optim.lr_scheduler.StepLR(opt, step_size=int(step_size), gamma= gamma, last_epoch=-1)

    # Date tag for the checkpoint name: use the notebook-level `today` string
    # when one is defined, otherwise fall back to the current date.
    try:
        date_tag = today
    except NameError:
        date_tag = time.strftime('%Y%m%d')

    # Some setup
    f1_scores = []
    best_epoch = 0
    best_accuracy = None      # stays None when no epoch is run
    path_checkpoint = None
    start_time = time.time()
    num_batch = len(train_dl)
    cats_tensor = torch.LongTensor(val_dl.dataset.cats).to(device)
    conts_tensor = torch.FloatTensor(val_dl.dataset.conts).to(device)
    y_true = val_dl.dataset.y

    for epoch in range(epoches):
        total_loss_train = 0
        model.train()
        for cat, cont, y in train_dl:
            cat = cat.to(device)
            cont = cont.to(device)
            y = y.to(device)
            opt.zero_grad()
            prob = model(cat, cont)
            loss = loss_fn(prob, y)
            loss.backward()
            opt.step()
            total_loss_train += loss.item()
        train_loss = total_loss_train / max(num_batch, 1)

        # Learning rate that was actually used during this epoch.
        cur_lr = opt.param_groups[0]['lr']
        # Advance the schedule only after the optimizer has stepped; stepping
        # the scheduler first skips the first value of the schedule.
        scheduler.step()

        # Validation needs no gradients; no_grad avoids building an autograd
        # graph over the whole validation set.
        model.eval()
        with torch.no_grad():
            val_prob = model(cats_tensor, conts_tensor)
        _ , predicted_class = torch.max(val_prob, dim = 1)
        f1 = f1_score(y_true, predicted_class.cpu().numpy())
        f1_scores.append(f1)
        if verbose:
            print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} val_f1: {f1:.4f} current_lr: {cur_lr: .5f}')

        # Ties count as "better", so the latest of several equally good epochs wins.
        if f1_scores[-1] == max(f1_scores):
            if verbose: print("Find better model!")
            best_epoch= epoch + 1
            best_accuracy = round(max(f1_scores),2)
            # torch.save serialises immediately, so the live state dicts can
            # be stored without deep-copying model, optimizer and scheduler.
            checkpoint = {
                # freshly constructed (untrained) model of the same architecture
                'model': NNModel(**model_kwargs),
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'optimizer': opt.state_dict(),
                'best_accuracy': best_accuracy}
            if save_folder:
                os.makedirs(save_folder, exist_ok=True)
            path_checkpoint = save_folder + '/embedding_classifier_crossentropy_checkpoint_' + date_tag + '.pth'
            torch.save(checkpoint, path_checkpoint)

    if verbose:
        print(f"Finished training in {time.time() - start_time:.4f} seconds")
        print('Need {} epoches to reach the best model'.format(best_epoch))
        if path_checkpoint is not None:
            print(f'Model save to {path_checkpoint}')
    return best_accuracy