## Import Packages

In [2]:
# required packages
import gc
import os
import warnings

import numba as nb
import numpy as np
import pandas as pd

# Machine Learning
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# display set up
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Show floats echoed by IPython with 4 decimal places.
%precision 4
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Print small numpy floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
# Display pandas floats with 3 decimal places.
pd.set_option("display.precision", 3)
# Allow up to 999 rows before pandas truncates a displayed frame.
pd.options.display.max_rows = 999

## Load Data

In [25]:
# Function for memory reduction
# (no @nb.jit here: the body is pure pandas, which numba cannot type, so the
#  decorator could only fall back to object mode or fail to compile)
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of ``df`` to the smallest dtype that covers their value range.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    [min, max] interval contains the column's minimum and maximum (bounds
    inclusive). Float columns are narrowed to float16 or float32 on the same
    range test and otherwise kept as float64. Columns whose dtype is not one of
    the listed numeric dtypes are left untouched.

    NOTE(review): the float test only looks at the value range, not at
    precision, so narrowing to float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column yields NaN bounds, fails every test and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type covers the range
            # (this includes all-NaN columns and columns containing +/-inf).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return(df)

In [26]:
%%time

prop_2016 = pd.read_csv('data/prop_2016.csv')
prop_2016 = reduce_mem_usage(prop_2016)

prop_2017 = pd.read_csv('data/prop_2017.csv')
prop_2017 = reduce_mem_usage(prop_2017)

train = pd.read_csv('data/train.csv')
train = reduce_mem_usage(train)

Wall time: 1min 41s


In [27]:
def drop_features(df):
    """
    Return a new frame without the id column and the columns ruled out during EDA.

    The input frame is not modified. Every listed column must be present in
    ``df``; ``DataFrame.drop`` raises ``KeyError`` otherwise.
    """
    # Identifier, not a feature
    id_columns = ['parcelid']

    # Too many missing values (from EDA)
    mostly_missing = ['architecturalstyletypeid', 'buildingclasstypeid',
                      'decktypeid', 'typeconstructiontypeid']

    # Duplicates of other columns (from EDA)
    duplicated = ['calculatedbathnbr', 'finishedsquarefeet50']

    # Highly correlated with other related columns
    highly_correlated = ['fullbathcnt', 'censustractandblock']

    # Columns with low feature importance (fireplaceflag, fips, finishedsquarefeet13,
    # poolsizesum, pooltypeid10, basementsqft, storytypeid, yardbuildingsqft26,
    # finishedsquarefeet6, fireplacecnt, regionidcounty, propertyzoningdesc,
    # propertycountylandusecode) are currently kept.

    unwanted = id_columns + mostly_missing + duplicated + highly_correlated
    return(df.drop(columns=unwanted))

In [28]:
def encoding(train, prop_2016, prop_2017, categorical_features):
    """
    Replace each categorical column with small consecutive integer codes shared by all three frames.

    For every column in ``categorical_features`` the values of the three frames
    are converted to strings and pooled; the distinct strings are sorted and
    each value is replaced by the position of its string in that sorted list
    (0, 1, 2, ...). Because the codes come from the pooled values, the same
    raw value gets the same code in ``train``, ``prop_2016`` and ``prop_2017``.

    Missing values are turned into the literal string 'nan' before coding, so
    they form an ordinary category whose code is wherever 'nan' sorts among the
    other strings (it is NOT guaranteed to be 0).

    Parameters
    ----------
    train, prop_2016, prop_2017 : pandas.DataFrame
        Frames that all contain every column in ``categorical_features``.
        The listed columns are overwritten in place.
    categorical_features : list of str
        Names of the columns to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, prop_2016, prop_2017)`` - the same objects that were passed in.
    """
    frames = (train, prop_2016, prop_2017)
    # Row offsets used to cut the pooled code vector back into per-frame pieces.
    bounds = np.cumsum([0] + [len(frame) for frame in frames])

    for col in categorical_features:
        # Pool one column at a time so only a single combined column is alive at once.
        # fillna keeps missing values as the 'nan' category even on pandas versions
        # whose astype(str) preserves missing values instead of stringifying them.
        pooled = pd.concat([frame[col].astype(str).fillna('nan') for frame in frames],
                           ignore_index=True)
        # sort=True numbers the categories in sorted-string order.
        codes, _ = pd.factorize(pooled, sort=True)
        for frame, start, stop in zip(frames, bounds[:-1], bounds[1:]):
            frame[col] = codes[start:stop]
    return(train, prop_2016, prop_2017)

In [29]:
# Columns handed to CatBoost as categorical features
categorical_features = [
    'airconditioningtypeid', 'buildingqualitytypeid',
    'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid2', 'pooltypeid7',
    'propertylandusetypeid', 'rawcensustractandblock',
    'regionidcity', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear',
    'taxdelinquencyflag', 'taxdelinquencyyear', 'geo_cluster',
]

# Integer-encode the categorical columns consistently across the three frames
train, prop_2016, prop_2017 = encoding(train, prop_2016, prop_2017, categorical_features)

# Separate the target (numpy array) from the feature frame (still a DataFrame)
y = train.logerror.values
X = drop_features(train).drop(columns='logerror')

# Specify feature names
feature_names = list(X.columns)

# Column positions of the categorical features inside X
categorical_indices = [i for i, n in enumerate(X.columns) if n in categorical_features]

# Shuffled 80/20 split; train_test_split draws from the global numpy RNG seeded here
np.random.seed(910)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Keep outlier targets (roughly outside the 99% percentile) out of the training part only;
# the validation part is left unfiltered
outlier_threshold = 0.4
mask = np.abs(y_train) <= outlier_threshold
X_train = X_train.iloc[mask]
y_train = y_train[mask]

# The held-out split is labelled "val" in the printed summary
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_test.shape))
print("y_val shape: {}".format(y_test.shape))

X_train shape: (131489, 127)
y_train shape: (131489,)
X_val shape: (33578, 127)
y_val shape: (33578,)


In [36]:
# CatBoost parameter settings
# The objective key must be spelled 'loss_function'; it was previously misspelled
# 'loss_functiomn', so the MAE objective was never passed under its real name.
# NOTE(review): scores recorded in the outputs below predate this fix and may change.
params = {'loss_function': 'MAE',   # optimise mean absolute error
          'eval_metric': 'MAE',     # metric reported on the eval set / used for early stopping
          'nan_mode': 'Min',        # treat missing numeric values as smaller than any observed value
          'iterations': 1000,       # maximum number of trees
          'l2_leaf_reg': 5}         # L2 regularisation on leaf values

In [37]:
%%time
# Train CatBoost Regressor with cross-validated early-stopping
test_pool = Pool(X_test, y_test.astype(np.float64), cat_features=categorical_indices)

np.random.seed(910)
model = CatBoostRegressor(**params)
model.fit(X_train, y_train,
          cat_features=categorical_indices,
          use_best_model=True, eval_set=test_pool, early_stopping_rounds=50, verbose=False)


# Evaluate on train and validation sets
print(f"Train score: {abs(model.predict(X_train) - y_train).mean() * 100}")
print(f"Test score: {abs(model.predict(X_test) - y_test).mean() * 100}")

Train score: 5.106739281624631
Test score: 6.86613889576145
Wall time: 5min 27s


## Training on all Data

In [39]:
# Drop outlier targets from the full data set before the final fit
outlier_threshold = 0.4
mask = np.abs(y) <= outlier_threshold
X = X.iloc[mask]
y = y[mask]

# Train a bag of models that differ only in their CatBoost random seed
bags = 8
models = []
params['iterations'] = 1000
for bag_id in range(bags):
    print("Start training model {}".format(bag_id))
    params['random_seed'] = bag_id
    np.random.seed(910)
    model = CatBoostRegressor(**params)
    model.fit(X, y, cat_features=categorical_indices, verbose=False)
    models.append(model)

# Sanity check only: X_test was split off X, so its non-outlier rows were part of
# this fit and the scores below are not a held-out estimate
for i, model in enumerate(models):
    sanity_score = np.abs(model.predict(X_test) - y_test).mean() * 100
    print("model {}: {}".format(i, sanity_score))

Start training model 0
Start training model 1
Start training model 2
Start training model 3
Start training model 4
Start training model 5
Start training model 6
Start training model 7
model 0: 6.806486062415555
model 1: 6.807822975141546
model 2: 6.8120287008259295
model 3: 6.810470335163292
model 4: 6.8080887912448205
model 5: 6.81278833983821
model 6: 6.808338311486676
model 7: 6.806888600212144


In [40]:
def transform_test_features(prop_2016, prop_2017):
    """
    Build the prediction-time feature frames for the 2016 and 2017 property tables.

    Each table goes through ``drop_features`` and then receives three constant
    date columns, which are moved to the front of the frame. Returns the pair
    ``(prop_2016, prop_2017)`` as new frames.
    """
    date_columns = ['year', 'month', 'quarter']

    def _with_date_columns(frame, year_code):
        # drop_features returns a new frame, so the caller's table is not modified
        features = drop_features(frame)

        # The property tables carry no transaction date: use a fixed placeholder
        # month (8) and quarter (3); year is coded 0 for 2016 and 1 for 2017
        features['year'] = year_code
        features['month'] = 8
        features['quarter'] = 3

        # Move the three date columns (appended last) to the front so the column
        # positions line up with the categorical indices
        # NOTE(review): assumes the training frame has year/month/quarter first - confirm
        return features[date_columns + list(features.columns[:-3])]

    return(_with_date_columns(prop_2016, 0), _with_date_columns(prop_2017, 1))

def _monthly_submission(parcel_ids, mean_pred, year):
    """Build one year's submission frame: ParcelId plus identical Oct/Nov/Dec prediction columns."""
    frame = pd.DataFrame()
    frame['ParcelId'] = parcel_ids
    # Round to 4 decimals through string formatting, as required for the submission
    rounded = [float(format(x, '.4f')) for x in mean_pred]
    for month in ('10', '11', '12'):
        frame['{}{}'.format(year, month)] = rounded
    return frame


def predict_and_export(models, prop_2016, prop_2017, file_name):
    """
    Predict with every model, average the predictions and write the submission CSV.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``predict``. Must not be empty.
    prop_2016, prop_2017 : pandas.DataFrame
        Property tables containing a ``parcelid`` column plus the columns
        expected by ``transform_test_features``.
    file_name : str or path-like
        Destination of the CSV. Missing parent directories are created.

    Returns
    -------
    tuple
        ``(submission, pred_2016, pred_2017)``: the merged submission frame and
        the per-model prediction lists for each year.

    Raises
    ------
    ValueError
        If ``models`` is empty (there would be nothing to average).
    """
    models = list(models)
    if not models:
        raise ValueError("predict_and_export needs at least one model")

    # Create the output folder up front so a missing directory cannot make
    # to_csv fail after all the predictions have already been computed
    if isinstance(file_name, (str, os.PathLike)):
        out_dir = os.path.dirname(file_name)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Keep the ids before parcelid is dropped from the feature frames
    parcel_ids_2016 = prop_2016.parcelid
    parcel_ids_2017 = prop_2017.parcelid

    # Prepare dataset for prediction
    prop_2016, prop_2017 = transform_test_features(prop_2016, prop_2017)

    # Make prediction across multiple models
    pred_2016, pred_2017 = [], []
    for i, model in enumerate(models):
        print("Start model {} (2016)".format(i))
        pred_2016.append(model.predict(prop_2016))
        print("Start model {} (2017)".format(i))
        pred_2017.append(model.predict(prop_2017))

    # Take average across all models
    mean_pred_2016 = np.mean(pred_2016, axis=0)
    mean_pred_2017 = np.mean(pred_2017, axis=0)

    # Formatting for submission: the same prediction is used for all three months of a year
    submission_2016 = _monthly_submission(parcel_ids_2016, mean_pred_2016, 2016)
    submission_2017 = _monthly_submission(parcel_ids_2017, mean_pred_2017, 2017)

    submission = pd.merge(submission_2016, submission_2017, how='inner', on='ParcelId')
    submission.to_csv(file_name, index=False)
    print("Submission Successfully Created.")
    return(submission, pred_2016, pred_2017)

In [41]:
%%time
del train
gc.collect()

file_name = 'submission/final_cat.csv'
submission, pred_2016, pred_2017 = predict_and_export(models, prop_2016, prop_2017, file_name)

Start model 0 (2016)
Start model 0 (2017)
Start model 1 (2016)
Start model 1 (2017)
Start model 2 (2016)
Start model 2 (2017)
Start model 3 (2016)
Start model 3 (2017)
Start model 4 (2016)
Start model 4 (2017)
Start model 5 (2016)
Start model 5 (2017)
Start model 6 (2016)
Start model 6 (2017)
Start model 7 (2016)
Start model 7 (2017)
Length of submission DataFrame: 2985217
Submission header:
   ParcelId     201610     201611     201612     201710     201711     201712
0  10754147  1.690e-02  1.690e-02  1.690e-02  1.830e-02  1.830e-02  1.830e-02
1  10759547  1.200e-02  1.200e-02  1.200e-02  1.530e-02  1.530e-02  1.530e-02
2  10843547  5.100e-03  5.100e-03  5.100e-03  3.900e-03  3.900e-03  3.900e-03
3  10859147  2.010e-02  2.010e-02  2.010e-02  2.040e-02  2.040e-02  2.040e-02
4  10879947  9.000e-04  9.000e-04  9.000e-04  5.000e-04  5.000e-04  5.000e-04
Wall time: 7min 1s
