In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# low_memory=False makes pandas parse each file in a single pass, so every
# column's dtype is inferred from the whole column rather than chunk by chunk.
print('Loading Properties...')
properties2016, properties2017 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../Resources/properties_2016.csv',
                     '../Resources/properties_2017.csv')
)

# The transaction logs carry a 'transactiondate' column that is parsed to
# datetime here so the .dt accessor can be used on it later.
print('Loading Train...')
train2016, train2017 = (
    pd.read_csv(csv_path, parse_dates=['transactiondate'], low_memory=False)
    for csv_path in ('../Resources/train_2016_v2.csv',
                     '../Resources/train_2017.csv')
)

print('Loading Sample ...')
sample_submission = pd.read_csv(
    '../Resources/sample_submission.csv',
    low_memory=False,
)

Loading Properties...
Loading Train...
Loading Sample ...


In [3]:
def add_date_features(df):
    """Expand ``transactiondate`` into separate calendar-part columns.

    The frame is modified in place: ``transaction_year``,
    ``transaction_month``, ``transaction_day`` and ``transaction_quarter``
    are added (or overwritten if they already exist) and the original
    ``transactiondate`` column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``transactiondate`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    stamps = df["transactiondate"].dt
    for part in ("year", "month", "day", "quarter"):
        df["transaction_" + part] = getattr(stamps, part)
    # The raw timestamp is no longer needed once its parts are extracted.
    del df["transactiondate"]
    return df

In [4]:
# Replace 'transactiondate' with year/month/day/quarter columns
# (add_date_features mutates the frame and returns it).
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

# The sample submission names its key 'ParcelId'; add a lower-case copy so the
# frame can be joined to the properties table on 'parcelid'.
sample_submission['parcelid'] = sample_submission['ParcelId']

print('Merge Train & Test with Properties...')
train2016 = pd.merge(train2016, properties2016, how='left', on='parcelid')
train2017 = pd.merge(train2017, properties2017, how='left', on='parcelid')
# NOTE(review): test_df is built from the 2016 properties only, yet it is later
# used to predict the 2017 submission columns as well — confirm this is intended.
test_df = pd.merge(sample_submission, properties2016, how='left', on='parcelid')

print('Concat Train 2016 & 2017...')
# ignore_index=True gives train_df a fresh 0..n-1 index. Each merged frame is
# numbered from 0, so without it the stacked frame would carry duplicated row
# labels, which breaks or silently duplicates rows in label-based alignment.
train_df = pd.concat([train2016, train2017], axis=0, ignore_index=True)

# Free the large intermediate frames; only train_df and test_df are used below.
del properties2016, properties2017, train2016, train2017
gc.collect();

print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

Merge Train & Test with Properties...
Concat Train 2016 & 2017...
Train:  (167888, 63)
Test:  (2985217, 65)


In [5]:
# Columns with more than 98% missing values in the training data are dropped.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]

# Fraction of null entries per column, in the frame's column order.
missing_frac_by_col = train_df.isnull().sum() / float(num_rows)
exclude_missing = (
    missing_frac_by_col[missing_frac_by_col > missing_perc_thresh]
    .index
    .tolist()
)

print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']
13


In [6]:
# Collect columns that hold exactly one distinct non-null value.
# unique() reports NaN as a value of its own, so one is subtracted from the
# count whenever the column contains any nulls.
exclude_unique = [
    col
    for col in train_df.columns
    if len(train_df[col].unique()) - int(train_df[col].isnull().any()) == 1
]
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
9


In [7]:
# 'logerror' is the regression target (used as the label when the pools are
# built), so it must not appear among the features.
# NOTE(review): 'parcelid' looks like a row identifier and 'propertyzoningdesc'
# like free text — confirm that is why they are excluded.
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']

# Everything not ruled out by one of the three exclusion lists is a feature;
# the original column order of train_df is kept.
excluded_columns = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [col for col in train_df.columns if col not in excluded_columns]

print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock']
43


In [8]:
# A feature is treated as categorical when it has fewer than
# cat_unique_thresh distinct values (NaN counts as one value here) and its
# name contains none of the substrings below, which mark sizes and counts.
cat_unique_thresh = 1000
numeric_name_markers = ('sqft', 'cnt', 'nbr', 'number')

# Positions are indices into train_features, as expected by catboost.Pool.
cat_feature_inds = [
    position
    for position, feature in enumerate(train_features)
    if len(train_df[feature].unique()) < cat_unique_thresh
    and not any(marker in feature for marker in numeric_name_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear']


In [9]:
# Encode missing values with the sentinel -999 in both frames, in place.
print("Replacing NaN values by -999 !!")
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

Replacing NaN values by -999 !!


In [10]:
def print_feature_importance(model, pool, X_train):
    """Print every feature with its importance score, highest score first.

    Parameters
    ----------
    model
        Object exposing ``get_feature_importance(pool)``; the returned
        scores are paired positionally with ``X_train.columns``.
    pool
        Passed through unchanged to ``model.get_feature_importance``.
    X_train
        Object whose ``columns`` attribute lists the feature names.

    Returns
    -------
    None
        One ``name<TAB>score`` line per feature is written to stdout.
    """
    scores = model.get_feature_importance(pool)
    ranked = list(zip(scores, X_train.columns))
    # Sorting (score, name) tuples in reverse: ties on score are broken by
    # name in descending order.
    ranked.sort(reverse=True)
    for score, name in ranked:
        print('{}\t{}'.format(name, score))

In [11]:
# Categorical columns stored as float64 are cast to integers in train_df.
# Missing values were already replaced by -999, so the cast cannot hit NaN.
# NOTE(review): presumably needed because CatBoost rejects float-valued
# categorical features — confirm against the CatBoost docs.
cat_feature_names = [train_features[ind] for ind in cat_feature_inds]
for col in cat_feature_names:
    is_float64 = train_df[col].dtype == 'float'
    if not is_float64:
        continue
    train_df[col] = train_df[col].astype('int')

In [12]:
# Feature matrix and target for the full training set.
features_all = train_df[train_features]
target_all = train_df['logerror']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    target_all,
    test_size=0.2,
    random_state=99,
)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

# CatBoost pools; cat_feature_inds are positions within train_features.
all_pool = Pool(data=features_all, label=target_all, cat_features=cat_feature_inds)
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_inds)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feature_inds)

(134310, 43) (134310,)
(33578, 43) (33578,)


In [13]:
# Shared CatBoost settings. 'random_seed' is overwritten per ensemble member
# by the ensemble training cell further down.
catboost_parameters = dict(
    iterations=400,
    learning_rate=0.035,
    depth=7,
    verbose=20,            # log every 20th iteration
    # l2_leaf_reg=1000,    # disabled; CatBoost's default is used instead
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=0,
)

In [14]:
# Fit one regressor on the training pool, with the held-out pool as the
# evaluation set. fit() hands back the fitted regressor, so the chained call
# binds it to `model`; the bare name below makes it the cell's displayed value.
model = CatBoostRegressor(**catboost_parameters).fit(train_pool, eval_set=test_pool)
model

0:	learn: 0.0683956	test: 0.0696829	best: 0.0696829 (0)	total: 25.3ms	remaining: 10.1s
20:	learn: 0.0677368	test: 0.0692307	best: 0.0692307 (20)	total: 549ms	remaining: 9.91s
40:	learn: 0.0675111	test: 0.0691698	best: 0.0691698 (40)	total: 1.02s	remaining: 8.94s
60:	learn: 0.0673069	test: 0.0691288	best: 0.0691288 (60)	total: 1.5s	remaining: 8.33s
80:	learn: 0.0671440	test: 0.0690974	best: 0.0690957 (77)	total: 1.99s	remaining: 7.83s
100:	learn: 0.0670153	test: 0.0690786	best: 0.0690775 (97)	total: 2.5s	remaining: 7.4s
120:	learn: 0.0669019	test: 0.0690779	best: 0.0690746 (114)	total: 2.99s	remaining: 6.89s
140:	learn: 0.0667897	test: 0.0690826	best: 0.0690746 (114)	total: 3.48s	remaining: 6.4s
160:	learn: 0.0666795	test: 0.0690751	best: 0.0690697 (158)	total: 3.96s	remaining: 5.88s
180:	learn: 0.0665374	test: 0.0690532	best: 0.0690530 (179)	total: 4.44s	remaining: 5.38s
200:	learn: 0.0664186	test: 0.0690501	best: 0.0690489 (194)	total: 4.96s	remaining: 4.91s
220:	learn: 0.0663076	test

<catboost.core.CatBoostRegressor at 0x7f1b68214908>

In [15]:
# Train num_ensembles regressors that differ only in their random seed.
num_ensembles = 5
models = []
for seed in range(num_ensembles):
    print("\nTraining (ensemble): %d ..." % (seed))
    # The shared parameter dict is updated in place, so after the loop it
    # holds the seed of the last member.
    catboost_parameters['random_seed'] = seed
    member = CatBoostRegressor(**catboost_parameters)
    member.fit(train_pool, eval_set=test_pool)
    models.append(member)
    print('-- Feature Importance --')
    print_feature_importance(member, train_pool, X_train)


Training (ensemble): 0 ...
0:	learn: 0.0683956	test: 0.0696829	best: 0.0696829 (0)	total: 23.3ms	remaining: 9.31s
20:	learn: 0.0677368	test: 0.0692307	best: 0.0692307 (20)	total: 569ms	remaining: 10.3s
40:	learn: 0.0675111	test: 0.0691698	best: 0.0691698 (40)	total: 1.04s	remaining: 9.08s
60:	learn: 0.0673069	test: 0.0691288	best: 0.0691288 (60)	total: 1.55s	remaining: 8.61s
80:	learn: 0.0671440	test: 0.0690974	best: 0.0690957 (77)	total: 2.04s	remaining: 8.02s
100:	learn: 0.0670153	test: 0.0690786	best: 0.0690775 (97)	total: 2.55s	remaining: 7.56s
120:	learn: 0.0669019	test: 0.0690779	best: 0.0690746 (114)	total: 3.06s	remaining: 7.04s
140:	learn: 0.0667897	test: 0.0690826	best: 0.0690746 (114)	total: 3.56s	remaining: 6.55s
160:	learn: 0.0666795	test: 0.0690751	best: 0.0690697 (158)	total: 4.06s	remaining: 6.02s
180:	learn: 0.0665374	test: 0.0690532	best: 0.0690530 (179)	total: 4.56s	remaining: 5.52s
200:	learn: 0.0664186	test: 0.0690501	best: 0.0690489 (194)	total: 5.09s	remaining: 

260:	learn: 0.0660528	test: 0.0690281	best: 0.0690099 (202)	total: 6.53s	remaining: 3.48s
280:	learn: 0.0659589	test: 0.0690339	best: 0.0690099 (202)	total: 7.03s	remaining: 2.98s
300:	learn: 0.0658637	test: 0.0690352	best: 0.0690099 (202)	total: 7.53s	remaining: 2.48s
320:	learn: 0.0657706	test: 0.0690345	best: 0.0690099 (202)	total: 8.05s	remaining: 1.98s
340:	learn: 0.0656598	test: 0.0690461	best: 0.0690099 (202)	total: 8.55s	remaining: 1.48s
360:	learn: 0.0655360	test: 0.0690529	best: 0.0690099 (202)	total: 9.05s	remaining: 978ms
380:	learn: 0.0654389	test: 0.0690528	best: 0.0690099 (202)	total: 9.55s	remaining: 476ms
399:	learn: 0.0653341	test: 0.0690561	best: 0.0690099 (202)	total: 10s	remaining: 0us
bestTest = 0.06900991255
bestIteration = 202
Shrink model to first 203 iterations.
-- Feature Importance --
regionidzip	8.250825217549652
yearbuilt	7.054449617166805
finishedsquarefeet12	6.977635906555425
transaction_month	6.691409589557911
propertycountylandusecode	5.975345208028586

In [17]:
# Apply the same float -> int cast to the categorical columns of test_df that
# was applied to train_df (NaNs were already replaced by -999).
#
# The categorical list also contains the transaction_* date features, which
# test_df only receives later, inside the prediction loop, via
# add_date_features. Features that are not columns of test_df yet are skipped
# so this cell does not raise KeyError on a fresh run; the date features come
# from the .dt accessor and need no cast.
for i in [train_features[ind] for ind in cat_feature_inds]:
    if i in test_df.columns and test_df[i].dtype == 'float':
        test_df[i] = test_df[i].astype('int')

In [18]:
# One row per parcel; a prediction column is added for every label below.
submission = pd.DataFrame({
    'ParcelId': test_df['parcelid'],
})

# Submission column label -> transaction date used to build the date features.
# NOTE(review): each label is paired with the last day of the *previous* month
# (e.g. '201610' -> 2016-09-30), so the model sees month 9 for the October
# column. This looks deliberate — confirm.
# NOTE(review): test_df was merged with the 2016 properties only, so the 2017
# columns are predicted from 2016 property values — confirm this is intended.
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    # Stamp every row with the target date, then expand it into the
    # transaction_* columns (add_date_features drops 'transactiondate' again).
    test_df['transactiondate'] = test_date
    test_df = add_date_features(test_df)

    # Select the feature columns once per date: the selection copies the whole
    # test frame and is identical for every ensemble member.
    X_pred = test_df[train_features]

    # Average the predictions of all ensemble members.
    y_pred = 0.0
    for i in range(num_ensembles):
        print("Ensemble:", i)
        y_pred += models[i].predict(X_pred)
    y_pred /= num_ensembles
    submission[label] = y_pred

    # Release the copy before the next date builds a new one.
    del X_pred

submission_major = 2
print("Creating submission: submission_%03d.csv ..." % (submission_major))
submission.to_csv(
    'submission_%03d.csv' % (submission_major),
    float_format='%.4f',
    index=False)
print("Finished.")

Predicting for: 201610 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201611 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201612 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201710 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201711 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201712 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Creating submission: submission_002.csv ...
Finished.
