In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
%matplotlib inline

## Read data

In [2]:
# Directory holding the Zillow Prize CSV files. Every input path below is
# derived from this single value, so pointing the notebook at another copy
# of the data only requires changing this line.
DATA_DIR = 'C:/Users/Yousuf Khan/Data/zillow-prize-1'


def _read_zillow_csv(file_name, **read_csv_kwargs):
    """Read ``file_name`` from DATA_DIR with ``low_memory=False``.

    Any extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    return pd.read_csv('%s/%s' % (DATA_DIR, file_name), low_memory=False, **read_csv_kwargs)


print('Loading Properties...')
properties2016 = _read_zillow_csv('properties_2016.csv')
properties2017 = _read_zillow_csv('properties_2017.csv')

print('Loading Train...')
# transactiondate is parsed to datetime so the .dt accessor can be used on it later.
train2016 = _read_zillow_csv('train_2016_v2.csv', parse_dates=['transactiondate'])
train2017 = _read_zillow_csv('train_2017.csv', parse_dates=['transactiondate'])

print('Loading Sample ...')
sample_submission = _read_zillow_csv('sample_submission.csv')

Loading Properties...
Loading Train...
Loading Sample ...


In [3]:
def add_date_features(df):
    """Split ``transactiondate`` into year, month, day and quarter columns.

    The frame is modified in place: the four ``transaction_*`` columns are
    added, the ``transactiondate`` column is removed, and the same frame
    object is returned.
    """
    stamp = df["transactiondate"].dt
    derived_columns = (
        ("transaction_year", stamp.year),
        ("transaction_month", stamp.month),
        ("transaction_day", stamp.day),
        ("transaction_quarter", stamp.quarter),
    )
    for column_name, values in derived_columns:
        df[column_name] = values
    # The raw timestamp is no longer needed once its parts are extracted.
    del df["transactiondate"]
    return df

## Prepare training and test data

In [4]:
%%time
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

sample_submission['parcelid'] = sample_submission['ParcelId']

print('Merge Train & Test with Properties...')
train2016 = pd.merge(train2016, properties2016, how='left', on='parcelid')
train2017 = pd.merge(train2017, properties2017, how='left', on='parcelid')
test_df = pd.merge(sample_submission, properties2016, how='left', on='parcelid')

print('Concat Train 2016 & 2017...')
train_df = pd.concat([train2016, train2017], axis=0)

del properties2016, properties2017, train2016, train2017
gc.collect();

print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

Merge Train & Test with Properties...
Concat Train 2016 & 2017...
Train:  (167888, 63)
Test:  (2985217, 65)
Wall time: 4min 32s


## Choose the attributes to be used for training

In [5]:
# Preview the first rows of the merged test frame (sample submission joined
# with the 2016 properties).
test_df.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712,parcelid,airconditioningtypeid,architecturalstyletypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,0,0,0,0,0,0,10754147,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,0,0,0,0,0,0,10759547,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,0,0,0,0,0,0,10843547,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,0,0,0,0,0,0,10859147,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,0,0,0,0,0,0,10879947,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [6]:
# Preview the first rows of the combined 2016 + 2017 training frame.
train_df.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


## Remove almost-empty columns (more than 98% missing values)

In [7]:
# Collect the columns whose share of missing values exceeds the threshold;
# they are left out of the training features later on.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
null_counts = train_df.isnull().sum()

exclude_missing = []
for column, null_count in null_counts.items():
    # Columns without any nulls are skipped before the fraction is computed.
    if null_count > 0:
        null_share = null_count / float(num_rows)
        if null_share > missing_perc_thresh:
            print(column, "------","{:.5f}".format(null_share))
            exclude_missing.append(column)
print("\nWe exclude:",exclude_missing)
print("\n",len(exclude_missing))

architecturalstyletypeid ------ 0.99721
basementsqft ------ 0.99945
buildingclasstypeid ------ 0.99982
decktypeid ------ 0.99242
finishedsquarefeet13 ------ 0.99955
finishedsquarefeet6 ------ 0.99519
poolsizesum ------ 0.98905
pooltypeid10 ------ 0.99031
pooltypeid2 ------ 0.98643
storytypeid ------ 0.99945
typeconstructiontypeid ------ 0.99689
yardbuildingsqft26 ------ 0.99902
fireplaceflag ------ 0.99765

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']

 13


In [10]:
# Show every column that contains nulls and has exactly two distinct values
# (unique() reports the null as one of them), together with those values.
for column in train_df.columns:
    series = train_df[column]
    if series.isnull().sum() == 0:
        continue
    distinct_values = series.unique()
    if len(distinct_values) == 2:
        print(column, distinct_values)

decktypeid [nan 66.]
hashottuborspa [nan True]
poolcnt [nan  1.]
pooltypeid10 [nan  1.]
pooltypeid2 [nan  1.]
pooltypeid7 [nan  1.]
storytypeid [nan  7.]
fireplaceflag [nan True]
taxdelinquencyflag [nan 'Y']


## Remove all columns that have only one unique non-null value

In [9]:
def _has_single_value(series):
    """Return True when ``series`` holds exactly one distinct non-null value."""
    distinct = len(series.unique())
    # unique() lists the null as a value of its own, so discount it.
    if series.isnull().sum() != 0:
        distinct -= 1
    return distinct == 1


# Columns carrying a single value (optionally mixed with nulls).
exclude_unique = [
    column for column in train_df.columns if _has_single_value(train_df[column])
]
print("We exclude: %s" % exclude_unique)
print("\n",len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']

 9


## Select the columns (features) to be used for training

In [10]:
# Columns dropped by hand, in addition to the near-empty and single-value
# columns found above. 'logerror' is the training target.
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']

# Every remaining column of train_df becomes a training feature, in the
# frame's column order.
excluded_columns = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [
    column for column in train_df.columns if column not in excluded_columns
]
print("We use these for training: %s" % train_features)
print("\n",len(train_features))

We use these for training: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock']

 43


## Categorical features

In [11]:
# A feature is treated as categorical when it has fewer than
# cat_unique_thresh distinct values and its name contains none of the
# markers below. The stored values are positions within train_features.
cat_unique_thresh = 1000
non_categorical_markers = ('sqft', 'cnt', 'nbr', 'number')

cat_feature_inds = [
    position
    for position, feature in enumerate(train_features)
    if len(train_df[feature].unique()) < cat_unique_thresh
    and not any(marker in feature for marker in non_categorical_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear']


## Fill in missing values

In [12]:
print ("Replacing NaN values by -999 !!")
# Both frames are filled in place with the same sentinel value.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

Replacing NaN values by -999 !!


# Training

In [13]:
def print_feature_importance(model, pool, X_train):
    """Print the features of ``X_train`` with their importances, highest first.

    The importances come from ``model.get_feature_importance(pool)`` and are
    paired positionally with ``X_train.columns``. One line is printed per
    feature in the form ``<name><TAB><score>``.
    """
    scored_features = zip(model.get_feature_importance(pool), X_train.columns)
    ranking = sorted(scored_features, reverse=True)
    for importance, feature in ranking:
        print('{}\t{}'.format(feature, importance))

In [14]:
# Hold out 20% of the training rows for evaluation; the fixed random_state
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[train_features],
    train_df['logerror'],
    test_size=0.2,
    random_state=99,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# CatBoost pools; cat_feature_inds marks the categorical columns by position.
all_pool = Pool(data=train_df[train_features], label=train_df['logerror'], cat_features=cat_feature_inds)
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_inds)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feature_inds)

(134310, 43) (134310,)
(33578, 43) (33578,)


In [15]:
# Display the training Pool object (only its default repr is shown).
train_pool

<catboost.core.Pool at 0x1c080186678>

In [16]:
# Preview the first target (logerror) values of the training split.
y_train.head()

32872   -0.001000
51062   -0.029737
70941    0.031500
2118    -0.077560
63050    0.041100
Name: logerror, dtype: float64

In [17]:
# Keyword arguments shared by every CatBoostRegressor built in this notebook.
catboost_parameters = dict(
    iterations=400,
    learning_rate=0.035,
    depth=7,
    verbose=20,
    # l2_leaf_reg=1000 is intentionally left disabled.
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=0,
)

## CatBoostRegressor

In [18]:
# Fit a single CatBoost regressor on the training pool. The hold-out pool is
# passed as eval_set, which the training log reports as the "test" metric.
# The bare fit(...) call is the last expression, so its result is displayed.
model = CatBoostRegressor(**catboost_parameters)
model.fit(train_pool, eval_set=test_pool)

0:	learn: 0.0685721	test: 0.0698997	best: 0.0698997 (0)	total: 636ms	remaining: 4m 13s
20:	learn: 0.0676645	test: 0.0691855	best: 0.0691855 (20)	total: 7.61s	remaining: 2m 17s
40:	learn: 0.0673697	test: 0.0690752	best: 0.0690752 (40)	total: 14.6s	remaining: 2m 7s
60:	learn: 0.0671764	test: 0.0690454	best: 0.0690454 (60)	total: 21.2s	remaining: 1m 57s
80:	learn: 0.0670239	test: 0.0690356	best: 0.0690356 (80)	total: 27.7s	remaining: 1m 48s
100:	learn: 0.0668856	test: 0.0690200	best: 0.0690200 (100)	total: 34.4s	remaining: 1m 41s
120:	learn: 0.0667335	test: 0.0690136	best: 0.0690136 (120)	total: 41s	remaining: 1m 34s
140:	learn: 0.0666128	test: 0.0690146	best: 0.0690097 (129)	total: 46.8s	remaining: 1m 26s
160:	learn: 0.0665155	test: 0.0690159	best: 0.0690097 (129)	total: 52.9s	remaining: 1m 18s
180:	learn: 0.0664104	test: 0.0690093	best: 0.0690065 (173)	total: 59.4s	remaining: 1m 11s
200:	learn: 0.0663087	test: 0.0690160	best: 0.0690065 (173)	total: 1m 5s	remaining: 1m 5s
220:	learn: 0.0

<catboost.core.CatBoostRegressor at 0x1c0e2c95390>

In [19]:
# Rank the features of the single fitted model, most important first.
print_feature_importance(model, train_pool, X_train)

regionidzip	6.953406502023265
yearbuilt	6.281918858949721
finishedsquarefeet12	6.1919648863190755
transaction_month	6.173953224491355
calculatedfinishedsquarefeet	5.751172272035678
propertycountylandusecode	5.183058365739174
transaction_day	4.9891619256545034
taxamount	4.826309698346977
regionidcity	4.344205969502303
lotsizesquarefeet	3.752647671848723
regionidneighborhood	3.544050176441048
structuretaxvaluedollarcnt	3.4424464678274753
propertylandusetypeid	3.1758155985692023
latitude	3.1229722917229443
taxvaluedollarcnt	3.095788564573395
assessmentyear	3.0193749277922888
buildingqualitytypeid	2.7738720134653327
heatingorsystemtypeid	2.3993810576985624
transaction_quarter	2.389034293801706
airconditioningtypeid	2.1900429596160516
landtaxvaluedollarcnt	2.090379667982566
longitude	2.084941793526099
censustractandblock	1.9819640092392496
taxdelinquencyyear	1.6779208897534241
bedroomcnt	1.3816512052728742
rawcensustractandblock	1.2896210810684796
garagetotalsqft	1.2115343724459853
regionid

## Ensemble - CatBoostRegressor

In [21]:
num_ensembles = 5
# Train num_ensembles models that differ only in their random seed; they are
# averaged at prediction time.
models = []
for i in range(num_ensembles):
    print("\nTraining (ensemble): %d ..." % (i))
    # Build a per-member copy of the settings instead of overwriting
    # catboost_parameters['random_seed']: the shared dict keeps its configured
    # seed, so re-running an earlier cell gives the same model as before.
    member_parameters = dict(catboost_parameters, random_seed=i)
    member = CatBoostRegressor(**member_parameters)
    member.fit(train_pool, eval_set=test_pool)
    models.append(member)
    print('-- Feature Importance --')
    print_feature_importance(member, train_pool, X_train)


Training (ensemble): 0 ...
0:	learn: 0.0685721	test: 0.0698997	best: 0.0698997 (0)	total: 317ms	remaining: 2m 6s
20:	learn: 0.0676645	test: 0.0691855	best: 0.0691855 (20)	total: 7.27s	remaining: 2m 11s
40:	learn: 0.0673697	test: 0.0690752	best: 0.0690752 (40)	total: 14.2s	remaining: 2m 4s
60:	learn: 0.0671764	test: 0.0690454	best: 0.0690454 (60)	total: 20.8s	remaining: 1m 55s
80:	learn: 0.0670239	test: 0.0690356	best: 0.0690356 (80)	total: 27.2s	remaining: 1m 47s
100:	learn: 0.0668856	test: 0.0690200	best: 0.0690200 (100)	total: 34s	remaining: 1m 40s
120:	learn: 0.0667335	test: 0.0690136	best: 0.0690136 (120)	total: 40.5s	remaining: 1m 33s
140:	learn: 0.0666128	test: 0.0690146	best: 0.0690097 (129)	total: 46.4s	remaining: 1m 25s
160:	learn: 0.0665155	test: 0.0690159	best: 0.0690097 (129)	total: 52.4s	remaining: 1m 17s
180:	learn: 0.0664104	test: 0.0690093	best: 0.0690065 (173)	total: 58.9s	remaining: 1m 11s
200:	learn: 0.0663087	test: 0.0690160	best: 0.0690065 (173)	total: 1m 5s	remai

240:	learn: 0.0661877	test: 0.0690282	best: 0.0690016 (146)	total: 1m 16s	remaining: 50.8s
260:	learn: 0.0661048	test: 0.0690282	best: 0.0690016 (146)	total: 1m 23s	remaining: 44.2s
280:	learn: 0.0660352	test: 0.0690325	best: 0.0690016 (146)	total: 1m 28s	remaining: 37.6s
300:	learn: 0.0659696	test: 0.0690387	best: 0.0690016 (146)	total: 1m 34s	remaining: 31.2s
320:	learn: 0.0659028	test: 0.0690358	best: 0.0690016 (146)	total: 1m 40s	remaining: 24.8s
340:	learn: 0.0658387	test: 0.0690308	best: 0.0690016 (146)	total: 1m 47s	remaining: 18.5s
360:	learn: 0.0657521	test: 0.0690295	best: 0.0690016 (146)	total: 1m 53s	remaining: 12.2s
380:	learn: 0.0656762	test: 0.0690237	best: 0.0690016 (146)	total: 1m 59s	remaining: 5.95s
399:	learn: 0.0656024	test: 0.0690244	best: 0.0690016 (146)	total: 2m 5s	remaining: 0us
bestTest = 0.06900160924
bestIteration = 146
Shrink model to first 147 iterations.
-- Feature Importance --
finishedsquarefeet12	8.844267691835219
regionidzip	7.3654528123336425
taxamo

In [22]:
submission = pd.DataFrame({
    'ParcelId': test_df['parcelid'],
})

# Submission column label -> transaction date used to derive the date
# features for that column.
# NOTE(review): every label is paired with the last day of the preceding
# month (e.g. '201610' -> 2016-09-30) - confirm this offset is intentional.
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    # add_date_features drops 'transactiondate' again, so the column is
    # re-created with this label's date on every pass.
    test_df['transactiondate'] = test_date
    test_df = add_date_features(test_df)
    # Slice the model inputs once per label rather than once per model; the
    # selection does not depend on which ensemble member is predicting.
    test_features = test_df[train_features]
    y_pred = 0.0
    for i, member in enumerate(models):
        print("Ensemble:", i)
        y_pred += member.predict(test_features)
    # Average over the models that were actually trained.
    y_pred /= len(models)
    submission[label] = y_pred

submission_major = 2
print("Creating submission: submission_%03d.csv ..." % (submission_major))
submission.to_csv(
    'C:/Users/Yousuf Khan/Data/zillow-prize-1/submission_%03d.csv' % (submission_major),
    float_format='%.4f',
    index=False)
print("Finished.")

Predicting for: 201610 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201611 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201612 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201710 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201711 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201712 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Creating submission: submission_002.csv ...
Finished.
