In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Columns handed to CatBoost as categorical features. Despite the "_inds"
# suffix, these are column names, not positional indices.
cat_feature_inds = [
    'airconditioningtypeid',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'fips',
    'regionidcounty',
    'buildingqualitytypeid_fill',
    'regionidcity_fill',
    'year',
    'regionidneighborhood_fill',
    'taxdelinquencyflag',
]

# Training table; later cells expect it to contain a `logerror` target column.
train_df = pd.read_csv("../output/outlier_remove.csv")

In [3]:
# Replace every missing value in the training frame with the sentinel -999.
train_df.fillna(-999, inplace=True)

# Categorical columns that pandas parsed as float64 are cast to int so their
# values are whole numbers rather than floats when used as categories.
float_categoricals = [
    column for column in cat_feature_inds if train_df[column].dtype == 'float'
]
for column in float_categoricals:
    train_df[column] = train_df[column].astype('int')

In [4]:
def print_feature_importance(model, pool, X_train):
    """Print one `name<TAB>score` line per feature, highest score first.

    Args:
        model: fitted object exposing `get_feature_importance(pool)`.
        pool: dataset passed through to `get_feature_importance`.
        X_train: frame whose `.columns` supply the feature names, in the same
            order as the scores returned by the model.
    """
    ranked = list(zip(model.get_feature_importance(pool), X_train.columns))
    # Sort on (score, name) tuples, descending, so ties fall back to the name.
    ranked.sort(reverse=True)
    for importance, column in ranked:
        print('{}\t{}'.format(column, importance))

In [6]:
# Every column of the training frame except the target is used as a model input.
train_features = [column for column in train_df.columns if column != 'logerror']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[train_features],
    train_df['logerror'],
    test_size=0.2,
    random_state=99,
)

# CatBoost datasets: the full table, the training split and the held-out split,
# each tagged with the same categorical columns.
all_pool = Pool(
    data=train_df[train_features],
    label=train_df['logerror'],
    cat_features=cat_feature_inds,
)
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_inds)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feature_inds)

In [7]:
# Shared CatBoost settings: keyword arguments unpacked into CatBoostRegressor.
catboost_parameters = dict(
    iterations=400,
    learning_rate=0.035,
    depth=7,
    verbose=20,  # log a progress line every 20 iterations
    # l2_leaf_reg=1000,  # left disabled
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=0,
)

In [8]:
# Baseline: one regressor built from the shared settings, fitted on the
# training pool with the held-out pool supplied as the evaluation set.
model = CatBoostRegressor(**catboost_parameters)
model.fit(X=train_pool, eval_set=test_pool)

0:	learn: 0.0527219	test: 0.0532467	best: 0.0532467 (0)	total: 27.4ms	remaining: 10.9s
20:	learn: 0.0520318	test: 0.0526561	best: 0.0526561 (20)	total: 462ms	remaining: 8.34s
40:	learn: 0.0517932	test: 0.0525766	best: 0.0525764 (39)	total: 834ms	remaining: 7.3s
60:	learn: 0.0515910	test: 0.0525391	best: 0.0525367 (55)	total: 1.2s	remaining: 6.69s
80:	learn: 0.0514249	test: 0.0525154	best: 0.0525154 (80)	total: 1.58s	remaining: 6.22s
100:	learn: 0.0512982	test: 0.0525080	best: 0.0525080 (100)	total: 1.95s	remaining: 5.77s
120:	learn: 0.0511670	test: 0.0524994	best: 0.0524977 (108)	total: 2.33s	remaining: 5.38s
140:	learn: 0.0510901	test: 0.0525003	best: 0.0524977 (108)	total: 2.71s	remaining: 4.98s
160:	learn: 0.0509829	test: 0.0524972	best: 0.0524959 (159)	total: 3.08s	remaining: 4.57s
180:	learn: 0.0508716	test: 0.0524940	best: 0.0524925 (176)	total: 3.45s	remaining: 4.18s
200:	learn: 0.0507593	test: 0.0524785	best: 0.0524780 (196)	total: 3.82s	remaining: 3.79s
220:	learn: 0.0506629	t

<catboost.core.CatBoostRegressor at 0x7fbef5f1d7b8>

In [9]:
num_ensembles = 5

# Train `num_ensembles` regressors that share every setting except the random
# seed (0 .. num_ensembles - 1). The fitted members are collected in `models`,
# in seed order.
models = []
for seed in range(num_ensembles):
    print("\nTraining (ensemble): %d ..." % (seed))
    # Build a per-member copy of the settings rather than writing the seed
    # back into the shared `catboost_parameters` dict: mutating it would leave
    # the config at the last seed and make any cell that reuses it depend on
    # whether this cell has already been run.
    member_params = {**catboost_parameters, 'random_seed': seed}
    member = CatBoostRegressor(**member_params)
    member.fit(train_pool, eval_set=test_pool)
    models.append(member)
    print('-- Feature Importance --')
    print_feature_importance(member, train_pool, X_train)


Training (ensemble): 0 ...
0:	learn: 0.0527219	test: 0.0532467	best: 0.0532467 (0)	total: 20.9ms	remaining: 8.35s
20:	learn: 0.0520318	test: 0.0526561	best: 0.0526561 (20)	total: 433ms	remaining: 7.82s
40:	learn: 0.0517932	test: 0.0525766	best: 0.0525764 (39)	total: 802ms	remaining: 7.02s
60:	learn: 0.0515910	test: 0.0525391	best: 0.0525367 (55)	total: 1.17s	remaining: 6.49s
80:	learn: 0.0514249	test: 0.0525154	best: 0.0525154 (80)	total: 1.54s	remaining: 6.07s
100:	learn: 0.0512982	test: 0.0525080	best: 0.0525080 (100)	total: 1.9s	remaining: 5.64s
120:	learn: 0.0511670	test: 0.0524994	best: 0.0524977 (108)	total: 2.29s	remaining: 5.28s
140:	learn: 0.0510901	test: 0.0525003	best: 0.0524977 (108)	total: 2.66s	remaining: 4.89s
160:	learn: 0.0509829	test: 0.0524972	best: 0.0524959 (159)	total: 3.02s	remaining: 4.49s
180:	learn: 0.0508716	test: 0.0524940	best: 0.0524925 (176)	total: 3.39s	remaining: 4.1s
200:	learn: 0.0507593	test: 0.0524785	best: 0.0524780 (196)	total: 3.75s	remaining: 3

220:	learn: 0.0506265	test: 0.0524386	best: 0.0524386 (220)	total: 4.11s	remaining: 3.33s
240:	learn: 0.0505189	test: 0.0524430	best: 0.0524277 (230)	total: 4.47s	remaining: 2.95s
260:	learn: 0.0504337	test: 0.0524374	best: 0.0524277 (230)	total: 4.85s	remaining: 2.58s
280:	learn: 0.0503318	test: 0.0524435	best: 0.0524277 (230)	total: 5.2s	remaining: 2.2s
300:	learn: 0.0502552	test: 0.0524360	best: 0.0524277 (230)	total: 5.56s	remaining: 1.83s
320:	learn: 0.0501614	test: 0.0524489	best: 0.0524277 (230)	total: 5.93s	remaining: 1.46s
340:	learn: 0.0500727	test: 0.0524522	best: 0.0524277 (230)	total: 6.32s	remaining: 1.09s
360:	learn: 0.0499894	test: 0.0524484	best: 0.0524277 (230)	total: 6.68s	remaining: 722ms
380:	learn: 0.0498749	test: 0.0524309	best: 0.0524277 (230)	total: 7.04s	remaining: 351ms
399:	learn: 0.0497637	test: 0.0524281	best: 0.0524273 (390)	total: 7.41s	remaining: 0us
bestTest = 0.05242727511
bestIteration = 390
Shrink model to first 391 iterations.
-- Feature Importance

-- Feature Importance --
regionidneighborhood_fill	7.737225600075182
finishedsquarefeet12	7.201080958123898
regionidcity_fill	6.779871071660735
month	6.3775065311713535
propertylandusetypeid	5.718034882169506
N_life	4.2420285279274825
N_ValueRatio	4.202404642663443
taxamount	4.167505219348889
buildingqualitytypeid_fill	3.5142520194696947
structuretaxvaluedollarcnt	3.0773969211161765
calculatedfinishedsquarefeet	3.074481040801232
landtaxvaluedollarcnt	2.935761264863984
heatingorsystemtypeid	2.8366955160621594
lotsizesquarefeet	2.6556048249040702
N_zip_count	2.4603476336331203
airconditioningtypeid	2.4519835426301566
poolcnt	2.4197188456179672
taxvaluedollarcnt	2.258431563502428
lotsizesquarefeet_refill	2.0999335992800594
longitude	1.865809405394812
regionidzip_fill	1.6922815786101288
min_temp	1.6411607721924704
parcelid	1.555554180343447
rawcensustractandblock	1.5258560332884015
max_temp	1.4914597791245945
regionidcounty	1.352585098621538
latitude	1.2716368945882406
bedroomcnt	1.1223831

In [17]:
# FIXME(review): this cell cannot run as written.
#   * The assignment below has no right-hand side, which is a syntax error.
#   * `test_df` is not defined in any cell visible in this notebook.
#   * `cat_feature_inds` holds column-name strings, so `train_features[ind]`
#     would index a list with a string; presumably the loop was meant to
#     iterate over `cat_feature_inds` directly -- confirm before repairing.
test_2016 = 
# Apparent intent: cast float-typed categorical columns of the test frame to int.
for i in [train_features[ind] for ind in cat_feature_inds]:
    if test_df[i].dtype == 'float':
        test_df[i] = test_df[i].astype('int')

In [13]:
def _predict_months(features_csv, year_flag, models, feature_cols, months=(10, 11, 12)):
    """Return ensemble-averaged predictions per parcel for each month.

    Args:
        features_csv: path of the CSV holding one feature row per parcel.
        year_flag: value written to the `year` column (0 for the 2016 file,
            1 for the 2017 file).
        models: fitted regressors; their predictions are averaged.
        feature_cols: columns passed to each model's `predict`.
        months: month numbers to predict for; each becomes the `month` column
            value and the name (as a string) of one output column.

    Returns:
        DataFrame with `parcelid` plus one prediction column per month, one
        row per distinct `parcelid` (first occurrence kept).
    """
    frame = pd.read_csv(features_csv).drop_duplicates('parcelid')
    frame['year'] = year_flag
    predictions = frame[['parcelid']].copy()
    for month in months:
        frame['month'] = month
        # Slice the model inputs once per month instead of once per model.
        month_inputs = frame[feature_cols]
        # Each member contributes predict / n, accumulated in model order.
        predictions[str(month)] = sum(
            member.predict(month_inputs) / len(models) for member in models
        )
    return predictions


# Parcel ids to score, taken from the sample submission (one row per parcel).
df_sub = (
    pd.read_csv('../Resources/sample_submission.csv')
    .rename(columns={'ParcelId': 'parcelid'})[['parcelid']]
    .drop_duplicates('parcelid')
)

# The same pipeline runs for both years; only the input file, the `year`
# flag and the output column prefix differ.
for features_csv, year_flag, year_label in (
    ('../output/final_sub_2016.csv', 0, '2016'),
    ('../output/final_sub_2017.csv', 1, '2017'),
):
    year_predictions = _predict_months(features_csv, year_flag, models, train_features)
    year_predictions = year_predictions.rename(
        columns={str(month): year_label + str(month) for month in (10, 11, 12)}
    )
    # Left merge keeps every requested parcel, even if it has no feature row.
    df_sub = pd.merge(df_sub, year_predictions, how='left', on='parcelid')
    # Release the per-year frame before loading the next one.
    del year_predictions
    gc.collect()

df_sub.to_csv('../output/submission/cat_en5_opt.csv', index = False)

In [18]:
# Start the submission frame from the parcel ids of the test set.
submission = pd.DataFrame({'ParcelId': test_df['parcelid']})

# Submission column label -> transaction date stamped onto the test rows
# before predicting for that label.
test_dates = {
    label: pd.Timestamp(date_text)
    for label, date_text in [
        ('201610', '2016-09-30'),
        ('201611', '2016-10-31'),
        ('201612', '2016-11-30'),
        ('201710', '2017-09-30'),
        ('201711', '2017-10-31'),
        ('201712', '2017-11-30'),
    ]
}

for label in test_dates:
    print("Predicting for: %s ... " % (label))
    # Stamp the date on every row, then rebuild the date-derived features.
    test_df['transactiondate'] = test_dates[label]
    test_df = add_date_features(test_df)
    # Sum the members' predictions, then divide by the ensemble size.
    ensemble_total = 0.0
    for member_index in range(num_ensembles):
        print("Ensemble:", member_index)
        ensemble_total += models[member_index].predict(test_df[train_features])
    submission[label] = ensemble_total / num_ensembles

submission_major = 2
submission_path = 'submission_%03d.csv' % (submission_major)
print("Creating submission: submission_%03d.csv ..." % (submission_major))
# Write predictions rounded to four decimals, without the index column.
submission.to_csv(submission_path, float_format='%.4f', index=False)
print("Finished.")

Predicting for: 201610 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201611 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201612 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201710 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201711 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201712 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Creating submission: submission_002.csv ...
Finished.
