In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe, in place, to reduce its memory usage.

    * Signed-integer columns are narrowed to the smallest of int8 / int16 /
      int32 whose range contains the column's observed min and max (bounds
      are inclusive, so a column spanning exactly -128..127 becomes int8).
    * Float columns are narrowed to float32, or to float16 when
      ``use_float16`` is true, if the observed range fits.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, timedelta,
      categorical, pandas extension dtypes, ...) is left untouched.

    A column is never widened: a candidate type is only considered when it
    is strictly smaller than the column's current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    use_float16 : bool, default False
        Also allow float16 as a target for float columns. float16 keeps far
        fewer significant digits, so it is opt-in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy dtypes are handled; extension dtypes have no
        # numpy "kind" we can rely on and are skipped below.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == "O":
            df[col] = df[col].astype("category")
            continue
        if kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if kind == "i":
            candidates = (np.int8, np.int16, np.int32)
            limits = np.iinfo
        else:
            candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)
            limits = np.finfo

        # Candidates are ordered narrowest first: take the first one that fits.
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # remaining candidates would not save any memory
            bounds = limits(candidate)
            # An all-NaN (or empty) column yields NaN here, every comparison
            # is False and the column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [7]:
# Load the merged training table, shrinking column dtypes on the way in.
df_merge = reduce_mem_usage(pd.read_csv('./output/final_merge.csv'))

# Column names passed to LightGBM as categorical features.
categorical_features = [
    'airconditioningtypeid',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'fips',
    'regionidcounty',
    'buildingqualitytypeid_fill',
    'regionidcity_fill',
    'year',
    'regionidneighborhood_fill',
    'taxdelinquencyflag',
]

# Keep one row per (parcelid, logerror) pair, then keep only rows whose
# logerror lies strictly inside (-0.4, 0.419), and renumber the rows from 0.
df_drop = (
    df_merge
    .drop_duplicates(subset=['parcelid', 'logerror'])
    .loc[lambda frame: (frame['logerror'] > -0.4) & (frame['logerror'] < 0.419)]
    .reset_index(drop=True)
)

# Split into the regression target and the feature matrix.
target = df_drop['logerror']
features = df_drop.drop(columns=['logerror'])

Memory usage of dataframe is 65.85 MB
Memory usage after optimization is: 26.09 MB
Decreased by 60.4%


In [3]:
# Read the training table from disk. This cell (re)binds df_drop,
# categorical_features, target and features.
df_drop = pd.read_csv('./output/outlier_remove.csv')

# Column names passed to LightGBM as categorical features.
categorical_features = [
    'airconditioningtypeid',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'fips',
    'regionidcounty',
    'buildingqualitytypeid_fill',
    'regionidcity_fill',
    'year',
    'regionidneighborhood_fill',
    'taxdelinquencyflag',
]

# Split into the regression target and the feature matrix.
target = df_drop['logerror']
features = df_drop.drop(columns=['logerror'])

In [5]:
# K-fold cross-validation: train one LightGBM booster per fold, keep every
# booster in `models`, and accumulate the mean validation RMSE in `score`.
score = 0
kf = KFold(n_splits=5)
models = []

params = {"objective": "regression", "boosting": "gbdt", "num_leaves": 500, 
              "learning_rate": 0.002515, 'bagging_fraction': 0.8297, "reg_lambda": 0.1052, 
              'reg_alpha':0.1046, "metric": "rmse", 'max_depth': 10, 'min_child_weight': 27,
              'verbose': -1, 'min_split_gain':0.08138 , 'subsample_freq':1, 'sub_feature':  0.4492}
for train_index, test_index in kf.split(features):
    # KFold.split yields positional row numbers, so select by position
    # (.iloc) rather than by index label (.loc).
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]
    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]
    d_training = lgb.Dataset(train_features, label=train_target, 
                             categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target, 
                         categorical_feature=categorical_features, free_raw_data=False)
    # NOTE(review): verbose_eval / early_stopping_rounds are the keyword form
    # of logging and early stopping; newer LightGBM releases expect callbacks
    # instead -- confirm against the installed version before upgrading.
    model = lgb.train(params, train_set=d_training, num_boost_round=3000, 
                      valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    y_pred_valid = model.predict(test_features)
    # Divide by the actual number of folds so `score` ends up as the mean
    # per-fold RMSE.
    score += np.sqrt(mean_squared_error(test_target, y_pred_valid)) / kf.get_n_splits()
    models.append(model)

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0822919	valid_1's rmse: 0.0858488
[50]	training's rmse: 0.082208	valid_1's rmse: 0.0857774
[75]	training's rmse: 0.082126	valid_1's rmse: 0.0857063
[100]	training's rmse: 0.0820519	valid_1's rmse: 0.0856467
[125]	training's rmse: 0.0819758	valid_1's rmse: 0.0855874
[150]	training's rmse: 0.0819027	valid_1's rmse: 0.0855269
[175]	training's rmse: 0.081843	valid_1's rmse: 0.085481
[200]	training's rmse: 0.0817786	valid_1's rmse: 0.0854322
[225]	training's rmse: 0.0817165	valid_1's rmse: 0.0853869
[250]	training's rmse: 0.0816632	valid_1's rmse: 0.0853469
[275]	training's rmse: 0.0816131	valid_1's rmse: 0.0853098
[300]	training's rmse: 0.0815619	valid_1's rmse: 0.0852739
[325]	training's rmse: 0.0815119	valid_1's rmse: 0.0852399
[350]	training's rmse: 0.0814649	valid_1's rmse: 0.0852101
[375]	training's rmse: 0.0814274	valid_1's rmse: 0.085183
[400]	training's rmse: 0.081385	valid_1's rmse: 0.0851572
[425



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0833887	valid_1's rmse: 0.0814841
[50]	training's rmse: 0.0832997	valid_1's rmse: 0.0814188
[75]	training's rmse: 0.0832124	valid_1's rmse: 0.0813543
[100]	training's rmse: 0.0831332	valid_1's rmse: 0.0812984
[125]	training's rmse: 0.0830533	valid_1's rmse: 0.0812436
[150]	training's rmse: 0.0829747	valid_1's rmse: 0.0811869
[175]	training's rmse: 0.0829106	valid_1's rmse: 0.0811416
[200]	training's rmse: 0.0828424	valid_1's rmse: 0.0810948
[225]	training's rmse: 0.0827745	valid_1's rmse: 0.0810511
[250]	training's rmse: 0.0827182	valid_1's rmse: 0.0810134
[275]	training's rmse: 0.0826644	valid_1's rmse: 0.0809791
[300]	training's rmse: 0.0826118	valid_1's rmse: 0.0809443
[325]	training's rmse: 0.0825618	valid_1's rmse: 0.0809132
[350]	training's rmse: 0.0825104	valid_1's rmse: 0.0808838
[375]	training's rmse: 0.082468	valid_1's rmse: 0.0808573
[400]	training's rmse: 0.0824209	valid_1's rmse: 0.0808309



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0826851	valid_1's rmse: 0.0843388
[50]	training's rmse: 0.0825915	valid_1's rmse: 0.0842823
[75]	training's rmse: 0.0825016	valid_1's rmse: 0.0842285
[100]	training's rmse: 0.08242	valid_1's rmse: 0.0841807
[125]	training's rmse: 0.0823358	valid_1's rmse: 0.0841343
[150]	training's rmse: 0.0822545	valid_1's rmse: 0.0840882
[175]	training's rmse: 0.0821889	valid_1's rmse: 0.0840517
[200]	training's rmse: 0.0821167	valid_1's rmse: 0.0840157
[225]	training's rmse: 0.0820487	valid_1's rmse: 0.083982
[250]	training's rmse: 0.0819914	valid_1's rmse: 0.0839502
[275]	training's rmse: 0.0819362	valid_1's rmse: 0.0839206
[300]	training's rmse: 0.0818799	valid_1's rmse: 0.0838932
[325]	training's rmse: 0.0818264	valid_1's rmse: 0.083867
[350]	training's rmse: 0.0817771	valid_1's rmse: 0.0838447
[375]	training's rmse: 0.0817356	valid_1's rmse: 0.0838241
[400]	training's rmse: 0.0816907	valid_1's rmse: 0.083805
[42



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0827814	valid_1's rmse: 0.0839151
[50]	training's rmse: 0.0826837	valid_1's rmse: 0.0838694
[75]	training's rmse: 0.0825849	valid_1's rmse: 0.083828
[100]	training's rmse: 0.0824969	valid_1's rmse: 0.08379
[125]	training's rmse: 0.0824069	valid_1's rmse: 0.083753
[150]	training's rmse: 0.0823192	valid_1's rmse: 0.0837186
[175]	training's rmse: 0.0822483	valid_1's rmse: 0.0836896
[200]	training's rmse: 0.0821731	valid_1's rmse: 0.0836606
[225]	training's rmse: 0.0821006	valid_1's rmse: 0.0836349
[250]	training's rmse: 0.0820363	valid_1's rmse: 0.083611
[275]	training's rmse: 0.0819767	valid_1's rmse: 0.0835884
[300]	training's rmse: 0.081916	valid_1's rmse: 0.0835665
[325]	training's rmse: 0.0818588	valid_1's rmse: 0.0835464
[350]	training's rmse: 0.081804	valid_1's rmse: 0.0835281
[375]	training's rmse: 0.0817578	valid_1's rmse: 0.0835125
[400]	training's rmse: 0.0817101	valid_1's rmse: 0.0834976
[425]



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0838302	valid_1's rmse: 0.0796423
[50]	training's rmse: 0.0837351	valid_1's rmse: 0.0795881
[75]	training's rmse: 0.0836408	valid_1's rmse: 0.0795417
[100]	training's rmse: 0.0835574	valid_1's rmse: 0.0795029
[125]	training's rmse: 0.083472	valid_1's rmse: 0.0794645
[150]	training's rmse: 0.0833883	valid_1's rmse: 0.0794247
[175]	training's rmse: 0.0833191	valid_1's rmse: 0.0794004
[200]	training's rmse: 0.0832459	valid_1's rmse: 0.0793669
[225]	training's rmse: 0.0831763	valid_1's rmse: 0.0793344
[250]	training's rmse: 0.0831145	valid_1's rmse: 0.0793108
[275]	training's rmse: 0.0830588	valid_1's rmse: 0.079292
[300]	training's rmse: 0.083001	valid_1's rmse: 0.079267
[325]	training's rmse: 0.0829458	valid_1's rmse: 0.0792734
[350]	training's rmse: 0.0828927	valid_1's rmse: 0.0792494
[375]	training's rmse: 0.0828479	valid_1's rmse: 0.0792304
[400]	training's rmse: 0.0828005	valid_1's rmse: 0.0792117
[4

In [7]:
def _predict_submission_year(csv_path, year_flag, year_label):
    """
    Predict months 10, 11 and 12 for one submission file.

    Reads ``csv_path``, downcasts it with ``reduce_mem_usage``, keeps the
    first row per parcelid, sets the ``year`` feature to ``year_flag`` and,
    for each month, sets the ``month`` feature and averages the predictions
    of every booster in the notebook-level ``models`` list (each at its own
    ``best_iteration``). Model inputs are the columns of the notebook-level
    ``features`` frame.

    Parameters
    ----------
    csv_path : str
        Path of the per-year feature table to score.
    year_flag : int
        Value written to the ``year`` column before predicting.
    year_label : str
        Prefix for the output columns, e.g. ``'2016'`` gives ``'201610'``.

    Returns
    -------
    pandas.DataFrame
        ``parcelid`` plus one column per month named ``year_label + month``.
    """
    frame = reduce_mem_usage(pd.read_csv(csv_path)).drop_duplicates('parcelid')
    frame['year'] = year_flag
    feature_cols = list(features.columns)

    averaged = {}
    for month in (10, 11, 12):
        frame['month'] = month
        # Build the model input once per month instead of once per model call.
        model_inputs = frame[feature_cols]
        for booster in models:
            share = booster.predict(model_inputs,
                                    num_iteration=booster.best_iteration) / len(models)
            averaged[month] = share if month not in averaged else averaged[month] + share

    predictions = frame[['parcelid']].copy()
    for month, values in averaged.items():
        predictions['{}{}'.format(year_label, month)] = values
    return predictions


# The raw training frame is no longer needed; tolerate it already being gone
# so this cell can be re-run without a NameError.
try:
    del df_drop
except NameError:
    pass
gc.collect()

# Start from the parcel ids of the sample submission, then left-join the
# averaged predictions of each year onto them.
df_sub = pd.read_csv('./Resources/sample_submission.csv')
df_sub = df_sub.rename(columns = {'ParcelId': 'parcelid'})[['parcelid']]
for csv_path, year_flag, year_label in (('./output/final_sub_2016.csv', 0, '2016'),
                                        ('./output/final_sub_2017.csv', 1, '2017')):
    df_sub = pd.merge(df_sub, _predict_submission_year(csv_path, year_flag, year_label),
                      how = 'left', on = 'parcelid').drop_duplicates('parcelid')
    # The per-year frame went out of scope when the helper returned.
    gc.collect()
df_sub.to_csv('./output/sample_submission.csv', index = False)

Memory usage of dataframe is 1087.64 MB
Memory usage after optimization is: 428.55 MB
Decreased by 60.6%
Memory usage of dataframe is 1227.28 MB
Memory usage after optimization is: 483.58 MB
Decreased by 60.6%


In [8]:
# Display the submission frame that the previous cell wrote to
# ./output/sample_submission.csv.
df_sub

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,-0.015853,-0.015826,-0.015813,-0.011554,-0.011527,-0.011514
1,10759547,-0.011209,-0.011178,-0.011166,-0.006334,-0.006303,-0.006291
2,10843547,0.003648,0.003662,0.003662,0.005417,0.005451,0.005451
3,10859147,0.014715,0.014736,0.014749,0.014211,0.014232,0.014245
4,10879947,0.010276,0.010297,0.010310,0.010075,0.010096,0.010109
...,...,...,...,...,...,...,...
2985212,168176230,0.014003,0.014110,0.014110,0.017458,0.017477,0.017477
2985213,14273630,0.013737,0.013844,0.013844,0.018469,0.018473,0.018473
2985214,168040630,0.014003,0.014110,0.014110,0.010036,0.010045,0.010056
2985215,168040830,0.013829,0.013936,0.013936,0.054464,0.054530,0.054557
