In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe in place to reduce its memory usage.

    Handling per column:
      * datetime and categorical columns are left untouched;
      * object / pandas string columns are converted to ``category``;
      * signed and unsigned integer columns are cast to the narrowest integer
        type of the same signedness whose range contains the column's min and
        max (bounds are inclusive, so e.g. 127 still fits ``int8``);
      * float columns are cast to ``float32`` (or ``float16`` when
        ``use_float16`` is true) if their values lie inside that type's range;
      * every other dtype (bool, timedelta, nullable extension types, ...) is
        left as is.

    A column is never widened: a cast only happens when the target type is
    strictly smaller than the current one.

    Note: ``float32`` keeps only about 7 significant decimal digits, so large
    identifier-like values stored as floats lose precision when downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Also consider ``float16`` as a target type for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate target types, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        "i": (np.int8, np.int16, np.int32, np.int64),
        "u": (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype("category")
            continue

        # Only plain numpy integer / float columns are downcast. Bools are
        # already one byte wide and extension dtypes may hold missing values
        # that a numpy cast cannot represent.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no value range to inspect.
            continue

        if col_type.kind == "f":
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                fits = limits.min < c_min and c_max < limits.max
                if fits and np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            lo, hi = int(c_min), int(c_max)
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= lo and hi <= limits.max:
                    # First (narrowest) type that fits; skip a no-op cast.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df

In [7]:
# Load the merged training table and shrink its dtypes.
df_merge = reduce_mem_usage(pd.read_csv('./output/final_merge.csv'))

# Columns handed to LightGBM as categorical features.
categorical_features = [
    'airconditioningtypeid', 'hashottuborspa', 'heatingorsystemtypeid',
    'pooltypeid2', 'propertylandusetypeid', 'fips', 'regionidcounty',
    'buildingqualitytypeid_fill', 'regionidcity_fill', 'year',
    'regionidneighborhood_fill', 'taxdelinquencyflag',
]

# Drop repeated (parcelid, logerror) rows, keep only rows whose logerror lies
# strictly between -0.4 and 0.419, and renumber the index from zero.
df_drop = (
    df_merge
    .drop_duplicates(subset=['parcelid', 'logerror'])
    .loc[lambda frame: (frame['logerror'] > -0.4) & (frame['logerror'] < 0.419)]
    .reset_index(drop=True)
)

# Split into the regression target and the feature matrix.
target = df_drop['logerror']
features = df_drop.drop(columns=['logerror'])

Memory usage of dataframe is 65.85 MB
Memory usage after optimization is: 26.09 MB
Decreased by 60.4%


In [13]:
# Persist the de-duplicated, outlier-filtered training frame without the index column.
df_drop.to_csv(path_or_buf='./output/outlier_remove.csv', index=False)

In [8]:
# K-fold cross-validated LightGBM training.
# `score` accumulates the mean validation RMSE over the folds and `models`
# collects one fitted booster per fold for later ensembling.
score = 0
kf = KFold(n_splits=3)
n_folds = kf.get_n_splits()  # single source of truth for the fold count
models = []

params = {"objective": "regression", "boosting": "gbdt", "num_leaves": 500,
          "learning_rate": 0.002515, 'bagging_fraction': 0.8297, "reg_lambda": 0.1052,
          'reg_alpha': 0.1046, "metric": "rmse", 'max_depth': 10, 'min_child_weight': 27,
          'verbose': -1, 'min_split_gain': 0.08138, 'subsample_freq': 1, 'sub_feature': 0.4492}

for train_index, test_index in kf.split(features):
    # KFold yields positional indices, so select rows by position rather than
    # by label; this does not depend on the frame having a 0..n-1 index.
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]
    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]

    d_training = lgb.Dataset(train_features, label=train_target,
                             categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,
                         categorical_feature=categorical_features, free_raw_data=False)

    # Newer LightGBM releases configure logging / early stopping through
    # callbacks and no longer accept the keyword arguments; older releases
    # without `log_evaluation` only understand the keyword form.
    if hasattr(lgb, "log_evaluation"):
        train_kwargs = {"callbacks": [lgb.early_stopping(stopping_rounds=50),
                                      lgb.log_evaluation(period=25)]}
    else:
        train_kwargs = {"verbose_eval": 25, "early_stopping_rounds": 50}

    model = lgb.train(params, train_set=d_training, num_boost_round=3000,
                      valid_sets=[d_training, d_test], **train_kwargs)

    y_pred_valid = model.predict(test_features)
    score += np.sqrt(mean_squared_error(test_target, y_pred_valid)) / n_folds
    models.append(model)



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0824427	valid_1's rmse: 0.0841988
[50]	training's rmse: 0.0823596	valid_1's rmse: 0.0841425
[75]	training's rmse: 0.082281	valid_1's rmse: 0.0840867
[100]	training's rmse: 0.0822077	valid_1's rmse: 0.0840381
[125]	training's rmse: 0.0821311	valid_1's rmse: 0.0839897
[150]	training's rmse: 0.082059	valid_1's rmse: 0.083944
[175]	training's rmse: 0.0820008	valid_1's rmse: 0.0839048
[200]	training's rmse: 0.0819378	valid_1's rmse: 0.0838658
[225]	training's rmse: 0.0818748	valid_1's rmse: 0.08383
[250]	training's rmse: 0.0818242	valid_1's rmse: 0.0837971
[275]	training's rmse: 0.0817757	valid_1's rmse: 0.0837682
[300]	training's rmse: 0.0817254	valid_1's rmse: 0.0837367
[325]	training's rmse: 0.0816771	valid_1's rmse: 0.0837092
[350]	training's rmse: 0.0816309	valid_1's rmse: 0.0836837
[375]	training's rmse: 0.0815944	valid_1's rmse: 0.0836615
[400]	training's rmse: 0.0815523	valid_1's rmse: 0.0836401
[42



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0826771	valid_1's rmse: 0.0837032
[50]	training's rmse: 0.0825798	valid_1's rmse: 0.0836486
[75]	training's rmse: 0.0824844	valid_1's rmse: 0.083597
[100]	training's rmse: 0.0824013	valid_1's rmse: 0.0835519
[125]	training's rmse: 0.082312	valid_1's rmse: 0.0835045
[150]	training's rmse: 0.0822268	valid_1's rmse: 0.0834592
[175]	training's rmse: 0.0821574	valid_1's rmse: 0.0834229
[200]	training's rmse: 0.0820828	valid_1's rmse: 0.083387
[225]	training's rmse: 0.082013	valid_1's rmse: 0.0833531
[250]	training's rmse: 0.0819541	valid_1's rmse: 0.0833217
[275]	training's rmse: 0.0818989	valid_1's rmse: 0.0832946
[300]	training's rmse: 0.081842	valid_1's rmse: 0.0832672
[325]	training's rmse: 0.0817846	valid_1's rmse: 0.0832405
[350]	training's rmse: 0.0817319	valid_1's rmse: 0.0832166
[375]	training's rmse: 0.0816883	valid_1's rmse: 0.083196
[400]	training's rmse: 0.0816422	valid_1's rmse: 0.0831774
[425



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0838518	valid_1's rmse: 0.0813017
[50]	training's rmse: 0.0837527	valid_1's rmse: 0.0812532
[75]	training's rmse: 0.0836533	valid_1's rmse: 0.0812083
[100]	training's rmse: 0.083566	valid_1's rmse: 0.0811682
[125]	training's rmse: 0.0834712	valid_1's rmse: 0.0811284
[150]	training's rmse: 0.0833804	valid_1's rmse: 0.0810926
[175]	training's rmse: 0.0833072	valid_1's rmse: 0.0810605
[200]	training's rmse: 0.0832299	valid_1's rmse: 0.0810308
[225]	training's rmse: 0.0831549	valid_1's rmse: 0.0810009
[250]	training's rmse: 0.0830905	valid_1's rmse: 0.0809744
[275]	training's rmse: 0.0830321	valid_1's rmse: 0.0809501
[300]	training's rmse: 0.0829726	valid_1's rmse: 0.0809275
[325]	training's rmse: 0.0829127	valid_1's rmse: 0.0809107
[350]	training's rmse: 0.0828561	valid_1's rmse: 0.0808899
[375]	training's rmse: 0.0828113	valid_1's rmse: 0.0808777
[400]	training's rmse: 0.0827605	valid_1's rmse: 0.0808655

In [9]:
# Free the raw merged training frame; tolerate re-running this cell after the
# name has already been deleted.
try:
    del df_merge
except NameError:
    pass
gc.collect()


def _predict_months(csv_path, year_flag, year_label, months=(10, 11, 12)):
    """
    Predict with the fold-model ensemble for each month of one submission year.

    Parameters
    ----------
    csv_path : str
        CSV file holding the rows to score for that year.
    year_flag : int
        Value written to the ``year`` column before predicting.
    year_label : int or str
        Prefix of the produced column names (e.g. 2016 -> '201610').
    months : iterable of int
        Values written to the ``month`` column, one prediction column each.

    Returns
    -------
    pandas.DataFrame
        ``parcelid`` plus one ``<year_label><month>`` column per month, each the
        average of the predictions of every booster in ``models``.
    """
    # Predictions are computed row by row, so dropping duplicate parcels before
    # predicting yields the same values as dropping them afterwards.
    frame = reduce_mem_usage(pd.read_csv(csv_path)).drop_duplicates('parcelid')
    frame['year'] = year_flag
    feature_cols = list(features.columns)

    predictions = frame[['parcelid']].copy()
    for month in months:
        frame['month'] = month
        model_input = frame[feature_cols]  # built once per month, shared by all models
        predictions['{}{}'.format(year_label, month)] = sum(
            booster.predict(model_input, num_iteration=booster.best_iteration) / len(models)
            for booster in models
        )
    return predictions


# Start from the parcel list of the sample submission, one row per parcel.
# NOTE(review): the written file keeps the lowercase 'parcelid' header while the
# sample file uses 'ParcelId' -- confirm the consumer accepts this.
df_sub = (
    pd.read_csv('./Resources/sample_submission.csv')
    .rename(columns={'ParcelId': 'parcelid'})[['parcelid']]
    .drop_duplicates('parcelid')
)

for csv_path, year_flag, year_label in (
    ('./output/final_sub_2016.csv', 0, 2016),
    ('./output/final_sub_2017.csv', 1, 2017),
):
    year_predictions = _predict_months(csv_path, year_flag, year_label)
    df_sub = pd.merge(df_sub, year_predictions, how='left', on='parcelid')
    # Release the per-year frame before loading the next one.
    del year_predictions
    gc.collect()

df_sub.to_csv('./output/sample_submission.csv', index=False)

Memory usage of dataframe is 1087.64 MB
Memory usage after optimization is: 428.55 MB
Decreased by 60.6%
Memory usage of dataframe is 1227.28 MB
Memory usage after optimization is: 483.58 MB
Decreased by 60.6%


In [10]:
# Display the submission frame (parcelid plus the six monthly prediction columns).
df_sub

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,-0.008423,-0.008376,-0.008376,-0.006210,-0.006163,-0.006163
1,10759547,-0.002626,-0.002611,-0.002611,-0.000116,-0.000100,-0.000100
2,10843547,0.004614,0.004620,0.004620,0.002415,0.002457,0.002457
3,10859147,0.013256,0.013272,0.013272,0.012729,0.012744,0.012744
4,10879947,0.009906,0.009922,0.009922,0.009583,0.009599,0.009599
...,...,...,...,...,...,...,...
2985212,168176230,0.010511,0.010558,0.010558,0.016145,0.016151,0.016151
2985213,14273630,0.010378,0.010425,0.010425,0.017108,0.017108,0.017108
2985214,168040630,0.010511,0.010558,0.010558,0.009141,0.009141,0.009141
2985215,168040830,0.010374,0.010421,0.010421,0.045609,0.045645,0.045645
