In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe, in place, to reduce its memory usage.

    Columns are handled according to their NumPy dtype kind:

    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose (inclusive) range contains the column's
      minimum and maximum;
    * float columns are cast to float32, or to float16 when ``use_float16`` is
      true and the values fit; columns whose range fits none of the allowed
      targets (e.g. they contain ``inf``) are left as they are;
    * object columns are converted to ``category``;
    * every other column (bool, datetime, timedelta, categorical and other
      extension dtypes) is left untouched, as are empty / all-NaN columns.

    The memory footprint before and after the conversion is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target type for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float narrowing only checks the value *range*, not the precision: float32
    keeps roughly 7 significant decimal digits, so large integer-valued floats
    can be rounded by the conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate target types per NumPy dtype kind, narrowest first, together
    # with the function that reports their representable range.
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)
    targets_by_kind = {
        "i": ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        "u": ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        "f": (float_targets, np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (categorical, tz-aware datetimes, nullable types...)
        # are not plain NumPy dtypes and are deliberately left alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == "O":
            df[col] = df[col].astype("category")
            continue

        # bool ("b"), datetime ("M"), timedelta ("m"), ... have nothing to narrow.
        if col_type.kind not in targets_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN column: there is no value range to base a choice on.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        candidates, limits_of = targets_by_kind[col_type.kind]
        for candidate in candidates:
            limits = limits_of(candidate)
            # iinfo/finfo limits are themselves representable, hence inclusive.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate) != col_type:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return df

In [9]:
# Load the merged training table and shrink its dtypes.
df_merge = pd.read_csv('./output/final_merge.csv')
df_merge = reduce_mem_usage(df_merge)

# Columns LightGBM is told to treat as categorical.
categorical_features = [
    'airconditioningtypeid',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'fips',
    'regionidcounty',
    'buildingqualitytypeid_fill',
    'regionidcity_fill',
    'year',
    'regionidneighborhood_fill',
    'taxdelinquencyflag',
]

# Drop repeated (parcelid, logerror) pairs, keep only rows whose logerror lies
# strictly between -0.4 and 0.419, and renumber the rows from 0.
df_drop = (
    df_merge
    .drop_duplicates(subset=['parcelid', 'logerror'])
    .loc[lambda frame: (frame.logerror > -0.4) & (frame.logerror < 0.419)]
    .reset_index(drop=True)
)

# Split into the regression target and the remaining feature columns.
target = df_drop['logerror']
features = df_drop.drop(columns=['logerror'])

Memory usage of dataframe is 65.85 MB
Memory usage after optimization is: 26.09 MB
Decreased by 60.4%


In [13]:
# Persist the de-duplicated, outlier-filtered training rows (row index not written).
df_drop.to_csv(path_or_buf='./output/outlier_remove.csv', index=False)

In [10]:
# K-fold cross-validated LightGBM training.
# One booster is trained per fold and kept in `models` so later cells can
# average their predictions; `score` ends up as the mean validation RMSE.
n_splits = 3  # single source of truth for the fold count (used for KFold and the average)
score = 0
kf = KFold(n_splits=n_splits)  # shuffle is left at its default, so folds are contiguous row ranges
models = []

# Several keys below are LightGBM parameter aliases: "boosting" -> boosting_type,
# "subsample_freq" -> bagging_freq, "sub_feature" -> feature_fraction.
params = {"objective": "regression", "boosting": "gbdt", "num_leaves": 700, 
              "learning_rate": 0.002646, 'bagging_fraction': 0.8296, "reg_lambda": 0.0963, 
              'reg_alpha':0.1066, "metric": "rmse", 'max_depth': -1, 'min_child_weight': 27,
              'verbose': -1, 'min_split_gain':0.08097 , 'subsample_freq':1, 'sub_feature':  0.5809}
for train_index, test_index in kf.split(features):
    # KFold.split yields *positional* indices, so select with .iloc; this stays
    # correct even if `features` does not carry a fresh 0..n-1 index.
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]
    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]
    d_training = lgb.Dataset(train_features, label=train_target, 
                             categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target, 
                         categorical_feature=categorical_features, free_raw_data=False)
    # NOTE(review): the held-out fold drives early stopping and is also the fold
    # scored below, so `score` is an optimistic estimate.
    model = lgb.train(params, train_set=d_training, num_boost_round=3000, 
                      valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    # Predict at the early-stopped iteration, matching how the models are used
    # when building the submission.
    y_pred_valid = model.predict(test_features, num_iteration=model.best_iteration)
    score += np.sqrt(mean_squared_error(test_target, y_pred_valid)) / n_splits
    models.append(model)



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0945452	valid_1's rmse: 0.0966607
[50]	training's rmse: 0.0944052	valid_1's rmse: 0.0965747
[75]	training's rmse: 0.0942687	valid_1's rmse: 0.0964955
[100]	training's rmse: 0.0941411	valid_1's rmse: 0.0964239
[125]	training's rmse: 0.094022	valid_1's rmse: 0.096354
[150]	training's rmse: 0.0939129	valid_1's rmse: 0.0962911
[175]	training's rmse: 0.0938204	valid_1's rmse: 0.0962349
[200]	training's rmse: 0.0937221	valid_1's rmse: 0.096183
[225]	training's rmse: 0.0936282	valid_1's rmse: 0.0961359
[250]	training's rmse: 0.0935509	valid_1's rmse: 0.0960918
[275]	training's rmse: 0.0934673	valid_1's rmse: 0.0960499
[300]	training's rmse: 0.0933895	valid_1's rmse: 0.0960122
[325]	training's rmse: 0.0933174	valid_1's rmse: 0.0959781
[350]	training's rmse: 0.0932435	valid_1's rmse: 0.0959464
[375]	training's rmse: 0.0931813	valid_1's rmse: 0.09592
[400]	training's rmse: 0.0931184	valid_1's rmse: 0.095895
[425



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0949834	valid_1's rmse: 0.0957003
[50]	training's rmse: 0.0948029	valid_1's rmse: 0.0956181
[75]	training's rmse: 0.0946305	valid_1's rmse: 0.0955455
[100]	training's rmse: 0.0944698	valid_1's rmse: 0.0954783
[125]	training's rmse: 0.0943202	valid_1's rmse: 0.0954178
[150]	training's rmse: 0.0941774	valid_1's rmse: 0.0953629
[175]	training's rmse: 0.0940545	valid_1's rmse: 0.0953155
[200]	training's rmse: 0.0939306	valid_1's rmse: 0.0952712
[225]	training's rmse: 0.0938143	valid_1's rmse: 0.0952331
[250]	training's rmse: 0.0937118	valid_1's rmse: 0.0951968
[275]	training's rmse: 0.0936126	valid_1's rmse: 0.0951632
[300]	training's rmse: 0.093522	valid_1's rmse: 0.0951326
[325]	training's rmse: 0.0934351	valid_1's rmse: 0.0951053
[350]	training's rmse: 0.0933423	valid_1's rmse: 0.09508
[375]	training's rmse: 0.0932652	valid_1's rmse: 0.0950588
[400]	training's rmse: 0.093194	valid_1's rmse: 0.0950399
[4



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0960251	valid_1's rmse: 0.0935749
[50]	training's rmse: 0.0958409	valid_1's rmse: 0.0935081
[75]	training's rmse: 0.0956641	valid_1's rmse: 0.0934471
[100]	training's rmse: 0.0955022	valid_1's rmse: 0.0933901
[125]	training's rmse: 0.0953478	valid_1's rmse: 0.0933387
[150]	training's rmse: 0.0952022	valid_1's rmse: 0.0932924
[175]	training's rmse: 0.0950772	valid_1's rmse: 0.0932545
[200]	training's rmse: 0.0949471	valid_1's rmse: 0.0932178
[225]	training's rmse: 0.0948271	valid_1's rmse: 0.0931842
[250]	training's rmse: 0.0947226	valid_1's rmse: 0.0931577
[275]	training's rmse: 0.09462	valid_1's rmse: 0.0931292
[300]	training's rmse: 0.0945209	valid_1's rmse: 0.0931058
[325]	training's rmse: 0.0944296	valid_1's rmse: 0.0930843
[350]	training's rmse: 0.0943317	valid_1's rmse: 0.0930634
[375]	training's rmse: 0.0942483	valid_1's rmse: 0.093047
[400]	training's rmse: 0.094171	valid_1's rmse: 0.0930303
[4

In [11]:
def _predict_fall_months(path, year_flag, year_label, fitted_models, feature_cols,
                         months=(10, 11, 12)):
    """Score one properties file for the given months with the fold ensemble.

    The file at ``path`` is loaded, shrunk with ``reduce_mem_usage`` and
    de-duplicated on ``parcelid`` (first occurrence kept). Its ``year`` column
    is set to ``year_flag``; for each month its ``month`` column is set and the
    predictions of all ``fitted_models`` (each at its ``best_iteration``) are
    averaged.

    Parameters
    ----------
    path : str
        CSV file holding the rows to score.
    year_flag : int
        Value written to the ``year`` feature column before predicting.
    year_label : int
        Calendar year used to name the output columns, e.g. ``2016`` ->
        ``'201610'``.
    fitted_models : sequence
        Trained boosters exposing ``predict`` and ``best_iteration``.
    feature_cols : list of str
        Column names, in training order, passed to ``predict``.
    months : iterable of int
        Months to score.

    Returns
    -------
    pandas.DataFrame
        ``parcelid`` plus one ``'<year_label><month>'`` column per month.

    Raises
    ------
    ValueError
        If ``fitted_models`` is empty.
    """
    if not fitted_models:
        raise ValueError("fitted_models must contain at least one trained model")

    frame = reduce_mem_usage(pd.read_csv(path)).drop_duplicates('parcelid')
    frame['year'] = year_flag

    predictions = {'parcelid': frame['parcelid'].values}
    for month in months:
        frame['month'] = month
        # Slice the feature columns once per month rather than once per model:
        # the slice is identical for every model and the frame is large.
        month_features = frame[feature_cols]
        averaged = None
        for fitted in fitted_models:
            share = fitted.predict(month_features,
                                   num_iteration=fitted.best_iteration) / len(fitted_models)
            averaged = share if averaged is None else averaged + share
        predictions['{}{:02d}'.format(year_label, month)] = averaged

    # `frame` goes out of scope on return, releasing the large properties table.
    return pd.DataFrame(predictions)


# Free the raw training table; tolerate the cell being re-run after the name
# has already been deleted.
try:
    del df_merge
except NameError:
    pass
gc.collect()

feature_cols = list(features.columns)

# Start from the parcel ids listed in the sample submission (one row each).
df_sub = pd.read_csv('./Resources/sample_submission.csv')
df_sub = df_sub.rename(columns = {'ParcelId': 'parcelid'})
df_sub = df_sub[['parcelid']].drop_duplicates('parcelid')

# Score 2016 (year feature = 0) and then 2017 (year feature = 1), attaching
# each year's three month columns to the submission frame.
for path, year_flag, year_label in [('./output/final_sub_2016.csv', 0, 2016),
                                    ('./output/final_sub_2017.csv', 1, 2017)]:
    year_predictions = _predict_fall_months(path, year_flag, year_label, models, feature_cols)
    df_sub = pd.merge(df_sub, year_predictions, how = 'left', on = 'parcelid')
    del year_predictions
    gc.collect()

# NOTE(review): the sample file's id column is 'ParcelId' but the file written
# here keeps the lower-case 'parcelid' header - confirm the consumer accepts it.
df_sub.to_csv('./output/sample_submission.csv', index = False)

Memory usage of dataframe is 1087.64 MB
Memory usage after optimization is: 428.55 MB
Decreased by 60.6%
Memory usage of dataframe is 1227.28 MB
Memory usage after optimization is: 483.58 MB
Decreased by 60.6%


In [12]:
# Display the assembled submission frame: parcelid plus the six predicted-month columns.
df_sub

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,-0.025716,-0.025634,-0.025575,-0.021063,-0.020982,-0.020923
1,10759547,-0.004304,-0.004321,-0.004262,0.000949,0.000932,0.000991
2,10843547,0.002021,0.002021,0.002021,0.003166,0.003158,0.003310
3,10859147,0.019901,0.019901,0.019930,0.018740,0.018740,0.018768
4,10879947,0.008085,0.008152,0.008233,0.008025,0.008092,0.008173
...,...,...,...,...,...,...,...
2985212,168176230,0.003283,0.003245,0.003283,0.027966,0.028009,0.028026
2985213,14273630,0.003313,0.003275,0.003313,0.020822,0.020851,0.020861
2985214,168040630,0.003283,0.003245,0.003283,0.009524,0.009533,0.009555
2985215,168040830,0.003191,0.003153,0.003191,0.111149,0.111058,0.111189
