In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after, plus the relative saving, is printed.

    Per-column handling, decided from the NumPy dtype kind:

    * signed / unsigned integers -> the narrowest signed / unsigned integer
      type whose (inclusive) range contains the column's min and max;
    * floats -> float16 (only when ``use_float16`` is true and the range
      allows it), else float32 when the range allows it, else float64.
      Note that float32 keeps only ~7 significant digits, so large
      integer-like floats may lose precision;
    * object -> pandas ``category``;
    * everything else (bool, datetime, timedelta, complex, categorical and
      other pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate targets, narrowest first, so the first match is the smallest fit.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Categorical and other pandas extension dtypes are not NumPy dtypes;
        # they have no meaningful numeric downcast here, so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind

        if kind == "O":
            df[col] = df[col].astype("category")

        elif kind in ("i", "u"):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values of a type do fit in it.
                # For an empty column min/max are NaN, every comparison is
                # False, and the column is simply left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif kind == "f":
            c_min = df[col].min()
            c_max = df[col].max()
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Reached for out-of-range values, infinities and all-NaN columns.
                df[col] = df[col].astype(np.float64)

        # Remaining kinds (bool, datetime, timedelta, complex, ...) are already
        # compact or cannot be range-checked numerically: leave them unchanged.

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
# Training table produced by an earlier step of the pipeline
# (presumably already outlier-filtered, judging by the file name).
df_drop = pd.read_csv('../output/outlier_remove.csv')

# Columns handed to LightGBM as categorical features in the training cell.
categorical_features = [
    'airconditioningtypeid',
    'hashottuborspa',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'fips',
    'regionidcounty',
    'buildingqualitytypeid_fill',
    'regionidcity_fill',
    'year',
    'regionidneighborhood_fill',
    'taxdelinquencyflag',
]

# Regression target is the `logerror` column; every other column is a feature.
target = df_drop['logerror']
features = df_drop.drop(columns=['logerror'])

In [10]:
from sklearn.model_selection import train_test_split

score = 0
models = []  # trained boosters, one per ensemble member; read by the prediction cell

# LightGBM hyper-parameters shared by every ensemble member.
# Only `random_seed` is changed between members (set inside the loop).
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 500,
    "learning_rate": 0.002515,
    'bagging_fraction': 0.8297,
    "reg_lambda": 0.1052,
    'reg_alpha': 0.1046,
    "metric": "rmse",
    'max_depth': 10,
    'min_child_weight': 27,
    'verbose': -1,
    'min_split_gain': 0.08138,
    'subsample_freq': 1,
    'sub_feature': 0.4492,
}
num_ensembles = 5

# One fixed 80/20 split; the 20% part is used for monitoring and early stopping.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=99
)

# ensemble models
for i in range(num_ensembles):
    print("\nTraining (ensemble): %d ..." % (i))
    params['random_seed'] = i

    # Fresh Dataset objects for each member, built the same way for both splits.
    d_training, d_test = (
        lgb.Dataset(
            data,
            label=labels,
            categorical_feature=categorical_features,
            free_raw_data=False,
        )
        for data, labels in ((X_train, y_train), (X_test, y_test))
    )

    # Log every 25 rounds; stop once the validation metric has not improved
    # for 50 consecutive rounds (at most 5000 rounds).
    model = lgb.train(
        params,
        train_set=d_training,
        num_boost_round=5000,
        valid_sets=[d_training, d_test],
        early_stopping_rounds=50,
        verbose_eval=25,
    )
    models.append(model)


Training (ensemble): 0 ...
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.0828857	valid_1's rmse: 0.0835359
[50]	training's rmse: 0.0827942	valid_1's rmse: 0.0834708
[75]	training's rmse: 0.0827064	valid_1's rmse: 0.0834091
[100]	training's rmse: 0.082619	valid_1's rmse: 0.0833517
[125]	training's rmse: 0.0825447	valid_1's rmse: 0.0833013
[150]	training's rmse: 0.0824701	valid_1's rmse: 0.0832508
[175]	training's rmse: 0.0824003	valid_1's rmse: 0.0832047
[200]	training's rmse: 0.0823315	valid_1's rmse: 0.083162
[225]	training's rmse: 0.0822651	valid_1's rmse: 0.0831209
[250]	training's rmse: 0.0822065	valid_1's rmse: 0.0830839
[275]	training's rmse: 0.0821507	valid_1's rmse: 0.0830493
[300]	training's rmse: 0.0820955	valid_1's rmse: 0.0830166
[325]	training's rmse: 0.0820495	valid_1's rmse: 0.0829869
[350]	training's rmse: 0.0819969	valid_1's rmse: 0.0829573
[375]	training's rmse: 0.08195	valid_1's rmse: 0.0829288
[400]	training's rmse: 0.0819052	

[525]	training's rmse: 0.081706	valid_1's rmse: 0.0827966
[550]	training's rmse: 0.0816718	valid_1's rmse: 0.082779
[575]	training's rmse: 0.0816408	valid_1's rmse: 0.0827618
[600]	training's rmse: 0.0816112	valid_1's rmse: 0.0827466
[625]	training's rmse: 0.081583	valid_1's rmse: 0.082732
[650]	training's rmse: 0.081555	valid_1's rmse: 0.0827183
[675]	training's rmse: 0.0815279	valid_1's rmse: 0.0827056
[700]	training's rmse: 0.0814999	valid_1's rmse: 0.0826933
[725]	training's rmse: 0.0814697	valid_1's rmse: 0.0826802
[750]	training's rmse: 0.081445	valid_1's rmse: 0.0826686
[775]	training's rmse: 0.0814199	valid_1's rmse: 0.0826576
[800]	training's rmse: 0.0813976	valid_1's rmse: 0.0826467
[825]	training's rmse: 0.0813714	valid_1's rmse: 0.0826367
[850]	training's rmse: 0.0813469	valid_1's rmse: 0.0826269
[875]	training's rmse: 0.0813286	valid_1's rmse: 0.0826181
[900]	training's rmse: 0.0813097	valid_1's rmse: 0.0826103
[925]	training's rmse: 0.0812893	valid_1's rmse: 0.0826013
[95

[575]	training's rmse: 0.081644	valid_1's rmse: 0.0827616
[600]	training's rmse: 0.0816121	valid_1's rmse: 0.082747
[625]	training's rmse: 0.0815799	valid_1's rmse: 0.0827316
[650]	training's rmse: 0.0815536	valid_1's rmse: 0.0827183
[675]	training's rmse: 0.0815199	valid_1's rmse: 0.0827046
[700]	training's rmse: 0.0814959	valid_1's rmse: 0.0826926
[725]	training's rmse: 0.0814684	valid_1's rmse: 0.0826807
[750]	training's rmse: 0.0814466	valid_1's rmse: 0.0826716
[775]	training's rmse: 0.0814216	valid_1's rmse: 0.0826607
[800]	training's rmse: 0.0813965	valid_1's rmse: 0.0826497
[825]	training's rmse: 0.0813758	valid_1's rmse: 0.0826387
[850]	training's rmse: 0.0813536	valid_1's rmse: 0.0826307
[875]	training's rmse: 0.0813367	valid_1's rmse: 0.0826218
[900]	training's rmse: 0.081314	valid_1's rmse: 0.0826126
[925]	training's rmse: 0.0812925	valid_1's rmse: 0.082603
[950]	training's rmse: 0.0812697	valid_1's rmse: 0.0825947
[975]	training's rmse: 0.0812537	valid_1's rmse: 0.0825875
[

[800]	training's rmse: 0.0813934	valid_1's rmse: 0.082641
[825]	training's rmse: 0.0813672	valid_1's rmse: 0.0826311
[850]	training's rmse: 0.0813426	valid_1's rmse: 0.0826224
[875]	training's rmse: 0.0813209	valid_1's rmse: 0.0826147
[900]	training's rmse: 0.0812995	valid_1's rmse: 0.0826056
[925]	training's rmse: 0.0812796	valid_1's rmse: 0.0825979
[950]	training's rmse: 0.0812604	valid_1's rmse: 0.08259
[975]	training's rmse: 0.0812414	valid_1's rmse: 0.0825823
[1000]	training's rmse: 0.0812235	valid_1's rmse: 0.0825754
[1025]	training's rmse: 0.0812063	valid_1's rmse: 0.082569
[1050]	training's rmse: 0.0811913	valid_1's rmse: 0.0825628
[1075]	training's rmse: 0.0811708	valid_1's rmse: 0.0825571
[1100]	training's rmse: 0.0811568	valid_1's rmse: 0.082551
[1125]	training's rmse: 0.0811392	valid_1's rmse: 0.0825457
[1150]	training's rmse: 0.0811239	valid_1's rmse: 0.0825413
[1175]	training's rmse: 0.0811116	valid_1's rmse: 0.0825362
[1200]	training's rmse: 0.0810981	valid_1's rmse: 0.0

[1200]	training's rmse: 0.0811057	valid_1's rmse: 0.0825358
[1225]	training's rmse: 0.0810938	valid_1's rmse: 0.0825313
[1250]	training's rmse: 0.0810802	valid_1's rmse: 0.0825265
[1275]	training's rmse: 0.0810673	valid_1's rmse: 0.0825224
[1300]	training's rmse: 0.0810567	valid_1's rmse: 0.0825179
[1325]	training's rmse: 0.0810449	valid_1's rmse: 0.0825146
[1350]	training's rmse: 0.0810342	valid_1's rmse: 0.0825114
[1375]	training's rmse: 0.0810233	valid_1's rmse: 0.0825083
[1400]	training's rmse: 0.0810129	valid_1's rmse: 0.0825046
[1425]	training's rmse: 0.081002	valid_1's rmse: 0.0825022
[1450]	training's rmse: 0.0809893	valid_1's rmse: 0.0824994
[1475]	training's rmse: 0.0809798	valid_1's rmse: 0.0824966
[1500]	training's rmse: 0.0809709	valid_1's rmse: 0.0824944
[1525]	training's rmse: 0.0809613	valid_1's rmse: 0.0824917
[1550]	training's rmse: 0.0809543	valid_1's rmse: 0.0824897
[1575]	training's rmse: 0.0809454	valid_1's rmse: 0.0824879
[1600]	training's rmse: 0.0809375	valid_1

In [14]:
def _predict_year(csv_path, year_flag, year_label, models, feature_cols, months=(10, 11, 12)):
    """
    Build the ensemble-averaged monthly predictions for one submission year.

    Parameters
    ----------
    csv_path : str
        CSV holding one row of features per parcel (must contain `parcelid`).
    year_flag : int
        Value written to the `year` feature column before predicting
        (0 for 2016, 1 for 2017 in this notebook; presumably the same
        encoding the training table uses -- confirm against the feature step).
    year_label : str
        Prefix of the output column names, e.g. '2016' -> '201610'.
    models : list
        Trained LightGBM boosters; their predictions are averaged.
    feature_cols : list of str
        Feature columns, in the order used for training.
    months : iterable of int
        Values written to the `month` feature column, one prediction each.

    Returns
    -------
    pandas.DataFrame
        `parcelid` plus one `<year_label><month>` column per month,
        with one row per distinct parcel.

    Raises
    ------
    ValueError
        If `models` is empty.
    """
    if not models:
        raise ValueError("no trained models available for prediction")

    # Duplicated parcels are dropped up front so that no predictions are
    # computed for rows that would be discarded at merge time anyway.
    frame = reduce_mem_usage(pd.read_csv(csv_path)).drop_duplicates('parcelid')
    frame['year'] = year_flag

    predictions = frame[['parcelid']].copy()
    for month in months:
        frame['month'] = month
        # Slice the feature matrix once per month instead of once per model.
        month_features = frame[feature_cols]
        averaged = np.zeros(len(frame))
        for booster in models:
            averaged += booster.predict(
                month_features, num_iteration=booster.best_iteration
            ) / len(models)
        predictions['{}{}'.format(year_label, month)] = averaged
    return predictions


feature_cols = list(features.columns)

# Parcel ids expected by the competition, in sample-submission order.
df_sub = pd.read_csv('../Resources/sample_submission.csv')
df_sub = df_sub.rename(columns = {'ParcelId': 'parcelid'})
df_sub = df_sub[['parcelid']].drop_duplicates('parcelid')

# Left-merge each year's predictions onto the expected ids; parcels without
# a prediction row keep NaN. Column order: 2016 months, then 2017 months.
for csv_path, year_flag, year_label in [
    ('../output/final_sub_2016.csv', 0, '2016'),
    ('../output/final_sub_2017.csv', 1, '2017'),
]:
    df_sub = pd.merge(
        df_sub,
        _predict_year(csv_path, year_flag, year_label, models, feature_cols),
        how = 'left', on = 'parcelid',
    )
    # The per-year frame is large; reclaim it before loading the next one.
    gc.collect()

df_sub.to_csv('../output/submission/lgb_en5_opt.csv', index = False)

Memory usage of dataframe is 1087.64 MB
Memory usage after optimization is: 428.55 MB
Decreased by 60.6%
Memory usage of dataframe is 1227.28 MB
Memory usage after optimization is: 483.58 MB
Decreased by 60.6%
