In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
df_merge = reduce_mem_usage(pd.read_csv('../output/outlier_remove.csv'))
target = df_merge.logerror
features = df_merge.drop(['logerror'], axis = 1)

Memory usage of dataframe is 58.98 MB
Memory usage after optimization is: 23.37 MB
Decreased by 60.4%


In [4]:
df_merge.columns

Index(['N_ValueRatio', 'N_life', 'N_zip_count', 'airconditioningtypeid',
       'basementsqft', 'bedroomcnt', 'buildingqualitytypeid_fill',
       'calculatedbathnbr', 'calculatedfinishedsquarefeet',
       'finishedfloor1squarefeet', 'finishedsquarefeet12',
       'finishedsquarefeet50', 'fips', 'fireplacecnt', 'garagecarcnt',
       'garagetotalsqft', 'hashottuborspa', 'heatingorsystemtypeid',
       'landtaxvaluedollarcnt', 'latitude', 'longitude', 'lotsizesquarefeet',
       'lotsizesquarefeet_refill', 'max_temp', 'min_temp', 'numberofstories',
       'parcelid', 'poolcnt', 'pooltypeid2', 'propertycountylandusecode_fill',
       'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity_fill',
       'regionidcounty', 'regionidneighborhood_fill', 'regionidzip_fill',
       'roomcnt', 'structuretaxvaluedollarcnt', 'taxamount',
       'taxdelinquencyflag', 'taxvaluedollarcnt', 'threequarterbathnbr',
       'unitcnt_fill', 'yardbuildingsqft17', 'month', 'year', 'logerror'],
    

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
columns = features.columns
kf = KFold(n_splits=5)
splits = kf.split(features, target)
score = 0
NFOLDS=5
feature_importance_df = pd.DataFrame()
out_folder_train_prediction= pd.DataFrame()
models=[]
y_oof = np.zeros(features.shape[0])
for fold_n, (train_index, valid_index) in enumerate(splits):
    dtrain = xgb.DMatrix(features.iloc[train_index],target.iloc[train_index])
    dvalid = xgb.DMatrix(features.iloc[valid_index], target.iloc[valid_index])
    y_valid=target.iloc[valid_index]
#     params = {
#     'eta': 0.037,
#     'max_depth': 5,
#     'subsample': 0.80,
#     'objective': 'reg:linear',
#     'eval_metric': 'mae',
#     'lambda': 0.8,   
#     'alpha': 0.4, 
#     'base_score': y_mean,
#     'tree_method': 'gpu_hist',
#     'n_gpus': 2,
#     'silent': 1}


    params = {'eval_metric': 'rmse',\
              'objective': 'reg:squarederror',\
              'booster':'gbtree',\
              'nthread' : 4,\
              'eta' : 0.05,\
              'max_depth' : 9,\
              'subsample' : 0.7119,\
              'colsample_bytree' : 0.5507,\
              'colsample_bylevel' : 0.4715,\
              'gamma':0.8069,\
              'max_bin':330,\
              'min_child_weight':14,\
              'reg_alpha':1.665,\
              'reg_lambda':1.874,
              'tree_method': 'gpu_hist',
              'n_gpus': 2}

    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
    model=xgb.train(params, dtrain, 5000, watchlist, maximize=False, early_stopping_rounds = 200, verbose_eval=50)
    
    y_pred_valid = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

    
    oof_preds=pd.DataFrame()
    oof_preds['train_index']=valid_index
    oof_preds['TARGET']= y_pred_valid
    oof_preds["folder"]=fold_n + 1
    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
    fold_importance_df["fold"] = fold_n + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print(f"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}")
    
    score += np.sqrt(mean_squared_error(y_valid,y_pred_valid)) / NFOLDS
    
    y_oof[valid_index] = y_pred_valid
    
    models.append(model)
          
    del dtrain, dvalid, watchlist, y_valid
    gc.collect()
    
print(f"\nMean rmse = {score}")

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:0.473885	valid-rmse:0.476928
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.088757	valid-rmse:0.093385
[100]	train-rmse:0.079506	valid-rmse:0.084705
[150]	train-rmse:0.07867	valid-rmse:0.084498
[200]	train-rmse:0.077983	valid-rmse:0.084429
[250]	train-rmse:0.077372	valid-rmse:0.084397
[300]	train-rmse:0.076831	valid-rmse:0.084373
[350]	train-rmse:0.076312	valid-rmse:0.084367
[400]	train-rmse:0.075781	valid-rmse:0.084368
[450]	train-rmse:0.075237	valid-rmse:0.084401
[500]	train-rmse:0.074747	valid-rmse:0.084416
Stopping. Best iteration:
[316]	train-rmse:0.076689	valid-rmse:0.084355

Fold 1 | rmse: 0.08435534685850143
[0]	train-rmse:0.47396	valid-rmse:0.476675
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[50]	train-rmse:0.089757	valid-rmse:0.089657
[100]	train-rmse:0.080622	valid-rmse:0.080458
[150]	train-rmse:0.079785	valid-rmse:0.080233
[200]	train-rmse:0.079093	valid-rmse:0.080159
[250]	train-rmse:0.078439	valid-rmse:0.080116
[300]	train-rmse:0.077853	valid-rmse:0.080083
[350]	train-rmse:0.077231	valid-rmse:0.080097
[400]	train-rmse:0.076671	valid-rmse:0.080099
[450]	train-rmse:0.076106	valid-rmse:0.080106
[500]	train-rmse:0.075601	valid-rmse:0.080132
Stopping. Best iteration:
[325]	train-rmse:0.077536	valid-rmse:0.080074

Fold 2 | rmse: 0.08007410913705826
[0]	train-rmse:0.47553	valid-rmse:0.469854
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[50]	train-rmse:0.089091	valid-rmse:0.090079
[100]	train-rmse:0.079804	valid-rmse:0.083223
[150]	train-rmse:0.07893	valid-rmse:0.083168
[200]	train-rmse:0.078225	valid-rmse:0.083125
[250]	train-rmse:0.07761	valid-rmse:0.083124
[300]	train-rmse:0.077031	valid-rmse:0.083124
[350]	train-rmse:0.0764	valid-rmse:0.083129
[400]	train-rmse:0.075866	valid-rmse:0.083149
[450]	train-rmse:0.075367	valid-rmse:0.083172
[500]	train-rmse:0.074878	valid-rmse:0.083199
Stopping. Best iteration:
[312]	train-rmse:0.076877	valid-rmse:0.08312

Fold 3 | rmse: 0.08311984688043594
[0]	train-rmse:0.473723	valid-rmse:0.477745
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[50]	train-rmse:0.089052	valid-rmse:0.093123
[100]	train-rmse:0.079826	valid-rmse:0.083591
[150]	train-rmse:0.078981	valid-rmse:0.083342
[200]	train-rmse:0.078279	valid-rmse:0.083306
[250]	train-rmse:0.077634	valid-rmse:0.083319
[300]	train-rmse:0.077051	valid-rmse:0.083333
[350]	train-rmse:0.076448	valid-rmse:0.083348
[400]	train-rmse:0.075858	valid-rmse:0.083354
Stopping. Best iteration:
[202]	train-rmse:0.078265	valid-rmse:0.083303

Fold 4 | rmse: 0.08330345153808594
[0]	train-rmse:0.475261	valid-rmse:0.471181
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[50]	train-rmse:0.090113	valid-rmse:0.083395
[100]	train-rmse:0.080947	valid-rmse:0.080262
[150]	train-rmse:0.080058	valid-rmse:0.08128
[200]	train-rmse:0.079373	valid-rmse:0.081501
[250]	train-rmse:0.078702	valid-rmse:0.081521
Stopping. Best iteration:
[73]	train-rmse:0.082254	valid-rmse:0.07923

Fold 5 | rmse: 0.07923046499490738

Mean rmse = 0.08201664388179779


In [6]:
feature_importance_df

Unnamed: 0,feature,importance,fold
17,lotsizesquarefeet,4046,1
25,N_life,3693,1
9,N_ValueRatio,3644,1
12,calculatedfinishedsquarefeet,3413,1
2,finishedsquarefeet12,3409,1
...,...,...,...
45,threequarterbathnbr,37,5
30,pooltypeid2,34,5
28,regionidcounty,17,5
39,fips,10,5


In [7]:
df_sub_2016 = pd.read_csv('../output/final_sub_2016.csv').drop_duplicates('parcelid')
df_sub_2016['year'] = 0
results = {}
for month in [10,11,12]:
    df_sub_2016['month'] = month
    dtest = xgb.DMatrix(df_sub_2016)
    for i in models:
        if  month not in results.keys():
            results[month] = i.predict(dtest, ntree_limit=model.best_ntree_limit) / len(models)
        else:
            results[month] += i.predict(dtest, ntree_limit=model.best_ntree_limit) / len(models)
for i in results.keys():
    df_sub_2016[str(i)] = results[i]
df_sub = pd.read_csv('../Resources/sample_submission.csv')
df_sub = df_sub.rename(columns = {'ParcelId': 'parcelid'})
df_sub = pd.merge(df_sub[['parcelid']], df_sub_2016[['parcelid', '10', '11', '12']].drop_duplicates('parcelid'),
                  how = 'left', on = 'parcelid')
df_sub = df_sub.rename(columns = {'10': '201610', '11': '201611', '12': '201612'}).drop_duplicates('parcelid')
del df_sub_2016
gc.collect()
df_sub_2017 = pd.read_csv('../output/final_sub_2017.csv')
df_sub_2017['year'] = 1
results = {}
for month in [10,11,12]:
    df_sub_2017['month'] = month
    dtest = xgb.DMatrix(df_sub_2017)
    for i in models:
        if  month not in results.keys():
            results[month] = i.predict(dtest, ntree_limit=model.best_ntree_limit) / len(models)
        else:
            results[month] += i.predict(dtest, ntree_limit=model.best_ntree_limit) / len(models)
for i in results.keys():
    df_sub_2017[str(i)] = results[i]
df_sub = pd.merge(df_sub[['parcelid', '201610', '201611', '201612']], 
                  df_sub_2017[['parcelid', '10', '11', '12']].drop_duplicates('parcelid'), 
                  how = 'left', on = 'parcelid')
df_sub = df_sub.rename(columns = {'10': '201710', '11': '201711', '12': '201712'})
del df_sub_2017
gc.collect()
df_sub.to_csv('../output/submission/xgb5_opt.csv', index = False)

In [8]:
df_sub

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,0.035147,0.036466,0.036466,0.033464,0.034782,0.034782
1,10759547,0.019437,0.020414,0.020414,0.019644,0.020620,0.020620
2,10843547,0.033776,0.034347,0.034347,0.038337,0.038907,0.038907
3,10859147,0.037062,0.037387,0.037387,0.031832,0.032156,0.032156
4,10879947,0.018197,0.018522,0.018522,0.015649,0.015973,0.015973
...,...,...,...,...,...,...,...
2985212,168176230,0.052423,0.052933,0.052933,0.029887,0.030141,0.030141
2985213,14273630,0.048077,0.048402,0.048402,0.019347,0.019347,0.019347
2985214,168040630,0.052423,0.052933,0.052933,0.025535,0.025720,0.025720
2985215,168040830,0.051158,0.051668,0.051668,0.034702,0.035182,0.035182
