In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [3]:
df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')
df_train = reduce_mem_usage(df_train,use_float16=True)

Memory usage of dataframe is 2638.86 MB
Memory usage after optimization is: 733.78 MB
Decreased by 72.2%


In [4]:
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)
target = np.log1p(df_train["meter_reading"])
features = df_train[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [6]:
del df_train, train_engineer
gc.collect()

7

In [7]:
columns = features.columns
kf = KFold(n_splits=3)
splits = kf.split(features, target)
score = 0
NFOLDS=3
feature_importance_df = pd.DataFrame()
out_folder_train_prediction= pd.DataFrame()
models=[]

# run xgb with different params

for fold_n, (train_index, valid_index) in enumerate(splits):
    dtrain = xgb.DMatrix(features.iloc[train_index],target.iloc[train_index])
    dvalid = xgb.DMatrix(features.iloc[valid_index], target.iloc[valid_index])
    y_valid=target.iloc[valid_index]
# 1.14
#     params = {'eval_metric': 'rmse',\
#               'objective': 'reg:squarederror',\
#               'booster':'gbtree',\
#               'nthread' : 4,\
#               'eta' : 0.05,\
#               'max_leaves': 2000,\
#               'max_depth' : 12,\
#               'subsample' : 0.9,\
#               'colsample_bytree' : 0.9,\
#               'colsample_bylevel' : 0.9,\
#              'gamma':1.0,\
#              'max_bin':500,\
#              'min_child_weight':3.0,\
#              'reg_alpha':0.1,\
#              'reg_lambda':0.1,\
#              'n_gpus': 2}

    params = {'eval_metric': 'rmse',\
              'objective': 'reg:squarederror',\
              'booster':'gbtree',\
              'nthread' : 4,\
              'eta' : 0.05,\
              'max_leaves': 1800,\
              'max_depth' : 12,\
              'subsample' : 0.1,\
              'colsample_bytree' : 0.9,\
              'colsample_bylevel' : 0.9,\
              'gamma':0,\
              'max_bin':180,\
              'min_child_weight':3.0,\
              'reg_alpha':2.0,\
              'reg_lambda':0.1,
              'tree_method': 'gpu_hist'
              'n_gpus': 2}
 

    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
    model=xgb.train(params, dtrain, 1000, watchlist, maximize=False, early_stopping_rounds = 200, verbose_eval=50)
    
    y_pred_valid = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

    
    oof_preds=pd.DataFrame()
    oof_preds['train_index']=valid_index
    oof_preds['TARGET']= y_pred_valid
    oof_preds["folder"]=fold_n + 1
    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df = pd.DataFrame(model.get_fscore().items(), 
                                      columns=['feature','importance']).sort_values('importance', ascending=False)
    fold_importance_df["fold"] = fold_n + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print(f"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}")
    
    score += np.sqrt(mean_squared_error(y_valid,y_pred_valid)) / NFOLDS
    
    models.append(model)
          
    del dtrain, dvalid, watchlist, y_valid
    gc.collect()
    
print(f"\nMean rmse = {score}")

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.09607	valid-rmse:4.10874
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:1.01446	valid-rmse:1.08258
[100]	train-rmse:0.827756	valid-rmse:0.916038
[150]	train-rmse:0.771476	valid-rmse:0.880163
[200]	train-rmse:0.736928	valid-rmse:0.863381
[250]	train-rmse:0.713559	valid-rmse:0.854703
[300]	train-rmse:0.69682	valid-rmse:0.85111
[350]	train-rmse:0.684234	valid-rmse:0.84909
[400]	train-rmse:0.673897	valid-rmse:0.848058
[450]	train-rmse:0.665394	valid-rmse:0.848331
[500]	train-rmse:0.658587	valid-rmse:0.848667
[550]	train-rmse:0.65175	valid-rmse:0.848999
[600]	train-rmse:0.64658	valid-rmse:0.849286
Stopping. Best iteration:
[409]	train-rmse:0.672314	valid-rmse:0.847955

Fold 1 | rmse: 0.8479544520378113


  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.08945	valid-rmse:4.11582
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.954529	valid-rmse:1.0678
[100]	train-rmse:0.798493	valid-rmse:0.923546
[150]	train-rmse:0.754606	valid-rmse:0.883469
[200]	train-rmse:0.729942	valid-rmse:0.865462
[250]	train-rmse:0.712076	valid-rmse:0.853953
[300]	train-rmse:0.698203	valid-rmse:0.847165
[350]	train-rmse:0.686687	valid-rmse:0.842022
[400]	train-rmse:0.676455	valid-rmse:0.838169
[450]	train-rmse:0.668135	valid-rmse:0.836115
[500]	train-rmse:0.661344	valid-rmse:0.835401
[550]	train-rmse:0.655651	valid-rmse:0.83436
[600]	train-rmse:0.650737	valid-rmse:0.834422
[650]	train-rmse:0.645911	valid-rmse:0.834574
[700]	train-rmse:0.641362	valid-rmse:0.834455
[750]	train-rmse:0.637191	valid-rmse:0.834429
[800]	train-rmse:0.633442	valid-rmse:0.834662
Stopping. Best iteration:
[625]	train-rmse:0.647934	valid-rmse:0.834141

Fold 2 

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.10932	valid-rmse:4.0775
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.984811	valid-rmse:1.06839
[100]	train-rmse:0.794776	valid-rmse:0.937235
[150]	train-rmse:0.736237	valid-rmse:0.907638
[200]	train-rmse:0.706902	valid-rmse:0.89719
[250]	train-rmse:0.683621	valid-rmse:0.889159
[300]	train-rmse:0.668755	valid-rmse:0.886005
[350]	train-rmse:0.654421	valid-rmse:0.883309
[400]	train-rmse:0.643346	valid-rmse:0.881163
[450]	train-rmse:0.63436	valid-rmse:0.880394
[500]	train-rmse:0.627085	valid-rmse:0.879984
[550]	train-rmse:0.621459	valid-rmse:0.879883
[600]	train-rmse:0.616395	valid-rmse:0.880216
[650]	train-rmse:0.611315	valid-rmse:0.880725
[700]	train-rmse:0.606582	valid-rmse:0.881058
[750]	train-rmse:0.602257	valid-rmse:0.881479
Stopping. Best iteration:
[553]	train-rmse:0.621229	valid-rmse:0.879835

Fold 3 | rmse: 0.879835307598114

Mean rmse = 0.853976

In [8]:
out_folder_train_prediction.to_csv('out_folder_train_prediction_xgb.csv',index= False)

In [9]:
# xgb 
(1.14988+1.07825+ 1.15351)/3

1.1272133333333334

In [10]:
test_feature = pd.read_csv('../../Large_output/test_merge.csv')
test_feature = reduce_mem_usage(test_feature)
test_feature = features_engineering(test_feature)
row_ids = test_feature[['row_id']]
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

Memory usage of dataframe is 4771.91 MB
Memory usage after optimization is: 1671.69 MB
Decreased by 65.0%


In [11]:
dtest = xgb.DMatrix(test_feature)

In [12]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) / len(models)
    else:
        results += np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) / len(models)
    del model
    gc.collect()

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
test_feature['meter_reading']=np.clip(results, 0, a_max=None)
test_feature.loc[(test_feature['site_id']==0) & 
                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &
                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)
df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})
df_result.to_csv('../../Large_output/xgb_bayes_clean.csv',index = False)