In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
train_df = pd.read_csv('../Large_output/train_df_new.csv')
train_df['meter_reading'] = np.expm1(train_df["meter_reading"])
train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading']=\
train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading'].mul(0.2931)

In [6]:
target = np.log1p(train_df["meter_reading"])
features = train_df[['building_id',\
 'site_id',\
 'primary_use',\
 'meter',\
 'dayofweek',\
 'square_feet',\
 'year_built',\
 'floor_count',\
 'air_temperature',\
 'cloud_coverage',\
 'dew_temperature',\
 'precip_depth_1_hr',\
 'sea_level_pressure',\
 'wind_direction',\
 'wind_speed',\
 'relative_humidity',\
 'feels_like',\
 'hour']]

In [9]:
categorical_features = ['building_id', 'site_id', 'primary_use', 'meter','dayofweek']

In [10]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [16]:
features=reduce_mem_usage(features,use_float16=True)
X_test=reduce_mem_usage(X_test,use_float16=True)

Memory usage of dataframe is 2640.82 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Memory usage after optimization is: 568.51 MB
Decreased by 78.5%
Memory usage of dataframe is 5726.29 MB
Memory usage after optimization is: 1232.74 MB
Decreased by 78.5%


In [11]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
features=reduce_mem_usage(features,use_float16=True)

Memory usage of dataframe is 2640.82 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Memory usage after optimization is: 568.51 MB
Decreased by 78.5%


In [13]:
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
columns = features.columns
kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=45)
splits=kf.split(features,train_df['month'])
score = 0
NFOLDS=5
feature_importance_df = pd.DataFrame()
out_folder_train_prediction= pd.DataFrame()
models=[]
y_oof = np.zeros(features.shape[0])
for fold_n, (train_index, valid_index) in enumerate(splits):
    dtrain = xgb.DMatrix(features.iloc[train_index],target.iloc[train_index])
    dvalid = xgb.DMatrix(features.iloc[valid_index], target.iloc[valid_index])
    y_valid=target.iloc[valid_index]

    params = {'eval_metric': 'rmse',\
              'objective': 'reg:squarederror',\
              'booster':'gbtree',\
              'nthread' : 4,\
              'eta' : 0.05,\
              'max_leaves': 1800,\
              'max_depth' : 12,\
              'subsample' : 0.1,\
              'colsample_bytree' : 0.9,\
              'colsample_bylevel' : 0.9,\
              'gamma':0,\
              'max_bin':180,\
              'min_child_weight':3.0,\
              'reg_alpha':2.0,\
              'reg_lambda':0.1,
              'tree_method': 'gpu_hist',
              'n_gpus': 2}

    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
    model=xgb.train(params, dtrain, 2000, watchlist, maximize=False, early_stopping_rounds = 200, verbose_eval=50)
    
    y_pred_valid = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

    
    oof_preds=pd.DataFrame()
    oof_preds['train_index']=valid_index
    oof_preds['TARGET']= y_pred_valid
    oof_preds["folder"]=fold_n + 1
    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
    fold_importance_df["fold"] = fold_n + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print(f"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}")
    
    score += np.sqrt(mean_squared_error(y_valid,y_pred_valid)) / NFOLDS
    
    y_oof[valid_index] = y_pred_valid
    
    models.append(model)
          
    del dtrain, dvalid, watchlist, y_valid
    gc.collect()
    
print(f"\nMean rmse = {score}")
# one error here but deletion will make this work and this is not necessary
# print(f"Out of folds rmse = {np.sqrt(mean_squared_error((target, y_oof)))}")

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.09856	valid-rmse:4.11155
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.999885	valid-rmse:1.00408
[100]	train-rmse:0.820916	valid-rmse:0.833524
[150]	train-rmse:0.769534	valid-rmse:0.798885
[200]	train-rmse:0.737511	valid-rmse:0.779073
[250]	train-rmse:0.71614	valid-rmse:0.767724
[300]	train-rmse:0.697023	valid-rmse:0.757964
[350]	train-rmse:0.683936	valid-rmse:0.753678
[400]	train-rmse:0.673517	valid-rmse:0.750265
[450]	train-rmse:0.664385	valid-rmse:0.74754
[500]	train-rmse:0.657322	valid-rmse:0.746102
[550]	train-rmse:0.650531	valid-rmse:0.744552
[600]	train-rmse:0.645264	valid-rmse:0.744046
[650]	train-rmse:0.639814	valid-rmse:0.743107
[700]	train-rmse:0.634518	valid-rmse:0.742406
[750]	train-rmse:0.630082	valid-rmse:0.741935
[800]	train-rmse:0.626031	valid-rmse:0.74203
[850]	train-rmse:0.621977	valid-rmse:0.74196
[900]	train-rmse:0.618614	valid-rmse

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.10666	valid-rmse:4.07735
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.986558	valid-rmse:1.06456
[100]	train-rmse:0.80048	valid-rmse:0.92138
[150]	train-rmse:0.752451	valid-rmse:0.892215
[200]	train-rmse:0.722688	valid-rmse:0.875356
[250]	train-rmse:0.699638	valid-rmse:0.863528
[300]	train-rmse:0.683237	valid-rmse:0.856248
[350]	train-rmse:0.669486	valid-rmse:0.850807
[400]	train-rmse:0.659716	valid-rmse:0.84763
[450]	train-rmse:0.651771	valid-rmse:0.845365
[500]	train-rmse:0.644381	valid-rmse:0.843647
[550]	train-rmse:0.637901	valid-rmse:0.842337
[600]	train-rmse:0.632808	valid-rmse:0.841816
[650]	train-rmse:0.627555	valid-rmse:0.84098
[700]	train-rmse:0.623182	valid-rmse:0.840779
[750]	train-rmse:0.619155	valid-rmse:0.840529
[800]	train-rmse:0.615066	valid-rmse:0.840152
[850]	train-rmse:0.61129	valid-rmse:0.839719
[900]	train-rmse:0.608119	valid-rmse:

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.10341	valid-rmse:4.09708
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.995087	valid-rmse:1.03318
[100]	train-rmse:0.820024	valid-rmse:0.877656
[150]	train-rmse:0.764657	valid-rmse:0.838237
[200]	train-rmse:0.732883	valid-rmse:0.818822
[250]	train-rmse:0.709522	valid-rmse:0.805935
[300]	train-rmse:0.691759	valid-rmse:0.798073
[350]	train-rmse:0.678747	valid-rmse:0.793046
[400]	train-rmse:0.667609	valid-rmse:0.789206
[450]	train-rmse:0.659453	valid-rmse:0.786696
[500]	train-rmse:0.651977	valid-rmse:0.785093
[550]	train-rmse:0.645329	valid-rmse:0.784107
[600]	train-rmse:0.639456	valid-rmse:0.783294
[650]	train-rmse:0.633913	valid-rmse:0.782653
[700]	train-rmse:0.629134	valid-rmse:0.781777
[750]	train-rmse:0.624877	valid-rmse:0.781434
[800]	train-rmse:0.62069	valid-rmse:0.781379
[850]	train-rmse:0.616887	valid-rmse:0.781034
[900]	train-rmse:0.613657	valid-r

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.0994	valid-rmse:4.10682
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:1.00208	valid-rmse:0.994173
[100]	train-rmse:0.823693	valid-rmse:0.820339
[150]	train-rmse:0.770739	valid-rmse:0.779659
[200]	train-rmse:0.738412	valid-rmse:0.758209
[250]	train-rmse:0.715088	valid-rmse:0.744332
[300]	train-rmse:0.697997	valid-rmse:0.735124
[350]	train-rmse:0.685538	valid-rmse:0.72936
[400]	train-rmse:0.675374	valid-rmse:0.72589
[450]	train-rmse:0.666507	valid-rmse:0.723159
[500]	train-rmse:0.658197	valid-rmse:0.720393
[550]	train-rmse:0.650842	valid-rmse:0.71869
[600]	train-rmse:0.645439	valid-rmse:0.717804
[650]	train-rmse:0.64004	valid-rmse:0.71696
[700]	train-rmse:0.63489	valid-rmse:0.716153
[750]	train-rmse:0.630355	valid-rmse:0.715716
[800]	train-rmse:0.626215	valid-rmse:0.715525
[850]	train-rmse:0.622854	valid-rmse:0.715186
[900]	train-rmse:0.61959	valid-rmse:0.7

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:4.0991	valid-rmse:4.1157
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 200 rounds.
[50]	train-rmse:1.00419	valid-rmse:1.00973
[100]	train-rmse:0.82228	valid-rmse:0.843475
[150]	train-rmse:0.768298	valid-rmse:0.806672
[200]	train-rmse:0.739544	valid-rmse:0.790521
[250]	train-rmse:0.716653	valid-rmse:0.777986
[300]	train-rmse:0.699252	valid-rmse:0.771598
[350]	train-rmse:0.683277	valid-rmse:0.765087
[400]	train-rmse:0.671436	valid-rmse:0.761229
[450]	train-rmse:0.662362	valid-rmse:0.758912
[500]	train-rmse:0.65433	valid-rmse:0.757258
[550]	train-rmse:0.647973	valid-rmse:0.756688
[600]	train-rmse:0.6433	valid-rmse:0.75665
[650]	train-rmse:0.638119	valid-rmse:0.75614
[700]	train-rmse:0.633384	valid-rmse:0.755457
[750]	train-rmse:0.629116	valid-rmse:0.755572
[800]	train-rmse:0.624843	valid-rmse:0.755587
[850]	train-rmse:0.621311	valid-rmse:0.755636
[900]	train-rmse:0.617639	valid-rmse:0.75

TypeError: mean_squared_error() missing 1 required positional argument: 'y_pred'

In [19]:
test_df = pd.read_csv('../Large_output/test_1.074.csv')
test_df = test_df[['building_id',\
 'site_id',\
 'primary_use',\
 'meter',\
 'dayofweek',\
 'square_feet',\
 'year_built',\
 'floor_count',\
 'air_temperature',\
 'cloud_coverage',\
 'dew_temperature',\
 'precip_depth_1_hr',\
 'sea_level_pressure',\
 'wind_direction',\
 'wind_speed',\
 'relative_humidity',\
 'feels_like',\
 'hour']]
test_df=reduce_mem_usage(test_df,use_float16=True)

Memory usage of dataframe is 5726.29 MB
Memory usage after optimization is: 1232.74 MB
Decreased by 78.5%


In [20]:
from tqdm import tqdm_notebook as tqdm
# group the prediction into 120 to save the space
def predictions(models, iterations = 120):
    results = []
    set_size = len(test_df)
    batch_size = set_size // iterations
    meter_reading = []
    for i in tqdm(range(iterations)):
        pos = i*batch_size
        temp_df = test_df.iloc[pos : pos+batch_size]
        dtest = xgb.DMatrix(temp_df)
        fold_preds = [np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) for model in models]
        meter_reading.extend(np.mean(fold_preds, axis=0))
    print(len(meter_reading))
    assert len(meter_reading) == set_size
    test_df['meter_reading']=np.clip(meter_reading, 0, a_max=None)
    test_df.loc[(test_df['site_id']==0) & 
                 (test_df['meter']==0),'meter_reading']=test_df.loc[(test_df['site_id']==0) &
                                                            (test_df['meter']==0),'meter_reading'].mul(3.4118)
    submission = pd.read_csv('../Resources/sample_submission.csv')
    submission['meter_reading'] = test_df['meter_reading']
    return submission
df_result = predictions(models)
df_result.to_csv('../Large_output/xgb_new_csv_nomonth.csv',index = False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))


41697600


In [21]:
df_result.head(20)

Unnamed: 0,row_id,meter_reading
0,0,172.321182
1,1,62.858246
2,2,6.296654
3,3,293.548431
4,4,1443.78894
5,5,9.223751
6,6,134.421921
7,7,540.996765
8,8,772.75647
9,9,403.09021
