# Imports


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import gc
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Functions

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def building_feature_engineering(df):
    """
    Drop floor_count;Fill the missing year_built values by the year_built for the same building_id.
    Encode primary_use
    """
    # 1-Drop floor_count due to highly correlated to square_feet
    df=df.drop('floor_count',axis=1)
    
    #2-Imputate year_built
    df=df.set_index('building_id')
    df_filled=pd.DataFrame(df.groupby('building_id')['year_built'].agg(np.mean).transform(lambda x:x.fillna(method='ffill')),columns=['year_built'])
    df.update(df_filled,overwrite=True)
    df=df.reset_index()
    
    #3 Encode categorical variable--primary_use
    le = LabelEncoder()
    df['primary_use'] = le.fit_transform(df['primary_use'])
    
    return df

In [4]:
def add_times_series_feature(df):
    df= df.set_index('timestamp')
    df.index = pd.to_datetime(df.index)
    df['hour']=df.index.hour
    df['day'] = df.index.day
    df['weekday']=df.index.weekday
    df['month']=df.index.month
    df=df.reset_index()
    return df

In [5]:
def weather_info_imputation(weather_df):
    """
    Impute missing values from weather dataset by the mean value of the day of the month (forward fill)
    """

    weather_df = weather_df.set_index(['site_id','month','day'])
    cloud_coverage_filler = np.rint(weather_df.groupby(['site_id','month','day'])['cloud_coverage'].mean())
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=['cloud_coverage'])
    weather_df.update(cloud_coverage_filler,overwrite=False)

    precip_depth_filler = weather_df.groupby(['site_id','month','day'])['precip_depth_1_hr'].mean()
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])
    weather_df.update(precip_depth_filler,overwrite=False)

    sea_filler = weather_df.groupby(['site_id','month','day'])['sea_level_pressure'].mean()
    sea_filler = pd.DataFrame(sea_filler.fillna(method='ffill'),columns=['sea_level_pressure'])
    weather_df.update(sea_filler,overwrite=False)
    
    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','month','day'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','month','day'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)
    
    
   
    air_temperature_filler = weather_df.groupby(['site_id','month','day'])['air_temperature'].mean()
    air_temperature_filler = pd.DataFrame(air_temperature_filler.fillna(method='ffill'),columns=['air_temperature'])
    weather_df.update( air_temperature_filler,overwrite=False)
                                                                           
    dew_temperature_filler = weather_df.groupby(['site_id','month','day'])[ 'dew_temperature'].mean()
    dew_temperature_filler = pd.DataFrame(dew_temperature_filler.fillna(method='ffill'),columns=['dew_temperature'])                                                                      
    weather_df.update( dew_temperature_filler,overwrite=False)
    
    wrong_temp=weather_df[weather_df['air_temperature'] < weather_df['dew_temperature']][['air_temperature','dew_temperature']]
    for i in range(len(wrong_temp)):
        wrong_temp['air_temperature'].iloc[i]=wrong_temp['dew_temperature'].iloc[i]
       
    wrong_temp=wrong_temp.drop('dew_temperature',axis=1)
    weather_df.update(wrong_temp.values,overwrite=False)
    
    weather_df=weather_df.reset_index()
    weather_df=weather_df.drop(['hour','day','weekday','month'],axis=1)
    
    
    return weather_df


In [6]:
def train_engineering(df):
    # Remove Outlier building_id=1099
    #building_1099=df[df['building_id']==1099]
    #df=df[df['building_id']!=1099]
    
    # Change timestamp to datetime type and add time features
    df= df.set_index('timestamp')
    df.index = pd.to_datetime(df.index)
    df['hour'] = df.index.day
    df['day'] = df.index.day
    df['weekday']=df.index.weekday

    # For more accurate numerical calculations
    df['year_built'] = df['year_built']-1900
    df['square_feet'] = df['square_feet']

    # Target

    df=df.reset_index()
    df['meter_reading'] = np.log1p(df['meter_reading'])
    
    df=df.reset_index()
    
    return df

# Loading Preprocessed Training Data

In [9]:
features=pd.read_csv('train_features.csv')

In [10]:
features.shape

(20198534, 17)

In [11]:
features.tail()

Unnamed: 0.1,Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday
20198529,20198529,1444,0,15,1,19619,14.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198530,20198530,1445,0,15,0,4298,14.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198531,20198531,1446,0,15,1,11265,97.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198532,20198532,1447,0,15,4,29775,101.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198533,20198533,1448,0,15,6,92271,101.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5


In [12]:
features=features.iloc[:20198533]

In [13]:
features.shape

(20198533, 17)

In [14]:
features.tail()

Unnamed: 0.1,Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday
20198528,20198528,1443,0,15,0,40311,13.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198529,20198529,1444,0,15,1,19619,14.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198530,20198530,1445,0,15,0,4298,14.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198531,20198531,1446,0,15,1,11265,97.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5
20198532,20198532,1447,0,15,4,29775,101.0,1.7,2.0,-5.6,-1.0,1008.5,180.0,8.8,31,31,5


In [15]:
target=pd.read_csv('train_traget.csv')

In [16]:
target.tail()

Unnamed: 0,0,0.0
20198528,20198529,2.277267
20198529,20198530,1.762159
20198530,20198531,0.0
20198531,20198532,5.078761
20198532,20198533,1.348073


In [17]:
target.shape

(20198533, 2)

In [18]:
features.columns

Index(['Unnamed: 0', 'building_id', 'meter', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'hour', 'day', 'weekday'],
      dtype='object')

In [19]:
features=features.drop(['Unnamed: 0'],axis=1)

In [20]:
features.shape

(20198533, 16)

In [21]:
target=target.drop(['0'],axis=1)

In [22]:
features.shape

(20198533, 16)

In [23]:
target.shape

(20198533, 1)

# Train KFold LightGBM

In [24]:
categorical_features = ['hour','meter','weekday','primary_use','site_id','building_id']
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

kf = KFold(n_splits=3)
oof = np.zeros(len(features))
models = []
for train_index,test_index in kf.split(features):
    train_features = features.loc[train_index]
    train_target = target.loc[train_index]
    
    test_features = features.loc[test_index]
    test_target = target.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    oof[test_index] = model.predict(test_features, num_iteration=model.best_iteration)
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.16116	valid_1's rmse: 1.64688
[50]	training's rmse: 0.956806	valid_1's rmse: 1.5972
[75]	training's rmse: 0.876782	valid_1's rmse: 1.59767
[100]	training's rmse: 0.830586	valid_1's rmse: 1.59792
Early stopping, best iteration is:
[56]	training's rmse: 0.932023	valid_1's rmse: 1.59472
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.18349	valid_1's rmse: 1.33025
[50]	training's rmse: 0.971206	valid_1's rmse: 1.22238
[75]	training's rmse: 0.894929	valid_1's rmse: 1.20837
[100]	training's rmse: 0.849285	valid_1's rmse: 1.20409
[125]	training's rmse: 0.817318	valid_1's rmse: 1.20023
[150]	training's rmse: 0.793675	valid_1's rmse: 1.19854
[175]	training's rmse: 0.777309	valid_1's rmse: 1.19764
[200]	training's rmse: 0.761643	valid_1's rmse: 1.19661
[225]	training's rmse: 0.749959	valid_1's rmse: 1.19637
[250]	training's rmse: 0.738836	valid_1's rmse: 1.19617
[275]	trainin

In [25]:
print('oof_RMSE : ' ,np.sqrt(mean_squared_error(oof, target.values)))

oof_RMSE :  1.4241667246553864


In [26]:
del features, target
gc.collect()

61

# Test Set Preprocessing

In [27]:
test =pd.read_csv('test.csv',parse_dates=['timestamp'])

In [28]:
weather_test = pd.read_csv('weather_test.csv')

In [29]:
building = pd.read_csv('building_metadata.csv')

In [30]:
row_ids = test['row_id']

In [31]:
test=test.drop('row_id', axis=1)

In [32]:
building_processed=building_feature_engineering(building)

In [33]:
weather_test_processed=add_times_series_feature(weather_test)

In [34]:
weather_test_processed=weather_info_imputation(weather_test_processed)

In [35]:
test=test.merge(building_processed,left_on = ['building_id'], right_on=['building_id'],how='left')
test=test.merge(weather_test_processed, left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')

In [36]:
test.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2017-01-01,0,0,7432,2008.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
1,1,0,2017-01-01,0,0,2720,2004.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
2,2,0,2017-01-01,0,0,5376,1991.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
3,3,0,2017-01-01,0,0,23685,2002.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
4,4,0,2017-01-01,0,0,116607,1975.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6


In [37]:
del building_processed
gc.collect()

361

In [38]:
del weather_test_processed
gc.collect()

27

In [39]:
test.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2017-01-01,0,0,7432,2008.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
1,1,0,2017-01-01,0,0,2720,2004.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
2,2,0,2017-01-01,0,0,5376,1991.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
3,3,0,2017-01-01,0,0,23685,2002.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6
4,4,0,2017-01-01,0,0,116607,1975.0,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6


same feature engineering process on test set

In [40]:
test_processed= test.set_index('timestamp')
test_processed.index = pd.to_datetime(test_processed.index)
test_processed['hour'] = test_processed.index.hour
test_processed['day'] = test_processed.index.day
test_processed['weekday']=test_processed.index.weekday

In [41]:
# For more accurate numerical calculations
test_processed['year_built'] = test_processed['year_built']-1900
test_processed['square_feet'] = test_processed['square_feet']

In [42]:
del test

In [43]:
gc.collect()

141

In [44]:
del building
gc.collect()

27

In [45]:
del weather_test
gc.collect()

20

In [46]:
test_processed=test_processed.reset_index()

In [47]:
test_processed=test_processed.drop('timestamp',axis=1)

In [48]:
reduce_mem_usage(test_processed)

Memory usage of dataframe is 4930.98 MB
Memory usage after optimization is: 1749.70 MB
Decreased by 64.5%


Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day,weekday
0,0,0,0,0,7432,108.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
1,1,0,0,0,2720,104.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
2,2,0,0,0,5376,91.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
3,3,0,0,0,23685,102.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
4,4,0,0,0,116607,75.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
5,5,0,0,0,8000,100.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
6,6,0,0,4,27926,81.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
7,7,0,0,0,121074,89.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
8,7,1,0,0,121074,89.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6
9,8,0,0,0,60809,103.0,17.799999,4.0,11.7,0.282609,1021.400024,100.0,3.6,0,1,6


In [49]:
gc.collect()

35

# Predict on Test Set

In [50]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(test_processed, num_iteration=model.best_iteration)) / len(models)
    else:
        results += np.expm1(model.predict(test_processed, num_iteration=model.best_iteration)) / len(models)
    del model

In [51]:
submission = pd.DataFrame({'row_id': row_ids, 'meter_reading': np.clip(results, 0, a_max=None)})
del row_ids,results
gc.collect()

73

In [56]:
submission.tail()

Unnamed: 0,row_id,meter_reading
41697595,41697595,6.404761
41697596,41697596,2.450963
41697597,41697597,170.223892
41697598,41697598,4.501239
41697599,41697599,64.032252


In [57]:
submission.head()

Unnamed: 0,row_id,meter_reading
0,0,24.411938
1,1,4.187166
2,2,69.777427
3,3,229.345039
4,4,4.53094


In [52]:
submission.to_csv(r'C:\Users\syxsw\MLProjects\BuildingEnergy\ASHRAE---Great-Energy-Predictor-III-master\submission.csv', index=False)