In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [21]:
df_train= pd.read_csv('../../Large_output/train_merge.csv')

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [6]:
df_train = reduce_mem_usage(df_train,use_float16=True)
df_train.loc[:,'timestamp1']=df_train.timestamp

Memory usage of dataframe is 2271.93 MB
Memory usage after optimization is: 568.74 MB
Decreased by 75.0%


In [29]:
df_train.loc[:,'timestamp']=df_train.timestamp1
df_train.timestamp=pd.to_datetime(df_train.timestamp, format='%Y-%m-%d %H:%M:%S')
df_train.timestamp = (df_train.timestamp - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600

In [31]:
def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):
    """Helper routine for 'find_bad_zeros'.
    
    This operates upon a single dataframe produced by 'groupby'. We expect an 
    additional column 'meter_id' which is a duplicate of 'meter' because groupby 
    eliminates the original one."""
    meter = Xy_subset.meter_id.iloc[0]
    is_zero = Xy_subset.meter_reading == 0
    if meter == 0:
        # Electrical meters should never be zero. Keep all zero-readings in this table so that
        # they will all be dropped in the train set.
        return is_zero

    transitions = (is_zero != is_zero.shift(1))
    all_sequence_ids = transitions.cumsum()
    ids = all_sequence_ids[is_zero].rename("ids")
    if meter in [2, 3]:
        # It's normal for steam and hotwater to be turned off during the summer
        keep = set(ids[(Xy_subset.timestamp < summer_start) |
                       (Xy_subset.timestamp > summer_end)].unique())
        is_bad = ids.isin(keep) & (ids.map(ids.value_counts()) >= min_interval)
    elif meter == 1:
        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index("timestamp").ids
        is_bad = ids.map(ids.value_counts()) >= min_interval

        # Cold water may be turned off during the winter
        jan_id = time_ids.get(0, False)
        dec_id = time_ids.get(8283, False)
        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and
                dec_id == time_ids.get(8783, False)):
            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))
    else:
        raise Exception(f"Unexpected meter type: {meter}")

    result = is_zero.copy()
    result.update(is_bad)
    return result

In [5]:
df_train_clean = pd.read_csv('../../Large_output/train_clean_merge.csv')

In [6]:
df_train_clean = reduce_mem_usage(df_train_clean,use_float16=True)
train_engineer = features_engineering(df_train_clean)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)
target = np.log1p(train_engineer["meter_reading"])
features = train_engineer[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

Memory usage of dataframe is 2638.86 MB
Memory usage after optimization is: 733.78 MB
Decreased by 72.2%


In [10]:
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1803,
    "learning_rate": 0.05,
    "colsample_bytree": 0.9,
    "reg_lambda": 0.1,
    'reg_alpha':0.1,
    "metric": "rmse",
    'max_bins':600,
    'max_depth':-1,
    'min_child_sample':20,
    'min_child_weight':30,
    'min_split_gain':0.1,
    'subsample': 0.9,
    'subsample_freq':1,
     
}

In [11]:
categorical_features = ["building_id", "site_id", "meter", 
                        "primary_use",  "weekend",'is_holiday']

In [12]:
features.isna().sum()

building_id          0
meter                0
site_id              0
primary_use          0
square_feet          0
air_temperature      0
cloud_coverage       0
dew_temperature      0
precip_depth_1_hr    0
hour                 0
weekend              0
is_holiday           0
dtype: int64

In [13]:
kf = KFold(n_splits=3)
models = []
for train_index,test_index in kf.split(features):
    train_features = features.loc[train_index]
    train_target = target.loc[train_index]
    
    test_features = features.loc[test_index]
    test_target = target.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.923136	valid_1's rmse: 0.999472
[50]	training's rmse: 0.72767	valid_1's rmse: 0.860393
[75]	training's rmse: 0.675437	valid_1's rmse: 0.840509
[100]	training's rmse: 0.646649	valid_1's rmse: 0.836474
[125]	training's rmse: 0.626953	valid_1's rmse: 0.836071
[150]	training's rmse: 0.611737	valid_1's rmse: 0.837473
Early stopping, best iteration is:
[116]	training's rmse: 0.633365	valid_1's rmse: 0.835784
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.920024	valid_1's rmse: 1.0214
[50]	training's rmse: 0.729434	valid_1's rmse: 0.859795
[75]	training's rmse: 0.679488	valid_1's rmse: 0.824596
[100]	training's rmse: 0.653321	valid_1's rmse: 0.811392
[125]	training's rmse: 0.63395	valid_1's rmse: 0.80658
[150]	training's rmse: 0.618901	valid_1's rmse: 0.805244
[175]	training's rmse: 0.609641	valid_1's rmse: 0.80511
[200]	training's rmse: 0.602338	valid_1's rmse: 0.805402


In [14]:
test_feature = pd.read_csv('../../Large_output/test_merge.csv')

In [15]:
test_feature = features_engineering(test_feature)
row_ids = test_feature[['row_id']]
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]

In [16]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(test_feature, num_iteration=model.best_iteration)) / len(models)
    else:
        results += np.expm1(model.predict(test_feature, num_iteration=model.best_iteration)) / len(models)
    del model
    gc.collect()

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
test_feature['meter_reading']=np.clip(results, 0, a_max=None)

In [18]:
test_feature.loc[(test_feature['site_id']==0) & 
                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &
                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)
df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})
df_result

Unnamed: 0,row_id,meter_reading
0,0,134.285627
1,1,69.056423
2,2,10.880237
3,3,251.449750
4,4,1112.319660
...,...,...
41697595,41697595,6.082773
41697596,41697596,4.195885
41697597,41697597,6.979827
41697598,41697598,181.797817


In [19]:
df_result.to_csv('../../Large_output/lgb3_cleaned.csv', index = False)
# 1.081