In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')

In [3]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
df_train = reduce_mem_usage(df_train,use_float16=True)

Memory usage of dataframe is 2638.86 MB
Memory usage after optimization is: 733.78 MB
Decreased by 72.2%


In [6]:
df_train['local_time'] = pd.to_datetime(df_train['local_time'])
df_train['month'] = df_train['local_time'].dt.month
df_train['group'] = df_train['month']
df_train['group'].replace((1,2,3,4,5,6), 1,inplace=True)
df_train['group'].replace((7,8,9,10,11,12), 2, inplace=True)
df_train['group'].value_counts()
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)

In [10]:
total_list = ['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']

In [11]:
X_half_1 = train_engineer.loc[train_engineer.group==1][total_list]
X_half_2 = train_engineer.loc[train_engineer.group==2][total_list]
y_half_1 = np.log1p(train_engineer.loc[train_engineer.group==1]['meter_reading'])
y_half_2 = np.log1p(train_engineer.loc[train_engineer.group==2]['meter_reading'])

In [12]:
categoricals = ["building_id", "site_id", "meter", "primary_use",  "weekend",'is_holiday']
d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categoricals, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categoricals, free_raw_data=False)
watchlist_1 = [d_half_2, d_half_1]
watchlist_2 = [d_half_1, d_half_2]

In [13]:
params1 = {
    "objective": "regression",
    'bagging_fraction':0.9,
    "boosting": "gbdt",
    "num_leaves": 3000,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "reg_lambda":3.0 ,
    'reg_alpha': 0.1 ,
    'max_depth':12,
    'min_child_weight':  20,
    'min_data_in_leaf':20,
    "metric": "rmse"
}
params2 = {
    "objective": "regression",
    'bagging_fraction': 0.1,
    "boosting": "gbdt",
    "num_leaves": 1000,
    "learning_rate": 0.05,
    "feature_fraction":  0.9,
    "reg_lambda": 0.1,
    'reg_alpha':3.0 ,
    'max_depth':12,
    'min_child_weight':3,
    'min_data_in_leaf':200,
    "metric": "rmse"
}

In [14]:
print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params1, train_set=d_half_1, num_boost_round=10000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params2, train_set=d_half_2, num_boost_round=10000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:




Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.598892	valid_0's rmse: 0.866394
[400]	training's rmse: 0.55618	valid_0's rmse: 0.867651
Early stopping, best iteration is:
[256]	training's rmse: 0.583824	valid_0's rmse: 0.86591
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.649583	valid_0's rmse: 0.836275
Early stopping, best iteration is:
[184]	training's rmse: 0.654797	valid_0's rmse: 0.836056


In [15]:
test_feature = pd.read_csv('../../Large_output/test_merge.csv')
test_feature = features_engineering(test_feature)
row_ids = test_feature[['row_id']]
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

In [16]:
models = [model_half_1,model_half_2]
from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test_feature.shape[0]/50000)))):
    res.append(sum(np.expm1([model.predict(test_feature.iloc[i:i+step_size]) for model in models])/len(models)))
    i+=step_size

100%|██████████| 834/834 [06:22<00:00,  2.18it/s]


In [17]:
res = np.concatenate(res)

In [18]:
test_feature['meter_reading']=np.clip(res, 0, a_max=None)
test_feature.loc[(test_feature['site_id']==0) & (test_feature['meter']==0),'meter_reading']\
=test_feature.loc[(test_feature['site_id']==0) & (test_feature['meter']==0),'meter_reading'].mul(3.4118)
df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})
df_result

Unnamed: 0,row_id,meter_reading
0,0,105.533187
1,1,68.027624
2,2,10.686327
3,3,188.630619
4,4,1102.553080
...,...,...
41697595,41697595,6.004035
41697596,41697596,4.176343
41697597,41697597,7.992368
41697598,41697598,178.120886


In [19]:
df_result.to_csv('../../Large_output/lgb_half_clean.csv',index = False)
# 1.089