In [1]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
# first data of groupby fill weather missing merged
df_train= pd.read_csv('../../Large_output/train_merge.csv')

In [3]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
df_train = reduce_mem_usage(df_train,use_float16=True)

Memory usage of dataframe is 2271.93 MB
Memory usage after optimization is: 568.74 MB
Decreased by 75.0%


In [7]:
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)
target = np.log1p(train_engineer["meter_reading"])
features = train_engineer[['building_id', 'meter','site_id','primary_use', 
                          'square_feet','air_temperature','cloud_coverage',
                          'dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]
del df_train

In [8]:
categorical_features = ["building_id", "site_id", "meter", 
                        "primary_use",  "weekend",'is_holiday']

In [9]:
params = {'n_estimators': 2000,
          'learning_rate':0.05,
          'depth':12,
          'eval_metric': 'RMSE',
          'loss_function': 'RMSE',
          'early_stopping_rounds' : 50,
          'random_state':42,
          'metric_period': 100,}

In [11]:
NFOLDS = 3
columns = features.columns
kf = KFold(n_splits=3)
splits = kf.split(features, target)
y_oof = np.zeros(features.shape[0])
score = 0
out_folder_train_prediction= pd.DataFrame()
feature_importance_df = pd.DataFrame()

models = []

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_tr=features.iloc[train_index]
    y_tr=target.iloc[train_index]
    X_val=features.iloc[valid_index]
    y_val=target.iloc[valid_index]
    
    model = CatBoostRegressor(**params)
        
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), cat_features=categorical_features, verbose=True)

    y_pred_valid = model.predict(X_val)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_val, y_pred_valid))}")
    
    score += np.sqrt(mean_squared_error(y_val, y_pred_valid)) / NFOLDS
    
    oof_preds=pd.DataFrame()
    oof_preds['train_index']=valid_index
    oof_preds['TARGET']= y_pred_valid
    oof_preds["folder"]=fold_n + 1
    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)
    
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature']=columns
    fold_importance_df['importance']=model.get_feature_importance()
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    models.append(model)
    del X_val,X_tr,y_val,y_tr
    gc.collect()
    
print(f"\nMean rmse = {score}")
print(f"Out of folds rmse = {np.sqrt(mean_squared_error(target, y_oof))}")
out_folder_train_prediction.to_csv('out_folder_train_prediction_cat.csv',index= False)



0:	learn: 2.0197523	test: 2.0568292	best: 2.0568292 (0)	total: 7.46s	remaining: 4h 8m 39s
100:	learn: 1.0631220	test: 1.2024373	best: 1.2024373 (100)	total: 10m 58s	remaining: 3h 26m 16s
200:	learn: 0.9971614	test: 1.1719861	best: 1.1719861 (200)	total: 22m 52s	remaining: 3h 24m 42s
300:	learn: 0.9585480	test: 1.1571542	best: 1.1571542 (300)	total: 34m 30s	remaining: 3h 14m 46s
400:	learn: 0.9306938	test: 1.1495907	best: 1.1495907 (400)	total: 46m 4s	remaining: 3h 3m 45s
500:	learn: 0.9091359	test: 1.1437987	best: 1.1436488 (498)	total: 57m 56s	remaining: 2h 53m 21s
600:	learn: 0.8928639	test: 1.1409205	best: 1.1409205 (600)	total: 1h 9m 48s	remaining: 2h 42m 30s
700:	learn: 0.8811650	test: 1.1384600	best: 1.1383988 (691)	total: 1h 21m 33s	remaining: 2h 31m 8s
800:	learn: 0.8700997	test: 1.1364594	best: 1.1364406 (799)	total: 1h 33m 38s	remaining: 2h 20m 9s
900:	learn: 0.8603185	test: 1.1353025	best: 1.1353025 (900)	total: 1h 45m 40s	remaining: 2h 8m 53s
1000:	learn: 0.8518508	test: 1.



0:	learn: 2.0422157	test: 2.0181382	best: 2.0181382 (0)	total: 6.86s	remaining: 3h 48m 28s
100:	learn: 1.0744251	test: 1.2112417	best: 1.2112417 (100)	total: 11m	remaining: 3h 27m 4s
200:	learn: 1.0138663	test: 1.1410900	best: 1.1410900 (200)	total: 22m 54s	remaining: 3h 24m 57s
300:	learn: 0.9761728	test: 1.1121296	best: 1.1121296 (300)	total: 34m 40s	remaining: 3h 15m 43s
400:	learn: 0.9511017	test: 1.0948035	best: 1.0948035 (400)	total: 46m 43s	remaining: 3h 6m 20s
500:	learn: 0.9329604	test: 1.0856865	best: 1.0856865 (500)	total: 58m 45s	remaining: 2h 55m 47s
600:	learn: 0.9184059	test: 1.0791304	best: 1.0791304 (600)	total: 1h 10m 52s	remaining: 2h 44m 59s
700:	learn: 0.9073367	test: 1.0737496	best: 1.0737496 (700)	total: 1h 23m	remaining: 2h 33m 48s
800:	learn: 0.8976513	test: 1.0693314	best: 1.0693314 (800)	total: 1h 35m 5s	remaining: 2h 22m 21s
900:	learn: 0.8891734	test: 1.0677035	best: 1.0672219 (884)	total: 1h 47m 31s	remaining: 2h 11m 9s
1000:	learn: 0.8817576	test: 1.06502



0:	learn: 2.0336521	test: 2.0309568	best: 2.0309568 (0)	total: 6.67s	remaining: 3h 42m 18s
100:	learn: 1.0377871	test: 1.2119426	best: 1.2119426 (100)	total: 10m 56s	remaining: 3h 25m 40s
200:	learn: 0.9587146	test: 1.1904469	best: 1.1904081 (199)	total: 22m 45s	remaining: 3h 23m 44s
300:	learn: 0.9146566	test: 1.1796038	best: 1.1796038 (300)	total: 34m 32s	remaining: 3h 15m
400:	learn: 0.8855939	test: 1.1717177	best: 1.1717177 (400)	total: 46m 22s	remaining: 3h 4m 54s
500:	learn: 0.8648077	test: 1.1684259	best: 1.1684132 (496)	total: 58m 14s	remaining: 2h 54m 14s
600:	learn: 0.8491754	test: 1.1659104	best: 1.1657589 (598)	total: 1h 9m 49s	remaining: 2h 42m 31s
700:	learn: 0.8342476	test: 1.1642697	best: 1.1641529 (699)	total: 1h 21m 43s	remaining: 2h 31m 27s
800:	learn: 0.8231683	test: 1.1638116	best: 1.1634155 (765)	total: 1h 33m 41s	remaining: 2h 20m 14s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.163415455
bestIteration = 765

Shrink model to first 766 itera

In [12]:
test_feature = pd.read_csv('../../Large_output/test_merge.csv')
test_feature = reduce_mem_usage(test_feature)
test_feature = features_engineering(test_feature)
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]
row_ids = test_feature[['row_id']]
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

Memory usage of dataframe is 4771.91 MB
Memory usage after optimization is: 1671.69 MB
Decreased by 65.0%


In [13]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(test_feature)) / len(models)
    else:
        results += np.expm1(model.predict(test_feature)) / len(models)
    del model
    gc.collect()

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
test_feature['meter_reading']=np.clip(results, 0, a_max=None)
test_feature.loc[(test_feature['site_id']==0) & 
                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &
                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)
df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})
df_result.to_csv('../../Large_output/cat_init_kf3.csv',index = False)