In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook as tqdm
import datetime
from meteocalc import feels_like, Temp
from sklearn import metrics
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of ``df`` in place to the smallest dtype that holds their values.

    Rules applied per column:

    * datetime and categorical columns are left untouched;
    * ``object`` columns are converted to ``category``;
    * signed / unsigned integer columns are narrowed to the smallest
      int8..int64 / uint8..uint64 type whose range (bounds included) contains
      the column's min and max;
    * float columns become float16 (only when ``use_float16`` is true and the
      values fit), otherwise float32 when the values fit, otherwise float64;
    * every other dtype (bool, timedelta, complex, extension dtypes) is left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target type. float16 keeps only about 3 significant
        decimal digits, so enable it only when that precision loss is acceptable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate types ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        # Only plain numpy integer/float columns are downcast. Bool, timedelta,
        # complex and extension dtypes would be corrupted (or raise) if they
        # were pushed through the float branch, so they are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: e.g. a maximum of exactly 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column); the
            # column is then left as it is.
        else:
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            # NaN or infinite min/max fail every comparison below and therefore
            # fall through to float64.
            if use_float16 and half.min <= c_min and c_max <= half.max:
                df[col] = df[col].astype(np.float16)
            elif single.min <= c_min and c_max <= single.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df

In [3]:
# Load the training table and map meter_reading through expm1.
# NOTE(review): this presumably undoes a log1p applied when the CSV was written — confirm.
train_df = pd.read_csv('../Large_output/train_1.074.csv')
train_df['meter_reading'] = np.expm1(train_df["meter_reading"])

# Rows of site 0 with meter 0 are rescaled by 0.2931.
# NOTE(review): looks like a unit conversion for that site — confirm the factor.
site0_meter0_rows = (train_df['site_id'] == 0) & (train_df['meter'] == 0)
train_df.loc[site0_meter0_rows, 'meter_reading'] = (
    train_df.loc[site0_meter0_rows, 'meter_reading'].mul(0.2931)
)

In [8]:
# Columns fed to the model, in the order the model will see them.
feature_columns = [
    'building_id',
    'site_id',
    'primary_use',
    'meter',
    'dayofweek',
    'square_feet',
    'year_built',
    'floor_count',
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
    'relative_humidity',
    'feels_like',
    'hour',
]

# The regression target is log1p(meter_reading).
target = np.log1p(train_df["meter_reading"])
features = train_df[feature_columns]

In [10]:
# Feature columns passed to the model as categorical (see `cat_features` in the fit call).
categorical_features = [
    'building_id',
    'site_id',
    'primary_use',
    'meter',
    'dayofweek',
]

In [11]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [12]:
# Keyword arguments forwarded to CatBoostRegressor(**params).
params = {'n_estimators': 3000,
          'learning_rate':0.05,
          'depth':12,
          'eval_metric': 'RMSE',
          'loss_function': 'RMSE',
          'early_stopping_rounds' : 50,
          'random_state':42,
          'metric_period': 100,
          'task_type': 'GPU',
          'boosting_type': 'Plain',
          'gpu_cat_features_storage': 'CpuPinnedMemory'
          }
# Replace missing values with the sentinel -999.
# `features` was sliced out of `train_df`, so an in-place fillna raises pandas'
# SettingWithCopyWarning; rebinding the returned frame gives an independent
# DataFrame and also keeps later column assignments warning-free.
features = features.fillna(-999)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


In [13]:
# Downcast the feature columns (float16 allowed) and keep the returned frame.
features = reduce_mem_usage(df=features, use_float16=True)

Memory usage of dataframe is 2640.82 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Memory usage after optimization is: 568.51 MB
Decreased by 78.5%


In [14]:
# NOTE: the 5-fold run was not completed inside this notebook; the full run was
# executed by another service and the notebook was not saved afterwards.
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
NFOLDS = 5
columns = features.columns
# Folds are stratified on the month column and not shuffled, so the split is
# deterministic. scikit-learn >= 0.24 raises a ValueError when `random_state`
# is passed together with shuffle=False, so no seed is given here.
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=False)
splits = kf.split(features, train_df['month'])
y_oof = np.zeros(features.shape[0])
score = 0

# Per-fold frames are collected in lists and concatenated once after the loop
# instead of growing a DataFrame inside the loop.
oof_frames = []
importance_frames = []
models = []

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_tr = features.iloc[train_index]
    y_tr = target.iloc[train_index]
    X_val = features.iloc[valid_index]
    y_val = target.iloc[valid_index]

    model = CatBoostRegressor(**params)
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), cat_features=categorical_features, verbose=True)

    y_pred_valid = model.predict(X_val)
    y_oof[valid_index] = y_pred_valid

    # RMSE on the log1p-transformed target, computed once and reused.
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_pred_valid))
    print(f"Fold {fold_n + 1} | rmse: {fold_rmse}")
    score += fold_rmse / NFOLDS

    # Out-of-fold predictions keyed by the positional row index in `features`.
    oof_frames.append(pd.DataFrame({
        'train_index': valid_index,
        'TARGET': y_pred_valid,
        'folder': fold_n + 1,
    }))

    importance_frames.append(pd.DataFrame({
        'feature': columns,
        'importance': model.get_feature_importance(),
    }))

    models.append(model)
    # Release the fold slices before the next fold is materialised.
    del X_val, X_tr, y_val, y_tr
    gc.collect()

out_folder_train_prediction = pd.concat(oof_frames, axis=0)
feature_importance_df = pd.concat(importance_frames, axis=0)

print(f"\nMean rmse = {score}")
print(f"Out of folds rmse = {np.sqrt(mean_squared_error(target, y_oof))}")
out_folder_train_prediction.to_csv('out_folder_train_prediction_cat_1.074.csv',index= False)

0:	learn: 1.9036093	test: 1.8990542	best: 1.8990542 (0)	total: 1.89s	remaining: 1h 34m 20s
100:	learn: 0.9049218	test: 0.8915325	best: 0.8915325 (100)	total: 2m 57s	remaining: 1h 24m 52s
200:	learn: 0.8375390	test: 0.8283873	best: 0.8283873 (200)	total: 6m 4s	remaining: 1h 24m 33s
300:	learn: 0.7997424	test: 0.7980601	best: 0.7980601 (300)	total: 9m 19s	remaining: 1h 23m 35s
400:	learn: 0.7741627	test: 0.7801037	best: 0.7801037 (400)	total: 12m 39s	remaining: 1h 22m 1s
500:	learn: 0.7557908	test: 0.7699491	best: 0.7699491 (500)	total: 15m 55s	remaining: 1h 19m 25s
600:	learn: 0.7407948	test: 0.7629022	best: 0.7629022 (600)	total: 19m 9s	remaining: 1h 16m 28s
700:	learn: 0.7289020	test: 0.7569599	best: 0.7569599 (700)	total: 22m 24s	remaining: 1h 13m 30s
800:	learn: 0.7191050	test: 0.7520300	best: 0.7520300 (800)	total: 25m 40s	remaining: 1h 10m 29s
900:	learn: 0.7102967	test: 0.7483493	best: 0.7483493 (900)	total: 28m 54s	remaining: 1h 7m 20s
1000:	learn: 0.7028909	test: 0.7452059	best

KeyboardInterrupt: 

In [None]:
test_df = pd.read_csv('../Large_output/test_1.074.csv')

# Same columns, in the same order, as the training feature matrix.
test_feature_columns = [
    'building_id',
    'site_id',
    'primary_use',
    'meter',
    'dayofweek',
    'square_feet',
    'year_built',
    'floor_count',
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
    'relative_humidity',
    'feels_like',
    'hour',
]

# Fill missing values with the same -999 sentinel used for the training features.
# Rebinding the frame returned by fillna (instead of inplace=True on a slice of
# `test_df`) avoids pandas' SettingWithCopyWarning here and in reduce_mem_usage.
X_test = test_df[test_feature_columns].fillna(-999)
X_test = reduce_mem_usage(X_test, use_float16=True)

In [None]:
# Predict the test set in batches (120 by default) because of CPU limitations.
from tqdm import tqdm_notebook as tqdm
def predictions(models, iterations = 120):
    """
    Predict meter readings for the test set and write the submission file.

    The prepared feature matrix ``X_test`` (same columns, -999 fill and dtype
    reduction as the training features) is scored in batches. For each batch
    the ``expm1`` of every model's prediction is averaged across models.
    The result is clipped at 0, stored in ``test_df['meter_reading']`` and
    written to '../Large_output/cat_new_csv_nomonth.csv'.

    Rows with site_id == 0 and meter == 0 are multiplied by 3.4118, the
    inverse of the 0.2931 factor applied to those rows of the training target
    (0.2931 * 3.4118 is approximately 1).

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``predict``; must not be empty.
    iterations : int, default 120
        Number of batches the test set is split into. When the row count is
        not divisible by ``iterations`` the batch size is rounded up, so every
        row is covered and slightly fewer batches may be needed.

    Raises
    ------
    ValueError
        If ``models`` is empty, ``iterations`` is smaller than 1, or ``X_test``
        and ``test_df`` have different lengths.
    """
    if len(models) == 0:
        raise ValueError("predictions() needs at least one trained model")
    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    set_size = len(test_df)
    if len(X_test) != set_size:
        raise ValueError("X_test and test_df have different lengths; re-run the test preparation cell")

    # Ceiling division so the final, possibly shorter, batch is not dropped.
    batch_size = max(1, -(-set_size // iterations))
    meter_reading = np.empty(set_size, dtype=np.float64)
    for pos in tqdm(range(0, set_size, batch_size)):
        batch = X_test.iloc[pos : pos+batch_size]
        fold_preds = [np.expm1(model.predict(batch)) for model in models]
        meter_reading[pos : pos+batch_size] = np.mean(fold_preds, axis=0)

    print(len(meter_reading))
    test_df['meter_reading'] = np.clip(meter_reading, 0, a_max=None)

    site0_meter0_rows = (test_df['site_id']==0) & (test_df['meter']==0)
    test_df.loc[site0_meter0_rows, 'meter_reading'] = test_df.loc[site0_meter0_rows, 'meter_reading'].mul(3.4118)

    submission = pd.read_csv('../Resources/sample_submission.csv')
    # Positional assignment: a row-count mismatch raises instead of silently
    # producing NaN through index alignment.
    submission['meter_reading'] = test_df['meter_reading'].values
    submission.to_csv('../Large_output/cat_new_csv_nomonth.csv', index=False)
    print('We are done!')
predictions(models)