In [1]:
from LGBM_classes_methods import *

All classes are imported from another file.

# Classifier

In [2]:
lgbm_reg_clf = CatSplitRegressor(LGBMWrapper(categorical_feature = categorical_columns), "had_air_temperature")

# Train Model

## Open Data, Get Train/Test Sets

In [3]:
c =CombinedAttributesAdder()

In [4]:
df = pd.read_pickle(r'C:\Users\lukep\Documents\big_data\ASHRAE\PROCESSED_TRAIN_DF.pkl')
# df = df[df['meter']==0] # Remove Later
df = c.transform(df)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['building_id']):
    strat_train_set = df.iloc[train_index]
    strat_test_set= df.iloc[test_index]

In [5]:
X_train = strat_train_set.drop(['meter_reading'], axis='columns')
y_train = strat_train_set['meter_reading'].apply(np.log1p) # use logarithm bc of cost function at the end

X_test = strat_test_set.drop(['meter_reading'], axis='columns')
y_test = strat_test_set['meter_reading'].apply(np.log1p) # use logarithm bc of cost function at the end

In [6]:
X_train.columns

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'surf_area',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'had_air_temperature', 'had_cloud_coverage',
       'had_dew_temperature', 'had_precip_depth_1_hr',
       'had_sea_level_pressure', 'had_wind_direction', 'had_wind_speed',
       'tm_day_of_week', 'tm_hour_of_day'],
      dtype='object')

Convert y_train and y_test to log space as this helps minimize error

## Train Linear Regression and LGBM Model

Fit LGBM model (note that this class takes in a DataFrame)

In [7]:
def np_sample(a, frac):
    return a if frac == 1 else np.random.choice(a, int(len(a) * frac), replace=False)

def make_8121_splits(X, sample_frac):
    np.random.seed(0)
    time_sorted_idx = np.argsort(X.timestamp.values, kind='stable')
    sections = np.array_split(time_sorted_idx, 12)
    folds = []
    for start_ix in range(0, 12, 2):
        val_idxs = np.concatenate(sections[start_ix:start_ix + 2])  # no modulo necessary
        train_idxs = np.concatenate(
            [sections[ix % 12] for ix in range(start_ix + 3, start_ix + 11)])
        folds.append((np_sample(train_idxs, sample_frac), np_sample(val_idxs, sample_frac)))
    return folds

def make_cv_predictions(model, split, X, y):
    preds = []
    for ix, (train_fold, val_fold) in enumerate(split):
        # Train
        Xt = X.iloc[train_fold]
        yt = y.reindex_like(Xt)
        # Validation
        Xv = X.iloc[val_fold]
        yv = y.reindex_like(Xv)
        # Fit Train
        model.fit(Xt, yt)
        # Make Predictions
        preds.append(pd.DataFrame(dict(target=yv, prediction=model.predict(Xv)), index=yv.index))
    result = pd.concat(preds).sort_index()
    return result.target, result.prediction

Create model.

In [8]:
model = CatSplitRegressor(LGBMWrapper(categorical_feature = categorical_columns), "had_air_temperature")

Split training set by time stamp. drop timestamps, then make cross val precitions.

In [9]:
splits = make_8121_splits(X_train, 0.2)
X_train = X_train.drop(columns="timestamp")  # Raw timestamp doesn't carry over to test data
sampled_y, sampled_prediction = make_cv_predictions(model, splits, X_train, y_train)

## Look at Predictions

In [10]:
sampled_y = np.expm1(sampled_y)
sampled_prediction = np.expm1(sampled_prediction)
rmsle = np.sqrt(mean_squared_log_error(sampled_y, np.clip(sampled_prediction, 0, None)))
print(rmsle)

1.3372759206617992


In [11]:
importances = pd.Series(model.feature_importances_, index=X_train.columns).rename("Importance")
display(importances.sort_values(ascending=False).to_frame().T)

Unnamed: 0,building_id,meter,air_temperature,square_feet,dew_temperature,tm_hour_of_day,site_id,year_built,sea_level_pressure,wind_direction,...,floor_count,surf_area,tm_day_of_week,had_precip_depth_1_hr,had_wind_direction,had_wind_speed,had_dew_temperature,had_cloud_coverage,had_air_temperature,had_sea_level_pressure
Importance,1217.0,437.0,250.5,236.5,185.0,125.5,91.0,72.0,64.0,61.0,...,35.5,34.0,25.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
importances

building_id               1217.0
meter                      437.0
site_id                     91.0
primary_use                 37.0
square_feet                236.5
year_built                  72.0
floor_count                 35.5
surf_area                   34.0
air_temperature            250.5
cloud_coverage              38.0
dew_temperature            185.0
precip_depth_1_hr           50.0
sea_level_pressure          64.0
wind_direction              61.0
wind_speed                  38.0
had_air_temperature          0.0
had_cloud_coverage           0.0
had_dew_temperature          0.0
had_precip_depth_1_hr        2.0
had_sea_level_pressure       0.0
had_wind_direction           1.0
had_wind_speed               0.0
tm_day_of_week              25.0
tm_hour_of_day             125.5
Name: Importance, dtype: float64

# Save Model

In [13]:
pkl_filename = "lgbm_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lgbm_reg_clf, file)

In [14]:
np.clip(sampled_prediction, 0, None)

0            77.219372
2             9.460569
5            11.210402
17           34.775734
18          358.453320
               ...    
20216078     46.297244
20216080     75.458812
20216085    575.919918
20216088    862.679601
20216098    258.818447
Name: prediction, Length: 3234576, dtype: float64

In [15]:
np.sqrt(mean_squared_log_error(sampled_y, sampled_prediction))

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [None]:
X_train.head()['site_id']