In [1]:
from LGBM_classes_methods import *

All classes are imported from another file.

# Classifier

In [2]:
lgbm_reg_clf = CatSplitRegressor(LGBMWrapper(categorical_feature = categorical_columns), "had_air_temperature")

# Train Model

## Open Data, Get Train/Test Sets

In [3]:
c =CombinedAttributesAdder()

In [4]:
df = pd.read_pickle(r'C:\Users\lukep\Documents\big_data\ASHRAE\PROCESSED_TRAIN_DF.pkl')
# df = df[df['meter']==0] # Remove Later
df = c.transform(df)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['building_id']):
    strat_train_set = df.iloc[train_index]
    strat_test_set= df.iloc[test_index]

In [5]:
X_train = strat_train_set.drop(['meter_reading'], axis='columns')
y_train = strat_train_set['meter_reading'].apply(np.log1p) # use logarithm bc of cost function at the end

X_test = strat_test_set.drop(['meter_reading'], axis='columns')
y_test = strat_test_set['meter_reading'].apply(np.log1p) # use logarithm bc of cost function at the end

In [6]:
X_train.columns

Index(['building_id', 'meter', 'timestamp', 'site_id', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed',
       'had_air_temperature', 'had_cloud_coverage', 'had_dew_temperature',
       'had_precip_depth_1_hr', 'had_sea_level_pressure', 'had_wind_direction',
       'had_wind_speed', 'tm_day_of_week', 'tm_hour_of_day'],
      dtype='object')

Convert y_train and y_test to log space as this helps minimize error

In [7]:
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

## Train Linear Regression and LGBM Model

Fit LGBM model (note that this class takes in a DataFrame)

In [8]:
def np_sample(a, frac):
    return a if frac == 1 else np.random.choice(a, int(len(a) * frac), replace=False)

def make_8121_splits(X, sample_frac):
    np.random.seed(0)
    time_sorted_idx = np.argsort(X.timestamp.values, kind='stable')
    sections = np.array_split(time_sorted_idx, 12)
    folds = []
    for start_ix in range(0, 12, 2):
        val_idxs = np.concatenate(sections[start_ix:start_ix + 2])  # no modulo necessary
        train_idxs = np.concatenate(
            [sections[ix % 12] for ix in range(start_ix + 3, start_ix + 11)])
        folds.append((np_sample(train_idxs, sample_frac), np_sample(val_idxs, sample_frac)))
    return folds

def make_cv_predictions(model, split, X, y):
    preds = []
    for ix, (train_fold, val_fold) in enumerate(split):
        # Train
        Xt = X.iloc[train_fold]
        yt = y.reindex_like(Xt)
        # Validation
        Xv = X.iloc[val_fold]
        yv = y.reindex_like(Xv)
        # Fit Train
        model.fit(Xt, yt)
        # Make Predictions
        preds.append(pd.DataFrame(dict(target=yv, prediction=model.predict(Xv)), index=yv.index))
    result = pd.concat(preds).sort_index()
    return result.target, result.prediction

Create model.

In [9]:
model = CatSplitRegressor(LGBMWrapper(categorical_feature = categorical_columns), "meter")

Split training set by time stamp. drop timestamps, then make cross val precitions.

In [10]:
splits = make_8121_splits(X_train, 0.2)
X_train = X_train.drop(columns="timestamp")  # Raw timestamp doesn't carry over to test data
sampled_y, sampled_prediction = make_cv_predictions(model, splits, X_train, y_train)

## Look at Predictions

In [17]:
sampled_y = np.expm1(sampled_y)
sampled_prediction = np.expm1(sampled_prediction)
rmsle = np.sqrt(mean_squared_log_error(sampled_y, np.clip(sampled_prediction, 0, None)))
print(rmsle)

0.42364902072234817


In [12]:
importances = pd.Series(model.feature_importances_, index=X_train.columns).rename("Importance")
display(importances.sort_values(ascending=False).to_frame().T)

Unnamed: 0,building_id,air_temperature,dew_temperature,tm_hour_of_day,sea_level_pressure,wind_direction,tm_day_of_week,square_feet,cloud_coverage,wind_speed,...,floor_count,had_cloud_coverage,primary_use,had_precip_depth_1_hr,had_wind_speed,had_sea_level_pressure,had_dew_temperature,had_wind_direction,had_air_temperature,meter
Importance,1372.0,365.5,295.25,273.5,171.5,93.5,87.75,83.75,68.0,51.25,...,12.0,9.75,8.5,1.0,0.75,0.5,0.25,0.25,0.0,0.0


In [16]:
importances

building_id               1372.00
meter                        0.00
site_id                     47.25
primary_use                  8.50
square_feet                 83.75
year_built                  22.00
floor_count                 12.00
air_temperature            365.50
cloud_coverage              68.00
dew_temperature            295.25
precip_depth_1_hr           35.75
sea_level_pressure         171.50
wind_direction              93.50
wind_speed                  51.25
had_air_temperature          0.00
had_cloud_coverage           9.75
had_dew_temperature          0.25
had_precip_depth_1_hr        1.00
had_sea_level_pressure       0.50
had_wind_direction           0.25
had_wind_speed               0.75
tm_day_of_week              87.75
tm_hour_of_day             273.50
Name: Importance, dtype: float64

# Save Model

In [13]:
pkl_filename = "lgbm_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lgbm_reg_clf, file)

In [14]:
np.clip(sampled_prediction, 0, None)

0           1.167155
2           0.730165
5           0.560158
17          1.051912
18          1.293673
              ...   
20216078    1.521131
20216080    1.483939
20216085    1.909814
20216088    2.084989
20216098    1.823721
Name: prediction, Length: 3234576, dtype: float64

In [15]:
np.sqrt(mean_squared_log_error(sampled_y, sampled_prediction))

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.