# Model

After defining the features to use, a model will now be fitted to the data and feature selection will be performed based on performance against the validation dataset witheld earlier.

In [16]:
from lib.data.features import read_raw_data, add_all_features
from lib.common.paths import DATABASE_LOCATION

multi_location_windows = ["5min", "30min", "1h", "2h"]

df_train = add_all_features(read_raw_data(DATABASE_LOCATION, train=True), multi_location_windows)
df_valid = add_all_features(read_raw_data(DATABASE_LOCATION, valid=True), multi_location_windows)

# add a constant to use as an intercept
df_train["one"] = 1.0
df_valid["one"] = 1.0

In [17]:
import numpy as np

multi_room_features = [f"multiple_room_triggers_{window}_per_hour" for window in multi_location_windows]
event_rate_features = ["total_per_hour"]
bathroom_features = ["bathroom_proportion"]
all_features = multi_room_features + event_rate_features + bathroom_features

# add several noise variables to help guage overfitting
n_fake_features = len(all_features) // 2
fake_features = []
for i in range(n_fake_features):
    fake_feature_col = f"fake_{i}"
    df_train[fake_feature_col] = np.random.randn(df_train.shape[0])
    df_valid[fake_feature_col] = np.random.randn(df_valid.shape[0])
    fake_features.append(fake_feature_col)
total_features = all_features + fake_features
print(f"Using features: \n{total_features}")

Using features: 
['multiple_room_triggers_5min_per_hour', 'multiple_room_triggers_30min_per_hour', 'multiple_room_triggers_1h_per_hour', 'multiple_room_triggers_2h_per_hour', 'total_per_hour', 'bathroom_proportion', 'fake_0', 'fake_1', 'fake_2']


## Model Pipeline

Scikit-Learn's model pipeline provides a clean abstraction over the pre-processing, fitting and prediction stages of a relatively simple machine learning model. In addition, it is well integrated with ONNX via `skl2onnx` package.


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from lib.model.stepwise import StepwiseFeatureSelector

def valid_locator(df):
    return df["total_cumulative"] > 5

response = "multiple_occupancy"
X, y = df_train.loc[valid_locator(df_train), total_features].values.astype(np.float32), df_train.loc[valid_locator(df_train), response].values.astype(np.float32)

# Check the shape of features and labels
print("Shape of features:", X.shape)
print("Shape of labels:", y.shape)

# # Create the pipeline
# sub_pipeline = Pipeline([])

pipeline = Pipeline([
    ('feature_selector', StepwiseFeatureSelector(estimator=LogisticRegression(), scoring=accuracy_score)),
    ('classifier', LogisticRegression())
])
pipeline.fit(X, y)
pred_skl = pipeline.predict(X.astype(np.float64))
accuracy = roc_auc_score(y, pred_skl)

Shape of features: (286420, 9)
Shape of labels: (286420,)
Selected 0 with best score of 0.7595105090426646
Selected 3 with best score of 0.7595174917952657
Finishing as selection best score 0.7595174917952657 was not better than existing model 0.7595174917952657
Selected features: [0, 3]


NameError: name 'roc_auc_score' is not defined

In [14]:
from sklearn.metrics import accuracy_score

accuracy_score(y, pred_skl)

0.7594302073877522