# Setup

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

Raw data retrieved from https://www.kaggle.com/datasets/pooriamst/occupancy-detection?resource=download.

# Data Processing

In [11]:
# # Read the training and testing data
# training = pd.read_csv("datatrain.csv")
# testing = pd.read_csv("datatest.csv")

# # Combine the training and testing data into one dataframe
# combined_data = pd.concat([training, testing], ignore_index=True)

# # Shuffle the dataset randomly 
# np.random.seed(24)
# shuffled_data = combined_data.sample(frac=1).reset_index(drop=True)

# # Split by the 2:8 ratio (training is bigger)
# train_ratio = 0.8
# train_size = int(train_ratio * len(shuffled_data))

# # split 
# new_train = shuffled_data[:train_size]
# new_test = shuffled_data[train_size:]

# # Write to their respective files
# new_train.to_csv("new_train.csv", index=False)
# new_test.to_csv("new_test.csv", index=False)

In [12]:
def split_date_and_time(df):
    df['date'] = pd.to_datetime(df['date'])
    df['day'] = df['date'].dt.strftime('%Y%m%d').astype(float)
    df['time'] = df['date'].dt.hour + df['date'].dt.minute / 60 + df['date'].dt.second / 3600
    return df

# use manually split mixed data from across 2 weeks
training_df = pd.read_csv("new_train.csv")
testing_df = pd.read_csv("new_test.csv")

train_features = split_date_and_time(training_df).drop(columns=["Occupancy", "id", "date"])
train_labels = training_df["Occupancy"]

testing_features = split_date_and_time(testing_df).drop(columns=["Occupancy", "id", "date"])
testing_labels = testing_df["Occupancy"]

X = train_features
# print(X.head())
Y = train_labels
# print(Y.head())

Z = testing_features
# print(Z.head())
Z_labels = testing_labels
# print(Z_labels.head())

  df['date'] = pd.to_datetime(df['date'])
  df['date'] = pd.to_datetime(df['date'])


# ML Models

### Setup

In [13]:
model_accuracy = {}
model_precision = {}
model_recall = {}
model_f1 = {}

In [14]:
def return_metrics(labels, predictions, model_name):

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, zero_division=0)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    model_accuracy[model_name] = accuracy
    model_precision[model_name] = precision
    model_recall[model_name] = recall
    model_f1[model_name] = f1
    print(f"{model_name} accuracy: ", accuracy)
    print(f"{model_name} precision: ", precision)
    print(f"{model_name} recall: ", recall)
    print(f"{model_name} f1: ", f1)

### Decision Tree

In [15]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt = dt.fit(X, Y) # insert correct training data paths (X is training features, Y is training labels)
predictions = dt.predict(Z) # insert correct testing data paths (Z is testing features)

return_metrics(Z_labels, predictions, "Decision Tree") # compare performance against true testing labels

Decision Tree accuracy:  0.9934338521400778
Decision Tree precision:  0.987667009249743
Decision Tree recall:  0.9846311475409836
Decision Tree f1:  0.9861467419189328


### Ridge

In [16]:
from sklearn import linear_model
rc = linear_model.RidgeClassifier()
rc.fit(X, Y)
predictions = rc.predict(Z)

return_metrics(Z_labels, predictions, "Ridge")

Ridge accuracy:  0.9890564202334631
Ridge precision:  0.9586206896551724
Ridge recall:  0.9969262295081968
Ridge f1:  0.9773982923154194


### Support Vector

In [17]:
from sklearn import svm
svc = svm.SVC()
svc.fit(X, Y)
predictions = svc.predict(Z)

return_metrics(Z_labels, predictions, "Support Vector")


# view breakdowns to evaluate model performance
print("\nClass Distribution in True Labels:")
print(pd.Series(Z_labels).value_counts())  # count occurrences of each class in true labels

print("\nClass Distribution in Predictions:")
print(pd.Series(predictions).value_counts())  # count occurrences of each class in predictions

decision_function = svc.decision_function(Z)  # get decision function values
print("\nDecision Function Values:")
print(decision_function)

Support Vector accuracy:  0.7626459143968871
Support Vector precision:  0.0
Support Vector recall:  0.0
Support Vector f1:  0.0

Class Distribution in True Labels:
Occupancy
0    3136
1     976
Name: count, dtype: int64

Class Distribution in Predictions:
0    4112
Name: count, dtype: int64

Decision Function Values:
[-1.0000058  -1.00000441 -1.0000051  ... -1.00000572 -1.0000056
 -1.00000483]


### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X, Y)
predictions = clf.predict(Z)

return_metrics(Z_labels, predictions, "Random Forest")

Random Forest accuracy:  0.9929474708171206
Random Forest precision:  0.984646878198567
Random Forest recall:  0.985655737704918
Random Forest f1:  0.9851510496671787


### Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

est = GradientBoostingClassifier()
est = est.fit(X, Y) 
predictions = est.predict(Z)

return_metrics(Z_labels, predictions, "Gradient Boosting")

Gradient Boosting accuracy:  0.9922178988326849
Gradient Boosting precision:  0.9777327935222672
Gradient Boosting recall:  0.9897540983606558
Gradient Boosting f1:  0.9837067209775967


### Simple Classifier Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier

# Create a voting classifier with our simple models
simple_ensemble = VotingClassifier(
    estimators=[
        ('dt', dt),
        ('rc', rc),
        ('svc', svc)
    ],
    voting='hard'
)

simple_ensemble.fit(X, Y)
ensemble_preds = simple_ensemble.predict(Z)
return_metrics(Z_labels, ensemble_preds, "Simple Voting Ensemble")


Simple Voting Ensemble accuracy:  0.9931906614785992
Simple Voting Ensemble precision:  0.9866529774127311
Simple Voting Ensemble recall:  0.9846311475409836
Simple Voting Ensemble f1:  0.9856410256410256


# Parameter Tuning

In [22]:
# optimized return_metrics function for the parameter tuning
def return_metrics(labels, predictions, model_name):
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, zero_division=0)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    model_accuracy[model_name] = accuracy
    model_precision[model_name] = precision
    model_recall[model_name] = recall
    model_f1[model_name] = f1

    print(f"{model_name} accuracy: {accuracy:.4f}")
    print(f"{model_name} precision: {precision:.4f}")
    print(f"{model_name} recall: {recall:.4f}")
    print(f"{model_name} f1: {f1:.4f}")
    
    # return model_accuracy, model_precision, model_recall

# function to help run everything
def run(func, *args, **kwargs):
    result = func(*args, **kwargs)
    return result

### Decision Tree Tuning

In [23]:
# Random Search for Decision Tree
dt_param_dist = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_random = run(
    RandomizedSearchCV,
    tree.DecisionTreeClassifier(random_state=42),
    param_distributions=dt_param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    random_state=42
)

dt_random = run(dt_random.fit, X, Y)
print("Best parameters found by Random Search:")
print(dt_random.best_params_)

best_dt = dt_random.best_estimator_
dt_predictions = best_dt.predict(Z)
return_metrics(Z_labels, dt_predictions, "Optimized Decision Tree")

Best parameters found by Random Search:
{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'criterion': 'gini'}
Optimized Decision Tree accuracy: 0.9934
Optimized Decision Tree precision: 0.9877
Optimized Decision Tree recall: 0.9846
Optimized Decision Tree f1: 0.9861


### Ridge Tuning

In [24]:
# Random Search for Ridge Classifier
ridge_param_dist = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky'],
    'class_weight': [None, 'balanced']
}

ridge_random = run(
    RandomizedSearchCV,
    linear_model.RidgeClassifier(random_state=42),
    param_distributions=ridge_param_dist,
    n_iter=8,
    cv=3,
    scoring='accuracy',
    random_state=42
)

ridge_random = run(ridge_random.fit, X, Y)
print("Best parameters found by Random Search:")
print(ridge_random.best_params_)

best_ridge = ridge_random.best_estimator_
ridge_predictions = best_ridge.predict(Z)
return_metrics(Z_labels, ridge_predictions, "Optimized Ridge")

Best parameters found by Random Search:
{'solver': 'cholesky', 'class_weight': None, 'alpha': 0.1}
Optimized Ridge accuracy: 0.9891
Optimized Ridge precision: 0.9586
Optimized Ridge recall: 0.9969
Optimized Ridge f1: 0.9774


### SVM Tuning

In [25]:
sample_size = min(100, X.shape[0]) # 100 samples
indices = np.random.choice(X.shape[0], size=sample_size, replace=False)
X_sample = X.iloc[indices]
Y_sample = Y.iloc[indices]

svm_param_dist = {
    'C': [0.1, 1.0,5.0],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_random = run(
    RandomizedSearchCV,
    svm.SVC(probability=True, random_state=42),
    param_distributions=svm_param_dist,
    n_iter=5,
    cv=2,
    scoring='precision',
    random_state=42
)

svm_random = run(svm_random.fit, X_sample, Y_sample)
print("Best parameters found by Random Search (on sample):")
print(svm_random.best_params_)

best_svm = svm_random.best_estimator_
svm_predictions = best_svm.predict(Z)
return_metrics(Z_labels, svm_predictions, "Sample-trained SVM")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters found by Random Search (on sample):
{'kernel': 'linear', 'gamma': 'auto', 'C': 5.0}
Sample-trained SVM accuracy: 0.7626
Sample-trained SVM precision: 0.0000
Sample-trained SVM recall: 0.0000
Sample-trained SVM f1: 0.0000


### Random Forest Tuning

In [26]:
# Random Search for Random Forest
rf_param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

rf_random = run(
    RandomizedSearchCV,
    RandomForestClassifier(random_state=42),
    param_distributions=rf_param_dist,
    n_iter=8,
    cv=3,
    scoring='accuracy',
    random_state=42
)

rf_random = run(rf_random.fit, X, Y)
print("Best parameters found by Random Search:")
print(rf_random.best_params_)

best_rf = rf_random.best_estimator_
rf_predictions = best_rf.predict(Z)
return_metrics(Z_labels, rf_predictions, "Optimized Random Forest")

Best parameters found by Random Search:
{'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': False}
Optimized Random Forest accuracy: 0.9934
Optimized Random Forest precision: 0.9867
Optimized Random Forest recall: 0.9857
Optimized Random Forest f1: 0.9862


### Gradient Boosting Tuning

In [27]:
# Random Search for Gradient Boosting
gb_param_dist = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'subsample': [0.8, 1.0]
}

gb_random = run(
    RandomizedSearchCV,
    GradientBoostingClassifier(random_state=42),
    param_distributions=gb_param_dist,
    n_iter=8,
    cv=3,
    scoring='accuracy',
    random_state=42
)

gb_random = run(gb_random.fit, X, Y)
print("Best parameters found by Random Search:")
print(gb_random.best_params_)

best_gb = gb_random.best_estimator_
gb_predictions = best_gb.predict(Z)
return_metrics(Z_labels, gb_predictions, "Optimized Gradient Boosting")

Best parameters found by Random Search:
{'subsample': 1.0, 'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 7, 'learning_rate': 0.1}
Optimized Gradient Boosting accuracy: 0.9939
Optimized Gradient Boosting precision: 0.9837
Optimized Gradient Boosting recall: 0.9908
Optimized Gradient Boosting f1: 0.9872


### Ensemble Tuning

In [28]:
# Create a simple voting classifier with our tuned simple models
simple_ensemble = VotingClassifier(
    estimators=[
        ('dt', best_dt),
        ('ridge', best_ridge),
        ('svm', best_svm)
    ],
    voting='hard'
)

simple_ensemble = run(simple_ensemble.fit, X, Y)
ensemble_preds = simple_ensemble.predict(Z)
return_metrics(Z_labels, ensemble_preds, "Optimized Simple Voting Ensemble")

Optimized Simple Voting Ensemble accuracy: 0.9932
Optimized Simple Voting Ensemble precision: 0.9877
Optimized Simple Voting Ensemble recall: 0.9836
Optimized Simple Voting Ensemble f1: 0.9856


### Determine best model

In [29]:
best_accuracy_model = max(model_accuracy, key = model_accuracy.get)
best_precision_model = max(model_precision, key = model_precision.get)
best_recall_model = max(model_recall, key = model_recall.get)
best_f1_model = max(model_f1, key = model_f1.get)

print(f"Best ACCURACY for binary occupancy detection model is {best_accuracy_model} with an accuracy of {model_accuracy[best_accuracy_model]}")
for model in sorted(model_accuracy.items(), key=lambda item: item[1]):
    print(f"\t{model}")
print(f"Best PRECISION for binary occupancy detection model is {best_precision_model} with a precision of {model_precision[best_precision_model]}")
for model in sorted(model_precision.items(), key=lambda item: item[1]):
    print(f"\t{model}")
print(f"Best RECALL for binary occupancy detection model is {best_recall_model} with a recall of {model_precision[best_recall_model]}")
for model in sorted(model_recall.items(), key=lambda item: item[1]):
    print(f"\t{model}")
print(f"Best F1 for binary occupancy detection model is {best_f1_model} with an f1 of {model_f1[best_f1_model]}")
for model in sorted(model_f1.items(), key=lambda item: item[1]):
    print(f"\t{model}")

Best ACCURACY for binary occupancy detection model is Optimized Gradient Boosting with an accuracy of 0.993920233463035
	('Support Vector', 0.7626459143968871)
	('Sample-trained SVM', 0.7626459143968871)
	('Ridge', 0.9890564202334631)
	('Optimized Ridge', 0.9890564202334631)
	('Gradient Boosting', 0.9922178988326849)
	('Random Forest', 0.9929474708171206)
	('Simple Voting Ensemble', 0.9931906614785992)
	('Optimized Simple Voting Ensemble', 0.9931906614785992)
	('Decision Tree', 0.9934338521400778)
	('Optimized Decision Tree', 0.9934338521400778)
	('Optimized Random Forest', 0.9934338521400778)
	('Optimized Gradient Boosting', 0.993920233463035)
Best PRECISION for binary occupancy detection model is Decision Tree with a precision of 0.987667009249743
	('Support Vector', 0.0)
	('Sample-trained SVM', 0.0)
	('Ridge', 0.9586206896551724)
	('Optimized Ridge', 0.9586206896551724)
	('Gradient Boosting', 0.9777327935222672)
	('Optimized Gradient Boosting', 0.9837232960325534)
	('Random Forest',

### Balance Dataset (50/50 Split Occupied/Unoccupied)

To fix the issue with the SVM evaluation results, we decided to modify the dataset by balancing the number of data points labeled Occupied vs Unoccupied and retrained all the models based off the new dataset.

In [30]:
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import train_test_split

# Combine features and labels into one DataFrame
df_full = pd.concat([X, Y], axis=1)
df_full.columns = list(X.columns) + ['Occupancy']

# Separate the classes
occupied = df_full[df_full['Occupancy'] == 1]
unoccupied = df_full[df_full['Occupancy'] == 0]

# Undersample the majority class to match the minority
unoccupied_downsampled = resample(unoccupied,
                                  replace=False,
                                  n_samples=len(occupied),
                                  random_state=42)

# Combine to get balanced dataset
balanced_df = pd.concat([occupied, unoccupied_downsampled])

# Shuffle the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and labels again
X_balanced = balanced_df.drop(columns=['Occupancy'])
Y_balanced = balanced_df['Occupancy']

print("Original class distribution:")
print(Y.value_counts())

print("\nBalanced class distribution:")
print(Y_balanced.value_counts())

# Split balanced data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_balanced, Y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=Y_balanced  # preserve class balance
)

Original class distribution:
Occupancy
0    12674
1     3774
Name: count, dtype: int64

Balanced class distribution:
Occupancy
0    3774
1    3774
Name: count, dtype: int64


### Retrain SVM

In [31]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Scale the balanced data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test_bal)

# Using best parameters found from tuning
best_svm = SVC(C=5.0, kernel='linear', gamma='auto', probability=True) 
best_svm.fit(X_train_scaled, y_train_bal)

# Predict and evaluate
y_pred_bal_scaled = best_svm.predict(X_test_scaled)
return_metrics(y_test_bal, y_pred_bal_scaled, "Tuned SVM on Balanced + Scaled Data")

# Checking predictions
import pandas as pd
print("\nClass Distribution in Predictions:")
print(pd.Series(y_pred_bal_scaled).value_counts())


Tuned SVM on Balanced + Scaled Data accuracy: 0.9881
Tuned SVM on Balanced + Scaled Data precision: 0.9804
Tuned SVM on Balanced + Scaled Data recall: 0.9960
Tuned SVM on Balanced + Scaled Data f1: 0.9882

Class Distribution in Predictions:
1    767
0    743
Name: count, dtype: int64


### Retrain Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

dt_bal = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
dt_bal.fit(X_train_bal, y_train_bal)
y_pred_dt = dt_bal.predict(X_test_bal)
return_metrics(y_test_bal, y_pred_dt, "Balanced Decision Tree")


Balanced Decision Tree accuracy: 0.9894
Balanced Decision Tree precision: 0.9855
Balanced Decision Tree recall: 0.9934
Balanced Decision Tree f1: 0.9894


### Retrain Ridge

In [33]:
from sklearn.linear_model import RidgeClassifier

ridge_bal = RidgeClassifier(
    alpha=0.1,
    solver='cholesky',
    class_weight=None
)
ridge_bal.fit(X_train_scaled, y_train_bal)
y_pred_ridge = ridge_bal.predict(X_test_scaled)
return_metrics(y_test_bal, y_pred_ridge, "Balanced Ridge Classifier")


Balanced Ridge Classifier accuracy: 0.9841
Balanced Ridge Classifier precision: 0.9716
Balanced Ridge Classifier recall: 0.9974
Balanced Ridge Classifier f1: 0.9843


### Retrain Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

rf_bal = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42
)
rf_bal.fit(X_train_bal, y_train_bal)
y_pred_rf = rf_bal.predict(X_test_bal)
return_metrics(y_test_bal, y_pred_rf, "Balanced Random Forest")


Balanced Random Forest accuracy: 0.9901
Balanced Random Forest precision: 0.9868
Balanced Random Forest recall: 0.9934
Balanced Random Forest f1: 0.9901


### Retrain Gradient Boosting

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

gb_bal = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=7,
    min_samples_split=5,
    subsample=1.0,
    random_state=42
)
gb_bal.fit(X_train_scaled, y_train_bal)
y_pred_gb = gb_bal.predict(X_test_scaled)
return_metrics(y_test_bal, y_pred_gb, "Balanced Gradient Boosting")


Balanced Gradient Boosting accuracy: 0.9894
Balanced Gradient Boosting precision: 0.9830
Balanced Gradient Boosting recall: 0.9960
Balanced Gradient Boosting f1: 0.9895


### Retrain Ensemble

In [36]:
from sklearn.ensemble import VotingClassifier

ensemble_bal = VotingClassifier(
    estimators=[
        ('dt', dt_bal),
        ('ridge', ridge_bal),
        ('svm', best_svm)
    ],
    voting='hard'
)
ensemble_bal.fit(X_train_bal, y_train_bal)
y_pred_ensemble = ensemble_bal.predict(X_test_bal)
return_metrics(y_test_bal, y_pred_ensemble, "Balanced Simple Ensemble")


Balanced Simple Ensemble accuracy: 0.9894
Balanced Simple Ensemble precision: 0.9855
Balanced Simple Ensemble recall: 0.9934
Balanced Simple Ensemble f1: 0.9894
