In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.utils._testing import ignore_warnings
from sklearn import svm
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
import random

In [8]:
# Read the CSV file
df = pd.read_csv("../data/Occupancy_Estimation.csv")

# Display the first few rows of the DataFrame
df
# df_drop = df.drop(columns = ['Date', 'Time', 'Room_Occupancy_Count'])
# # df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])


features = [ 'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light',
            'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound',
            'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR' ]
# for col in df_drop.
for feature in features:
    df[feature] = (df[feature] - df[feature].min()) / (df[feature].max() - df[feature].min())


In [9]:
# Compute cross-validation score
@ignore_warnings(category=ConvergenceWarning)

# Add documetnation to the function
def Kfold_CV_score(df, features, hyperparams=(1), num_folds=5):
    # TODO add doc string

    fold_size = len(df) // num_folds

    fold_train_scores = []
    fold_valid_scores = []


    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size

        # Extract the training and validation df for this fold 
        valid_data = df[start:end]
        train_data = df[~df.index.isin(range(start, end))]

        valid_not_norms = df[start:end]
        train_not_norms = df[~df.index.isin(range(start, end))]

        X_train = train_data[features]
        Y_train = train_not_norms['Room_Occupancy_Count']

        X_valid = valid_data[features]
        Y_valid = valid_not_norms['Room_Occupancy_Count']

        # Instantiate and train the model on the training data
        mod = svm.SVC(C=hyperparams['C'], kernel=hyperparams['kernel'], degree=hyperparams['degree'], class_weight=hyperparams['class_weight'])
        #mod = svm.SVC(C = hyperparams['C'],class_weight=hyperparams['class_weight'])
        mod.fit(X_train, Y_train)

        train_score = mod.score(X_train, Y_train)
        valid_score = mod.score(X_valid, Y_valid)

        fold_train_scores.append(train_score)
        fold_valid_scores.append(valid_score)

        # print(f"Fold {i} score: {score}")

    # Calculate the mean score across all folds
    mean_train_score = sum(fold_train_scores) / len(fold_train_scores)
    mean_valid_score = sum(fold_valid_scores) / len(fold_valid_scores)

    return mean_train_score, mean_valid_score

In [10]:

#fit the model to the training data
features = [ 'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light',
            'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound',
            'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR' ]

regularizers = []
#Hyperparameters included here are 
hyperparams = {}
hyperparams['C'] = 1
hyperparams['kernel'] = 'linear'
hyperparams['degree'] = 6
hyperparams['class_weight'] = 'balanced'   
hyperparams['max_iter'] = 10,000,000
num_folds = 5

# Shuffle the datai
df = df.sample(frac=1)

mean_train_score, mean_valid_score = Kfold_CV_score(
    df=df, features=features, hyperparams=hyperparams, num_folds=num_folds)

print(f"MODEL ARCH ==============================")
print(f"Features: {features}")
print(f"Regularizer: {hyperparams}")
print()
print(f"MODEL EVAL ==============================")
print(f"Num CV folds: {num_folds}")
print(f"Mean train score: {mean_train_score}")
print(f"Mean valid score: {mean_valid_score}")

Features: ['S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light', 'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR']
Regularizer: {'C': 1, 'kernel': 'linear', 'degree': 6, 'class_weight': 'balanced', 'max_iter': (10, 0, 0)}

Num CV folds: 5
Mean train score: 0.9942497532082921
Mean valid score: 0.9917037037037038
