In [27]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [28]:
# Load the occupancy-estimation dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/Occupancy_Estimation.csv")

In [29]:
# Columns used as model inputs, one row per sensor type
features = [
    'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp',
    'S1_Light', 'S2_Light', 'S3_Light', 'S4_Light',
    'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound',
    'S5_CO2', 'S5_CO2_Slope',
    'S6_PIR', 'S7_PIR',
]

# Feature matrix and target column taken from the full dataset
X_train, Y_train = df[features], df['Room_Occupancy_Count']

In [33]:
# Compute cross-validation score
# NOTE: `ignore_warnings` comes from the private `sklearn.utils._testing`
# module; it silences ConvergenceWarning raised while fitting each fold.
@ignore_warnings(category=ConvergenceWarning)
def Kfold_CV_score(df, hyperparams=0.1, num_folds=5,
                   feature_cols=None, target_col='Room_Occupancy_Count'):
    """
    Perform k-fold cross-validation for a Logistic Regression model.

    The rows of ``df`` are split, in their existing order (no shuffling),
    into ``num_folds`` contiguous folds of ``len(df) // num_folds`` rows.
    Each fold is used once as the validation set while every other row is
    used for training. When ``len(df)`` is not divisible by ``num_folds``,
    the leftover rows at the end are always part of the training set and
    are never validated on.

    Folds are selected by row *position*, so the result does not depend on
    the index labels of ``df``.

    Parameters:
    - df : pandas DataFrame
        The dataset containing the feature columns and the target column.
    - hyperparams : float, optional (default=0.1)
        Value passed as ``C`` to ``LogisticRegression`` (scikit-learn's
        inverse regularization strength).
    - num_folds : int, optional (default=5)
        Number of folds for cross-validation. Must be at least 2 and no
        larger than the number of rows in ``df``.
    - feature_cols : list of str, optional (default=None)
        Columns used as model inputs. When None, the notebook-level
        ``features`` list is used.
    - target_col : str, optional (default='Room_Occupancy_Count')
        Name of the column holding the class labels.

    Returns:
    - mean_train_score : float
        Mean of ``clf.score`` on the training rows across all folds.
    - mean_valid_score : float
        Mean of ``clf.score`` on the validation rows across all folds.

    Raises:
    - ValueError
        If ``num_folds`` is smaller than 2 or larger than ``len(df)``.
    """
    if feature_cols is None:
        # Keep existing callers working: fall back to the notebook global.
        feature_cols = features

    n_rows = len(df)
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds}")

    fold_size = n_rows // num_folds
    if fold_size == 0:
        raise ValueError(
            f"num_folds ({num_folds}) exceeds the number of rows ({n_rows})"
        )

    fold_train_scores = []
    fold_valid_scores = []

    for i in range(num_folds):
        start = i * fold_size
        end = start + fold_size

        # Select both subsets by position so that training and validation
        # rows can never overlap, whatever the index labels look like.
        valid_data = df.iloc[start:end]
        train_data = df.iloc[[*range(start), *range(end, n_rows)]]

        X_train = train_data[feature_cols]
        Y_train = train_data[target_col]

        X_valid = valid_data[feature_cols]
        Y_valid = valid_data[target_col]

        # Instantiate and train the model on the training data
        clf = LogisticRegression(class_weight='balanced', C=hyperparams)
        clf.fit(X_train, Y_train)

        fold_train_scores.append(clf.score(X_train, Y_train))
        fold_valid_scores.append(clf.score(X_valid, Y_valid))

    # Calculate the mean score across all folds
    mean_train_score = sum(fold_train_scores) / len(fold_train_scores)
    mean_valid_score = sum(fold_valid_scores) / len(fold_valid_scores)

    return mean_train_score, mean_valid_score

In [36]:
# Run 5-fold cross-validation with C=1 and report the averaged scores
mean_train_score, mean_valid_score = Kfold_CV_score(
    df,
    hyperparams=1,
    num_folds=5,
)

print(f"Mean train score: {mean_train_score}")
print(f"Mean valid score: {mean_valid_score}")

Mean train score: 0.9917324777887464
Mean valid score: 0.9546666666666667
