In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import statsmodels.api as sm
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

In [2]:
# read in dataset
data = pd.read_csv('Occupancy.csv')

data

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-02 14:19:00,23.7000,26.2720,585.200000,749.200000,0.004764,1
1,2015-02-02 14:19:59,23.7180,26.2900,578.400000,760.400000,0.004773,1
2,2015-02-02 14:21:00,23.7300,26.2300,572.666667,769.666667,0.004765,1
3,2015-02-02 14:22:00,23.7225,26.1250,493.750000,774.750000,0.004744,1
4,2015-02-02 14:23:00,23.7540,26.2000,488.600000,779.000000,0.004767,1
...,...,...,...,...,...,...,...
20555,2015-02-18 09:15:00,20.8150,27.7175,429.750000,1505.250000,0.004213,1
20556,2015-02-18 09:16:00,20.8650,27.7450,423.500000,1514.500000,0.004230,1
20557,2015-02-18 09:16:59,20.8900,27.7450,423.500000,1521.500000,0.004237,1
20558,2015-02-18 09:17:59,20.8900,28.0225,418.750000,1632.000000,0.004279,1


In [3]:
# create datetime variable and drop date object variable
data['Datetime'] = pd.to_datetime(data['date'])
data = data.drop("date", axis=1)


In [4]:
# check for missing values
data.isna().sum()

Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
Datetime         0
dtype: int64

In [5]:
# dependent variable (Y) is Occupancy (binary 0/1)
# independent variables (X) are continuous:
#     Temperature, Humidity, Light, CO2, HumidityRatio are float

# Split the dataset into independent variables (X) and the dependent variable (y)
y_dep = data["Occupancy"]
x_indep = data[["Temperature","Humidity","Light","CO2","HumidityRatio"]]

In [6]:
#LOGISTIC REGRESSION: DEPENDENT VARIABLES ARE BINARY

# Split the data into training and testing sets
x_indep_train, x_indep_test, y_dep_train, y_dep_test = train_test_split(x_indep, y_dep, test_size=0.2, random_state=42)

# Create a logistic regression model
model_reg = LogisticRegression()

# Fit the model to the training data
model_reg.fit(x_indep_train, y_dep_train)

# Make predictions on the test data
y_dep_pred = model_reg.predict(x_indep_test)

# Evaluate the model
accuracy_reg = accuracy_score(y_dep_test, y_dep_pred)
report_reg = classification_report(y_dep_test, y_dep_pred)
cm_reg = confusion_matrix(y_dep_test, y_dep_pred)

# Print the results
print(f'Accuracy of Logistic Regression: {accuracy_reg}')
print("\n------------------------------------------------------------\n")
print('Classification Report of Logistic Regression:\n', report_reg)
print("\n------------------------------------------------------------\n")
print('Confusion Matrix of Logistic Regression:\n', cm_reg)


Accuracy of Logistic Regression: 0.9917315175097277

------------------------------------------------------------

Classification Report of Logistic Regression:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      3192
           1       0.97      1.00      0.98       920

    accuracy                           0.99      4112
   macro avg       0.98      0.99      0.99      4112
weighted avg       0.99      0.99      0.99      4112


------------------------------------------------------------

Confusion Matrix of Logistic Regression:
 [[3162   30]
 [   4  916]]


In [7]:
# GENERALIZED LINEAR MODEL

# create glm model and fit
model_glm = sm.GLM(y_dep_train, x_indep_train, family=sm.families.Binomial()).fit()

print(model_glm.summary())

# Make predictions on the test data
y_dep_pred1 = model_glm.predict(x_indep_test)

# Evaluate the model
#accuracy_glm = accuracy_score(y_dep_test, y_dep_pred1)
#cm_glm = confusion_matrix(y_dep_test, y_dep_pred1)

# Print the results
#print(f'Accuracy of GLM: {accuracy_glm}')
#print("\n------------------------------------------------------------\n")
#print('Confusion Matrix of GLM:\n', cm_glm)


                 Generalized Linear Model Regression Results                  
Dep. Variable:              Occupancy   No. Observations:                16448
Model:                            GLM   Df Residuals:                    16443
Model Family:                Binomial   Df Model:                            4
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -906.95
Date:                Mon, 22 Jan 2024   Deviance:                       1813.9
Time:                        13:16:17   Pearson chi2:                 6.33e+11
No. Iterations:                    10   Pseudo R-squ. (CS):             0.6229
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Temperature      -0.6173      0.026    -23.414

In [8]:
# RUN REGRESSION FOR EACH VARIABLE SEPARATELY

factor_list = ["Temperature","Humidity","Light","CO2","HumidityRatio"]

# function to run all the predictor variables in regression and capture the statistics
def runreg(var, num):
    tempset = data.copy()
    # x is the independent variable that influences y
    x1 = pd.DataFrame(tempset[var])
    # define the target
    # y is the dependent variable we are trying to predict
    y1 = pd.DataFrame(tempset["Occupancy"])
    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.2, random_state=42)    
    # Create a logistic regression model
    model1 = LogisticRegression()
    # Fit the model to the training data
    model1.fit(x_train, y_train)
    # Make predictions on the test data
    y_pred = model1.predict(x_test)
    # Evaluate the model
    accuracy1 = accuracy_score(y_test, y_pred)
    # Print the results
    print(f'Accuracy of Logistic Regression for Predictor variable {var}: {accuracy1}')
    print("\n------------------------------------------------------------\n")
    # create glm model and fit
    model2 = sm.GLM(y_train, x_train, family=sm.families.Binomial()).fit()
    # Print the results
    print(model2.summary())
    print("\n------------------------------------------------------------\n")

# iterate through factor_list
for idx, x in enumerate(factor_list):
    runreg(x, idx+1)
    

Accuracy of Logistic Regression for Predictor variable Temperature: 0.8120136186770428

------------------------------------------------------------

                 Generalized Linear Model Regression Results                  
Dep. Variable:              Occupancy   No. Observations:                16448
Model:                            GLM   Df Residuals:                    16447
Model Family:                Binomial   Df Model:                            0
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -9159.8
Date:                Mon, 22 Jan 2024   Deviance:                       18320.
Time:                        13:16:17   Pearson chi2:                 1.67e+04
No. Iterations:                     5   Pseudo R-squ. (CS):           -0.02879
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|     

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                 Generalized Linear Model Regression Results                  
Dep. Variable:              Occupancy   No. Observations:                16448
Model:                            GLM   Df Residuals:                    16447
Model Family:                Binomial   Df Model:                            0
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -10570.
Date:                Mon, 22 Jan 2024   Deviance:                       21139.
Time:                        13:16:17   Pearson chi2:                 1.70e+04
No. Iterations:                     4   Pseudo R-squ. (CS):            -0.2212
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
CO2           -0.0009   2.28e-05    -38.743      0.0

  y = column_or_1d(y, warn=True)
