# Import packages

In [1]:
import sys
sys.path.append('..')
from modules import preprocess

Invoking __init__.py for modules


In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# 1. Load dataset

In [3]:
# Read the raw occupancy CSV through the project's loader helper.
csv_path = '../dataset/Occupancy_Estimation.csv'
df = preprocess.load_csv_data(csv_path)

In [4]:
# Preview the first rows of the loaded frame to sanity-check the columns.
df.head()

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


# 2. Feature transformation
- Recode 'Room_Occupancy_Count' to be binary
- Drop 'Time' and 'Date' columns

In [5]:
# Run the preprocessing helpers on df, in order (their return values are not used):
#   1. recode the Room_Occupancy_Count column,
#   2. strip the Date and Time columns.
for transform in (preprocess.recode_dataset_output, preprocess.remove_time_columns):
    transform(df)

# Persist the transformed frame to the working directory, without the index.
processed_csv = 'dataset.csv'
df.to_csv(processed_csv, index=False)

In [6]:
# Preview again to confirm the Date and Time columns are gone.
df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


# 3. Drop duplicate rows (currently disabled)

In [7]:
# Duplicate removal is currently disabled; uncomment the call below to enable it.
#preprocess.delete_duplicates(df)

In [8]:
# Preview is disabled along with the duplicate-removal step in the previous cell.
#df.head()

# 4. Split dataset

In [9]:
# Separate the predictors from the target.
X, y = preprocess.get_features(df), preprocess.get_output(df)

# Hold out a test partition for evaluation.
# NOTE(review): no ratio or seed is passed here, so both come from
# split_dataset's defaults -- confirm the split is reproducible across runs.
(X_train, X_test,
 y_train, y_test) = preprocess.split_dataset(X, y)

# 4. Feature scaling

## 4.1 MinMaxScaler

In [10]:
# Min-max scaling followed by logistic regression, bundled in one pipeline
# so the scaler is fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('estimator', LogisticRegression(max_iter=500)),
    ]
)
pipeline.fit(X_train, y_train)

# Test-set predictions (not read again within this cell).
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out split and report it as a percentage.
score = pipeline.score(X_test, y_test)
accuracy_message = "Accuracy for our testing dataset using MinMaxScaler is : {:.3f}%"
print(accuracy_message.format(score*100))

Accuracy for our testing dataset using MinMaxScaler is : 99.309%


## 4.2 StandardScaler

In [11]:
# Standardisation followed by logistic regression, bundled in one pipeline
# so the scaler is fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('estimator', LogisticRegression(max_iter=500)),
    ]
)
pipeline.fit(X_train, y_train)

# Test-set predictions (not read again within this cell).
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out split and report it as a percentage.
score = pipeline.score(X_test, y_test)
accuracy_message = "Accuracy for our testing dataset using StandardScaler is : {:.3f}%"
print(accuracy_message.format(score*100))

Accuracy for our testing dataset using StandardScaler is : 99.737%


## 4.3 RobustScaler

In [12]:
# Robust scaling followed by logistic regression, bundled in one pipeline
# so the scaler is fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
        ('estimator', LogisticRegression(max_iter=500)),
    ]
)
pipeline.fit(X_train, y_train)

# Test-set predictions (not read again within this cell).
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out split and report it as a percentage.
score = pipeline.score(X_test, y_test)
accuracy_message = "Accuracy for our testing dataset using RobustScaler is : {:.3f}%"
print(accuracy_message.format(score*100))

Accuracy for our testing dataset using RobustScaler is : 99.638%


## 4.4 MaxAbsScaler

In [13]:
# Max-abs scaling followed by logistic regression, bundled in one pipeline
# so the scaler is fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ('scaler', MaxAbsScaler()),
        ('estimator', LogisticRegression(max_iter=500)),
    ]
)
pipeline.fit(X_train, y_train)

# Test-set predictions (not read again within this cell).
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out split and report it as a percentage.
score = pipeline.score(X_test, y_test)
accuracy_message = "Accuracy for our testing dataset using MaxAbsScaler is : {:.3f}%"
print(accuracy_message.format(score*100))

Accuracy for our testing dataset using MaxAbsScaler is : 99.276%
