### Import modules for dataset generation

In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


### Generate Dataset

This dataset represents a yearly pattern of device states based on different features:

To build a machine learning model that automatically sets the air conditioning (AC) temperature and decides whether to turn the lights on or off based on user behavior, we need a dataset with relevant features, such as time, temperature, and light preferences. Since no real dataset is available, this notebook generates synthetic data and provides a Python code framework for training the model.
Step 1: Synthetic Data Generation

The synthetic dataset will include:

    Date/Time: Time and date to capture seasonal patterns.
    Room Temperature: The current temperature in the room.
    Outside Temperature: External temperature that might influence AC settings.
    Occupancy: Whether the room is occupied or not.
    AC Temperature: The temperature the user typically sets.
    Light On/Off: Whether the light is turned on or off.

In [2]:
# Data generation parameters
days = 365
data_per_day = 24  # Data points per day (1 per hour)
n_samples = days * data_per_day

# Create one timestamp per hour for a full year
start_date = datetime(2023, 1, 1)
timestamps = [start_date + timedelta(hours=i) for i in range(n_samples)]

# Generate synthetic data.
# NOTE: the order of the np.random calls below determines the values produced
# for a given seed, so keep it stable if results must stay reproducible.
np.random.seed(42)
outside_temperature = np.random.normal(20, 10, n_samples)  # Simulating outside temp
room_occupancy = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])  # 70% unoccupied, 30% occupied
ac_temperature = np.where(room_occupancy == 1, np.random.normal(22, 2, n_samples), np.nan)  # Set temp if occupied
room_temperature = outside_temperature + np.random.normal(2, 1, n_samples)  # Simulate room temp

# Light status: on only when the room is occupied AND it is before 06:00 or after 18:59.
# The hour array is built once and reused for both comparisons.
hours = np.array([ts.hour for ts in timestamps])
is_dark = (hours < 6) | (hours > 18)
light_on = np.where((room_occupancy == 1) & is_dark, 1, 0)

# Create a DataFrame
df = pd.DataFrame({
    'timestamp': timestamps,
    'outside_temperature': outside_temperature,
    'room_temperature': room_temperature,
    'occupancy': room_occupancy,
    'ac_temperature': ac_temperature,
    'light_on': light_on
})

# Carry the last set AC temperature forward through unoccupied hours.
# Assign the result back instead of calling ffill(inplace=True) on the column
# selection: the in-place form operates on a temporary under pandas
# copy-on-write and would leave the DataFrame unchanged.
# Rows before the first occupied hour have nothing to carry forward and stay NaN.
df['ac_temperature'] = df['ac_temperature'].ffill()

# Display the last rows of the dataset
print(df.tail(10))

# Save to CSV so the model-training section can load it on a fresh kernel
df.to_csv('synthetic_ac_light_data.csv', index=False)

               timestamp  outside_temperature  room_temperature  occupancy   
8750 2023-12-31 14:00:00            28.810622         31.771647          1  \
8751 2023-12-31 15:00:00            25.405895         29.001936          1   
8752 2023-12-31 16:00:00            26.190832         27.373568          1   
8753 2023-12-31 17:00:00            16.244910         19.623233          1   
8754 2023-12-31 18:00:00            13.439028         14.088931          0   
8755 2023-12-31 19:00:00            32.692557         35.840841          1   
8756 2023-12-31 20:00:00            15.026829         16.745072          1   
8757 2023-12-31 21:00:00            22.940820         26.503644          1   
8758 2023-12-31 22:00:00            21.633347         23.169316          0   
8759 2023-12-31 23:00:00            34.617170         36.307279          0   

      ac_temperature  light_on  
8750       20.862715         0  
8751       23.640876         0  
8752       24.075359         0  
8753     

### Import modules for Model Training

In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import pickle

### Load data from csv and clear nan values

In [4]:
# Columns the two models are trained to predict
target_columns = ['ac_temperature', 'light_on']

df = pd.read_csv('synthetic_ac_light_data.csv')

# Report how many rows lack a value for each target
print("Checking for missing values...")
print(df[target_columns].isnull().sum())

# Keep only rows where both targets are present
df = df.dropna(subset=target_columns)

Checking for missing values...
ac_temperature    7
light_on          0
dtype: int64


In [5]:
# Parse the timestamp column so the .dt accessor is available
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Derive the time-based features from the parsed timestamps
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour  # hour of the day (0-23)
df['day_of_week'] = timestamp_parts.dayofweek  # 0=Monday, 6=Sunday

# Model inputs (X) and the two targets
feature_columns = [
    'outside_temperature',
    'room_temperature',
    'occupancy',
    'hour',
    'day_of_week',
]
X = df[feature_columns]
y_temp = df['ac_temperature']  # regression target: AC temperature
y_light = df['light_on']  # classification target: light on/off

# One shared 70/30 split so both targets stay aligned with the same feature rows
(
    X_train, X_test,
    y_temp_train, y_temp_test,
    y_light_train, y_light_test,
) = train_test_split(X, y_temp, y_light, test_size=0.3, random_state=42)

### Train the model: Random Forest Regressor

Why Choose Random Forest Regressor?

    Robustness to Overfitting:
        Random forests are an ensemble learning method that combines multiple decision trees. This reduces the risk of overfitting, especially compared to individual decision trees, which can be prone to overfitting when capturing noise in the data.

    Handling Non-Linearity:
        Random forests can capture complex relationships between features and the target variable, making them effective for datasets with non-linear patterns.

    Feature Importance:
        Random forests provide insights into feature importance, allowing you to see which features (like outside temperature, room temperature, occupancy, etc.) have the most impact on the prediction.

    Versatility:
        They can handle both regression (predicting continuous values, such as AC temperature) and classification (predicting categorical values, such as light status) tasks effectively.

    Robust to Outliers:
        Random forests are less sensitive to outliers compared to linear regression models, making them suitable for real-world data, which can often contain anomalies.

    Automatic Handling of Missing Values:
        Some random forest implementations can handle missing feature values natively, but support depends on the library and its version. This notebook does not rely on it: rows with missing target values are dropped before training.

How Random Forest Works
1. Ensemble of Decision Trees:

    Random forest is built upon the idea of creating a "forest" of decision trees. Each tree in the forest is trained on a random subset of the training data (both in terms of samples and features).

2. Bootstrap Sampling:

    During training, a random sample (with replacement) of the dataset is created. This is known as bootstrapping. Each tree is built using a different bootstrapped sample, which introduces diversity among the trees.

3. Random Feature Selection:

    When splitting nodes in each decision tree, a random subset of features is selected instead of considering all features. This further decorrelates the trees, which helps reduce variance and improve generalization.

4. Making Predictions:

    For regression tasks, each tree in the forest makes a prediction, and the final prediction is obtained by averaging the predictions of all individual trees. For classification tasks, the mode of the predictions from all trees is taken as the final output.

5. Feature Importance Calculation:

    Random forests provide a measure of feature importance based on how much each feature contributes to reducing the impurity (e.g., Gini impurity or mean squared error) across all trees in the forest. This allows you to identify which features are most influential in predicting the target variable.

Visual Representation

    Training Phase:
        The training dataset is split into multiple subsets using bootstrap sampling.
        Decision trees are trained on these subsets using random feature selection.

    Prediction Phase:
        Each tree provides a prediction; these predictions are then aggregated to get the final output.

In [6]:
# Fit one forest per target on the shared training features
# (fit() returns the estimator itself, so construction and fitting can be chained)
regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_temp_train)
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_light_train)

# Score both models on the held-out rows
y_temp_pred, y_light_pred = regressor.predict(X_test), classifier.predict(X_test)

# Regression is judged by mean squared error, classification by accuracy.
# NOTE(review): the synthetic AC temperature is drawn independently of the
# features, so the MSE is presumably bounded below by that noise variance.
mse_temp = mean_squared_error(y_temp_test, y_temp_pred)
accuracy_light = accuracy_score(y_light_test, y_light_pred)

print(f'AC Temperature Prediction MSE: {mse_temp:.2f}')
print(f'Light On/Off Prediction Accuracy: {accuracy_light:.2%}')

AC Temperature Prediction MSE: 4.36
Light On/Off Prediction Accuracy: 100.00%



## Save the models using pickle

In [7]:
# Persist each trained model to its own pickle file
for model_path, model in (
    ('ac_temperature_model.pkl', regressor),
    ('light_on_off_model.pkl', classifier),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

print("Models saved successfully.")

Models saved successfully.


## Import the model and run

In [8]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

## Load model from pickle

In [9]:
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    # Only unpickle files you trust: pickle.load can execute arbitrary code.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Restore the two models written by the training section
regressor = _unpickle('ac_temperature_model.pkl')
classifier = _unpickle('light_on_off_model.pkl')

# Column names (and order) of the input frame passed to the models
feature_names = [
    'outside_temperature',
    'room_temperature',
    'occupancy',
    'hour',
    'day_of_week',
]

In [12]:
# Function to get input from the user and make predictions
def get_user_input(outside_temp:float, room_temp:float, room_occupancy:bool, use_curr_time:bool, time_str=None):
    """Build the single-row feature frame expected by the prediction models.

    Parameters
    ----------
    outside_temp : float
        Outside temperature.
    room_temp : float
        Current room temperature.
    room_occupancy : bool
        Whether the room is occupied; stored as 0/1 in the returned frame.
    use_curr_time : bool
        If truthy, take hour and weekday from the current system time.
        Otherwise the time of day comes from `time_str` (or a prompt).
    time_str : str, optional
        Time of day as 'HH:MM' (24-hour). Only used when `use_curr_time` is
        falsy. When omitted, the user is prompted via `input()`.

    Returns
    -------
    pandas.DataFrame
        One row with the columns listed in the module-level `feature_names`.

    Raises
    ------
    ValueError
        If the supplied time does not match the 'HH:MM' format.
    """
    now = datetime.now()

    if use_curr_time:
        current_time = now
    else:
        if time_str is None:
            time_str = input("Enter time (HH:MM, 24-hour format): ")
        parsed = datetime.strptime(time_str.strip(), '%H:%M')
        # strptime with a time-only format fills in the date 1900-01-01, so
        # weekday() on the parsed value would always be Monday. Anchor the
        # entered time of day to today's date instead.
        current_time = now.replace(hour=parsed.hour, minute=parsed.minute, second=0, microsecond=0)

    # Extract hour and day of the week
    hour = current_time.hour
    day_of_week = current_time.weekday()  # Monday=0, Sunday=6

    # int() keeps occupancy in the 0/1 encoding even when a bool is passed
    row = [outside_temp, room_temp, int(room_occupancy), hour, day_of_week]
    return pd.DataFrame([row], columns=feature_names)


def predict_ac_and_light(features):
    """Run both loaded models on `features` and return their first predictions.

    Returns a tuple `(ac_temperature, light_flag)`: the first element of the
    regressor's output followed by the first element of the classifier's output.
    """
    # Regressor first, then classifier; indexing happens after both have run
    predictions = [model.predict(features) for model in (regressor, classifier)]
    ac_setpoint, light_flag = (batch[0] for batch in predictions)
    return ac_setpoint, light_flag

# Test case

In [13]:
# Build one feature row for a hot, occupied room at the current time
features = get_user_input(outside_temp=45, room_temp=36, room_occupancy=1, use_curr_time=1)
room_occupancy = features['occupancy'].iloc[0]
ac_temp, light_on = predict_ac_and_light(features)

if room_occupancy != 1:
    # Empty room: ignore the model output and report everything as off
    for line in ("                   AC : OFF", "Empty room ---> ", "                Light : OFF"):
        print(line)
else:
    # Occupied room: report the predicted setpoint and light state
    print(f"\nPredicted AC Temperature: {ac_temp:.2f} °C")
    print("Light Status: ON" if light_on == 1 else "Light Status: OFF")


Predicted AC Temperature: 22.55 °C
Light Status: ON
