# Applying Machine Learning and Deep Learning to identify home appliances consuming excess power

## Copyright (c) 2018, Faststream Technologies
## Author: Sudhanva Narayana

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

### Import dataset ignoring headers

In [2]:
df = pd.read_csv('../data/trail1/home_data.csv')

### Dataset

In [3]:
df.head()

Unnamed: 0,device,room,weather_type,date,from_time,to_time,time,no_of_people,time_stayed_mins,power
0,AC,100,low cold,2018-01-01,03:00:00,04:00:00,midnight,12,29,1091
1,AC,100,low hot,2018-01-01,05:00:00,06:00:00,early morning,15,53,1055
2,AC,100,low hot,2018-01-01,09:00:00,10:00:00,morning,6,26,1914
3,AC,100,cold,2018-01-01,13:00:00,14:00:00,afternoon,8,40,1980
4,AC,100,low cold,2018-01-01,14:00:00,15:00:00,afternoon,5,50,1531


### Importing dataset

In [4]:
X = df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]].values
y = df.iloc[:, 9].values

### Encoding Categorical Variables

In [5]:
# Encoding categorical data
labelencoder_X_0 = LabelEncoder()
X[:, 0] = labelencoder_X_0.fit_transform(X[:, 0])
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
labelencoder_X_3 = LabelEncoder()
X[:, 3] = labelencoder_X_3.fit_transform(X[:, 3])
labelencoder_X_4 = LabelEncoder()
X[:, 4] = labelencoder_X_4.fit_transform(X[:, 4])
labelencoder_X_5 = LabelEncoder()
X[:, 5] = labelencoder_X_5.fit_transform(X[:, 5])
labelencoder_X_6 = LabelEncoder()
X[:, 6] = labelencoder_X_6.fit_transform(X[:, 6])

onehotencoder = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4, 5, 6])
hot_X = onehotencoder.fit_transform(X).toarray()

### Avoiding the dummy variable trap

In [6]:
columns = df.columns
dummies = []
dummies_sum = 0
categories = [0, 1, 2, 3, 4, 5, 6]

for category in categories:
    dummies_sum += category * (df.iloc[:, category].unique().size)
    dummies.append(dummies_sum)
    
# Removing dummy variables
hot_X = np.delete(hot_X, dummies, 1)

  # This is added back by InteractiveShellApp.init_path()


### Splitting the dataset into the Training set and Test set (75%, 25%)

In [7]:
with tf.device('/gpu:0'):
    X_train, X_test, y_train, y_test = train_test_split(hot_X, y, test_size=0.25, random_state=0)

### Random Forest Regression

In [8]:
with tf.device('/gpu:0'):
    regressor = RandomForestRegressor(n_estimators=300, random_state=0)
    regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
y_pred = regressor.predict(X_test)
y_pred

array([2332.19333333, 2416.32      , 1417.82666667, ...,  197.94      ,
        209.99      , 1423.96      ])

In [10]:
joblib.dump(regressor, '../model/random_forest.pkl') 

['../model/random_forest.pkl']