# Applying Machine Learning and Deep Learning to identify home appliances consuming excess power

## Copyright (c) 2018, Faststream Technologies
## Author: Sudhanva Narayana

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

### Import dataset ignoring headers

In [2]:
df = pd.read_csv('../data/home_data.csv')

### Dataset

In [3]:
df.head()

Unnamed: 0,device,building,floor,room,weather_type,date,from_time,to_time,no_of_people,time_stayed_mins,power
0,AC,1,1,101,low cold,02-01-2018,23:00:00,00:00:00,1,1,1180
1,AC,1,1,101,low cold,03-01-2018,20:00:00,21:00:00,3,11,1959
2,AC,1,1,101,low hot,04-01-2018,15:00:00,16:00:00,1,13,1708
3,Computer,1,1,101,very cold,27-01-2018,03:00:00,04:00:00,9,29,339
4,Computer,1,1,101,very hot,28-01-2018,22:00:00,23:00:00,4,25,443


### Importing dataset

In [4]:
X = df.iloc[:, [0, 3, 4, 5, 6, 7, 8, 9]].values
y = df.iloc[:, 10].values

### Encoding Categorical Variables

In [5]:
# Encoding categorical data
labelencoder_X_0 = LabelEncoder()
X[:, 0] = labelencoder_X_0.fit_transform(X[:, 0])
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
labelencoder_X_2 = LabelEncoder()
X[:, 3] = labelencoder_X_2.fit_transform(X[:, 2])
labelencoder_X_4 = LabelEncoder()
X[:, 4] = labelencoder_X_4.fit_transform(X[:, 4])
labelencoder_X_5 = LabelEncoder()
X[:, 5] = labelencoder_X_5.fit_transform(X[:, 5])

onehotencoder = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4, 5])
hot_X = onehotencoder.fit_transform(X).toarray()

### Avoiding the dummy variable trap

In [6]:
columns = df.columns
dummies = []
dummies_sum = 0
categories = [0, 1, 2, 3, 4, 5]

for category in categories:
    dummies_sum += category * (df.iloc[:, category].unique().size)
    dummies.append(dummies_sum)
    
# Removing dummy variables
hot_X = np.delete(hot_X, dummies, 1)

  # This is added back by InteractiveShellApp.init_path()


### Splitting the dataset into the Training set and Test set (75%, 25%)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### Feature Scaling

In [8]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)



In [9]:
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [10]:
y_pred = regressor.predict(X_test)
y_pred

array([1383.6, 1614. ,  182.2, ..., 1725. , 1657.3, 1464.1])

In [None]:
joblib.dump(regressor, 'random_forest.pkl') 