In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import featuretools as ft
from scipy import stats
from scipy.stats import norm
sns.set()
%matplotlib inline

In [None]:
# Load the training split from 'training-data.csv' and preview it.
raw_data=pd.read_csv('training-data.csv')
raw_data

In [None]:
# Load the companion frame from 'testing-data.csv'.
testing_data = pd.read_csv('testing-data.csv')

In [None]:
# Preview the frame read from 'testing-data.csv'.
testing_data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
raw_data.describe()

In [None]:
# info() lists each column's dtype and non-null count, which reveals missing values.
raw_data.info()

In [None]:
# Response column; used as the y-axis of the scatter plots further down.
y=raw_data['Appliances']
# Every column from the first one through 'rv2' (label slices are inclusive).
# NOTE(review): x1 is not referenced again in the visible notebook — confirm it is still needed.
x1=raw_data.loc[:, :'rv2']

In [None]:
# Annotated pairwise-correlation heatmap of the training frame on a 20x20 canvas.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(data=raw_data.corr(), ax=ax, annot=True)

## Scatter Plot
We use scatter plots to see the relationship between each feature and the target (`Appliances`).

In [None]:
def plot_scatter_grid(frame, target, grid=(3, 3), figsize=(10, 10)):
    """Scatter every column of ``frame`` (except the target itself) against ``target``.

    Columns are placed left-to-right on ``grid``-shaped figures and a new figure
    is started whenever the current one is full, so no column is skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns plotted on the x-axis, one panel per column.
    target : pandas.Series
        Values plotted on the y-axis; ``target.name`` labels that axis.
    grid : tuple of int
        ``(rows, cols)`` subplot layout of each figure.
    figsize : tuple of float
        Figure size handed to ``plt.figure``.
    """
    n_rows, n_cols = grid
    per_figure = n_rows * n_cols
    # Positional indices, so the x-data is still selected with iloc as before.
    positions = [i for i, name in enumerate(frame.columns) if name != target.name]
    for start in range(0, len(positions), per_figure):
        fig = plt.figure(figsize=figsize)
        fig.subplots_adjust(hspace=0.4, wspace=0.4)
        for slot, position in enumerate(positions[start:start + per_figure], start=1):
            ax = fig.add_subplot(n_rows, n_cols, slot)
            ax.set_xlabel(frame.columns[position])
            # Label with the plotted series' own name instead of a positional guess.
            ax.set_ylabel(target.name)
            ax.scatter(x=frame.iloc[:, position], y=target)


# The three hand-written cells used range(1, 10), range(11, 20) and range(21, 27),
# which silently left out columns 0, 10 and 20; the helper walks every column.
plot_scatter_grid(raw_data, y)

In [None]:
# Pairwise relationship grid for the first five columns; the trailing ';' hides the returned object's repr.
sns.pairplot(raw_data.iloc[:, :5]);

## Plot the Data Distribution

In [None]:
def plot_distribution_grid(frame, grid=(3, 3), figsize=(10, 10)):
    """Draw the distribution of every column of ``frame`` with a fitted normal curve.

    Panels are laid out on ``grid``-shaped figures; a new figure is started
    whenever the current one is full, so no column is skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        One panel is drawn per column.
    grid : tuple of int
        ``(rows, cols)`` subplot layout of each figure.
    figsize : tuple of float
        Figure size handed to ``plt.figure``.
    """
    n_rows, n_cols = grid
    per_figure = n_rows * n_cols
    n_columns = frame.shape[1]
    for start in range(0, n_columns, per_figure):
        fig = plt.figure(figsize=figsize)
        fig.subplots_adjust(hspace=0.4, wspace=0.4)
        for position in range(start, min(start + per_figure, n_columns)):
            ax = fig.add_subplot(n_rows, n_cols, position - start + 1)
            # NOTE(review): distplot is deprecated in recent seaborn releases; kept
            # here because the rest of the notebook relies on its fit= overlay.
            sns.distplot(frame.iloc[:, position], fit=norm, ax=ax)


# The three hand-written cells used range(1, 10), range(11, 20) and range(21, 28),
# which silently left out columns 0, 10 and 20; the helper walks every column.
plot_distribution_grid(raw_data)

## Normalize the Abnormal Data Distribution using Log Transformation
After visualizing the data, we see that some of the features are not normally distributed, so we apply a log transformation to bring them closer to a normal distribution.
The features with a non-normal distribution are:
* RH_5
* Windspeed

In [None]:
# Distribution of RH_5 before any transformation.
sns.distplot(raw_data.loc[:, 'RH_5'])

In [None]:
# log(1 + x) of RH_5, kept as a one-column DataFrame.
# np.log1p is applied to the whole column at once instead of calling a Python
# lambda per element through applymap.
log_rh5 = np.log1p(raw_data[['RH_5']])

In [None]:
# Preview the log-transformed RH_5 column.
log_rh5

In [None]:
# Distribution of the log-transformed RH_5 with a fitted normal curve overlaid (fit=norm).
sns.distplot(log_rh5, fit=norm)

In [None]:
# Distribution of Windspeed before any transformation.
sns.distplot(raw_data.loc[:, 'Windspeed'])

In [None]:
# log(1 + x) of Windspeed, kept as a one-column DataFrame.
# np.log1p is applied to the whole column at once instead of calling a Python
# lambda per element through applymap.
log_windspeed = np.log1p(raw_data[['Windspeed']])

In [None]:
# Summary statistics of the log-transformed Windspeed column.
log_windspeed.describe()

In [None]:
# Distribution of the log-transformed Windspeed (no fitted curve in this view).
sns.distplot(log_windspeed)

In [None]:
# Side-by-side comparison of Windspeed before and after the log transformation.
# Each plt.subplot call makes that panel current, so the title and distplot
# that follow are drawn on it.
plt.figure(figsize=(10,10))
plt.subplot(2,2,1)
plt.title('Before Log Transformation')
sns.distplot(raw_data.loc[:, 'Windspeed'], fit=norm)

# Second panel of the 2x2 grid: the transformed values.
plt.subplot(2,2,2)
plt.title('After Log Transformation')
sns.distplot(log_windspeed, fit=norm)

In [None]:
# Build the transformed frame in one idempotent step: a new frame equal to the
# raw data with RH_5 and Windspeed replaced by their log-transformed versions.
# assign() returns a fresh DataFrame, so raw_data is left untouched and
# re-running this cell always yields the same result.
copy_data = raw_data.assign(
    RH_5=log_rh5['RH_5'],
    Windspeed=log_windspeed['Windspeed'],
)
copy_data

In [None]:
# Annotated correlation heatmap of the frame holding the log-transformed columns.
fig = plt.figure(figsize=(25, 20))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(data=copy_data.corr(), ax=ax, annot=True)

In [None]:
# Annotated correlation heatmap of the untransformed frame at the same 25x20 size, for comparison.
fig = plt.figure(figsize=(25, 20))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(data=raw_data.corr(), ax=ax, annot=True)

## Boxplot to see the Outlier from Data

In [None]:
def plot_box_grid(frame, grid=(3, 3), figsize=(10, 10)):
    """Draw one horizontal box plot per column of ``frame``.

    Panels are laid out on ``grid``-shaped figures; a new figure is started
    whenever the current one is full, so no column is skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        One panel is drawn per column.
    grid : tuple of int
        ``(rows, cols)`` subplot layout of each figure.
    figsize : tuple of float
        Figure size handed to ``plt.figure``.
    """
    n_rows, n_cols = grid
    per_figure = n_rows * n_cols
    n_columns = frame.shape[1]
    for start in range(0, n_columns, per_figure):
        fig = plt.figure(figsize=figsize)
        fig.subplots_adjust(hspace=0.4, wspace=0.4)
        for position in range(start, min(start + per_figure, n_columns)):
            ax = fig.add_subplot(n_rows, n_cols, position - start + 1)
            # Pass the values as x= explicitly: a bare positional argument is
            # deprecated in seaborn and is interpreted as `data` by newer releases.
            sns.boxplot(x=frame.iloc[:, position], ax=ax)


# The three hand-written cells used range(1, 10), range(11, 20) and range(21, 28),
# which silently left out columns 0, 10 and 20; the helper walks every column.
plot_box_grid(raw_data)

## Let's Remove the Outlier
As the boxplots show, several features contain many outliers, so we need to remove them. There are two common ways to detect them:
* IQR (Q3 - Q1)
* Z-Score

In [None]:
# Independent working copy of the training frame for the outlier experiments.
detect_outlier_data = raw_data.copy()

In [None]:
# Preview the working copy used for outlier detection.
detect_outlier_data

In [None]:
# Summary statistics before any rows are removed.
detect_outlier_data.describe()

In [None]:
# Per-column first and third quartiles. Their difference, the interquartile
# range (IQR = Q3 - Q1), is the spread of the middle half of each column.
Q1, Q3 = (detect_outlier_data.quantile(level) for level in (0.25, 0.75))
IQR = Q3.sub(Q1)

In [None]:
# Show the interquartile range of every column.
IQR

In [None]:
# Flag each value lying more than 1.5 * IQR below Q1 or above Q3 of its column.
iqr_outlier = (
    detect_outlier_data.lt(Q1 - 1.5 * IQR)
    | detect_outlier_data.gt(Q3 + 1.5 * IQR)
)

In [None]:
# Boolean frame: True marks a value outside the IQR fences of its column.
iqr_outlier

In [None]:
# Absolute z-score of every value, standardised per column (scipy's default axis=0).
z = np.abs(stats.zscore(detect_outlier_data))

In [None]:
# Inspect the absolute z-scores.
z

In [None]:
# Boolean mask: True where the absolute z-score is below 3.
z < 3

## Detect Outlier using Z Score

In [None]:
# Keep only the rows in which every column's absolute z-score is below 3.
no_outlier = detect_outlier_data.loc[(z < 3).all(axis=1)]

In [None]:
# Preview the rows that survived the z-score filter.
no_outlier

In [None]:
# Summary statistics after the z-score filter.
no_outlier.describe()

In [None]:
# RH_5 before and after z-score filtering, side by side in the top row of a 3x2 grid.
fig = plt.figure(figsize=(10,10))
fig.suptitle('Using Z-Score', fontsize=18)
plt.subplot(3,2, 1)
plt.title('Before Removing Outlier')
# Values are passed as x= explicitly: a bare positional argument is deprecated
# in seaborn and is interpreted as `data` by newer releases.
sns.boxplot(x=raw_data.loc[:, 'RH_5'])

plt.subplot(3,2, 2)
plt.title('After Removing Outlier')
sns.boxplot(x=no_outlier.loc[:, 'RH_5'])

## Detect Outlier using IQR

In [None]:
# Keep only the rows in which no column was flagged by the IQR fences
# (not any(flag) is the same as all(not flag) for a boolean frame).
iqr_no_outlier = detect_outlier_data.loc[(~iqr_outlier).all(axis=1)]

In [None]:
# Preview the rows that survived the IQR filter.
iqr_no_outlier

In [None]:
def plot_outlier_removal(before, after, column, method):
    """Box-plot one column before and after outlier removal, side by side.

    Parameters
    ----------
    before : pandas.DataFrame
        Frame holding the unfiltered values.
    after : pandas.DataFrame
        Frame holding the values that survived the filter.
    column : str
        Name of the column to compare.
    method : str
        Name of the filtering method, shown in the figure title.
    """
    fig = plt.figure(figsize=(10, 10))
    fig.suptitle('Using ' + method, fontsize=18)

    # Both panels sit in the top row of a 3x2 grid, as in the hand-written cells.
    ax_before = fig.add_subplot(3, 2, 1)
    ax_before.set_title('Before Removing Outlier')
    # x= is explicit: a bare positional argument is deprecated in seaborn and
    # is interpreted as `data` by newer releases.
    sns.boxplot(x=before.loc[:, column], ax=ax_before)

    ax_after = fig.add_subplot(3, 2, 2)
    ax_after.set_title('After Removing Outlier')
    sns.boxplot(x=after.loc[:, column], ax=ax_after)


# One figure per inspected column, in the order of the six original cells.
for inspected_column in ['RH_5', 'RH_1', 'RH_2', 'T2', 'T3', 'RH_3']:
    plot_outlier_removal(raw_data, iqr_no_outlier, inspected_column, 'IQR')

In [None]:
# Show the IQR-filtered frame again after the visual checks.
iqr_no_outlier

In [None]:
# Summary statistics of the 'Appliances' column after IQR filtering.
iqr_no_outlier.loc[:, 'Appliances'].describe()

## Transform the Response / Dependent Data
We have 2 responses here, Appliances and Lights, so we transform them to bring their distributions closer to normal.
Based on: https://www.researchgate.net/post/Regression_tree_analysis_does_not_require_transforming_variables_but_is_it_a_problem_if_we_for_eg_log-transform_the_response_variable

In [None]:
# log(1 + x) of the Appliances response, kept as a one-column DataFrame.
# np.log1p is applied to the whole column at once instead of calling a Python
# lambda per element through applymap.
log_appliances = np.log1p(copy_data[['Appliances']])

In [None]:
# Preview the log-transformed Appliances column.
log_appliances

In [None]:
# Side-by-side comparison of Appliances before and after the log transformation.
# Each plt.subplot call makes that panel current, so the title and distplot
# that follow are drawn on it.
fig = plt.figure(figsize=(10,10))
plt.subplot(2,2,1)
plt.title('Before Log on Appliances')
sns.distplot(copy_data.loc[:, 'Appliances'], fit=norm)

# Second panel of the 2x2 grid: the transformed values.
plt.subplot(2,2,2)
plt.title('After Log on Appliances')
sns.distplot(log_appliances, fit=norm)

In [None]:
# log(1 + x) of the lights response, kept as a one-column DataFrame.
# np.log1p is applied to the whole column at once instead of calling a Python
# lambda per element through applymap.
log_lights = np.log1p(copy_data[['lights']])

In [None]:
# Preview the log-transformed lights column.
log_lights

In [None]:
# Side-by-side comparison of lights before and after the log transformation.
# Each plt.subplot call makes that panel current, so the title and distplot
# that follow are drawn on it.
fig = plt.figure(figsize=(10,10))
plt.subplot(2,2,1)
plt.title('Before Log on Lights')
sns.distplot(copy_data.loc[:, 'lights'], fit=norm)

# Second panel of the 2x2 grid: the transformed values.
plt.subplot(2,2,2)
plt.title('After Log on Lights')
sns.distplot(log_lights, fit=norm)

In [None]:
# Summary statistics of the untransformed training frame, for reference.
raw_data.describe()

In [None]:
# Summary statistics of the log-transformed lights column.
log_lights.describe()

## Plot the Data Distribution after Removing the Outlier

In [None]:
# Distribution of every column of the IQR-filtered frame, nine panels per figure.
# The three hand-written cells used range(0, 9), range(10, 19) and range(20, 28),
# which silently left out columns 9 and 19; this loop walks every column.
panels_per_figure = 9
total_columns = iqr_no_outlier.shape[1]
for first in range(0, total_columns, panels_per_figure):
    fig = plt.figure(figsize=(10, 10))
    fig.subplots_adjust(hspace=0.4, wspace=0.4)
    for position in range(first, min(first + panels_per_figure, total_columns)):
        # Subplot slots are 1-based, so the first column of each figure goes to slot 1.
        plt.subplot(3, 3, position - first + 1)
        sns.distplot(iqr_no_outlier.iloc[:, position], fit=norm)

## Normalization using Min-Max
We use the Combined data from Training and Testing.

In [None]:
# Preview the training frame before it is combined with the testing frame.
raw_data

In [None]:
# Preview the testing frame before it is combined with the training frame.
testing_data

In [None]:
# Stack the first 24 columns of the training and testing frames, training rows
# first, and renumber the rows 0..n-1.
combined_data = pd.concat(
    [raw_data.iloc[:, :24], testing_data.iloc[:, :24]],
    ignore_index=True,
)

In [None]:
# Preview the stacked training + testing frame before scaling.
combined_data

### Do the Normalization

In [None]:
# Min-max scale the first 24 columns to [0, 1]: (x - min) / (max - min).
# The per-column minimum and maximum are computed once and applied to whole
# columns; the previous version recomputed them inside a lambda for every
# single element, which is quadratic in the number of rows.
# NOTE(review): a constant column has max == min and comes out as NaN — confirm none exist.
features = combined_data.iloc[:, :24]
column_min = features.min()
column_max = features.max()
combined_data[features.columns] = (features - column_min) / (column_max - column_min)

In [None]:
# Preview the frame after min-max scaling.
combined_data

In [None]:
# Training rows come first in combined_data; take exactly as many rows as the
# training frame has instead of relying on a hard-coded 11999.
combined_data.iloc[:len(raw_data)]

In [None]:
# Testing rows follow the training rows in combined_data; start right after the
# last training row instead of relying on a hard-coded 11999.
combined_data.iloc[len(raw_data):]

## Save to CSV
After the normalization, save the DataFrames to CSV files.

In [None]:
# Re-attach the two untouched response columns to the scaled training rows.
# The split point is len(raw_data) rather than a hard-coded 11999, so the rows
# always line up with raw_data's index when concatenated column-wise.
training_data = pd.concat(
    [combined_data.iloc[:len(raw_data)], raw_data[['Appliances', 'lights']]],
    axis=1,
)

In [None]:
# Preview the scaled features together with the response columns.
training_data

In [None]:
# Persist the scaled training frame; with to_csv's defaults the row index is
# written as the first, unnamed CSV column.
training_data.to_csv('normalisasi_training.csv')

In [None]:
# Persist the scaled testing rows, i.e. everything after the training rows.
# The split point is len(raw_data) rather than a hard-coded 11999.
# NOTE(review): the default index is written too, so the first CSV column
# starts at len(raw_data) — confirm downstream readers expect that.
combined_data.iloc[len(raw_data):].to_csv('normalisasi_testing.csv')