<center> <h2> Data Preparation </h2> </center>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

Attribute Information:

1. date: Date in format dd/mm/yyyy
2. time: time in format hh:mm:ss
3. global_active_power: household global minute-averaged active power (in kilowatt)
4. global_reactive_power: household global minute-averaged reactive power (in kilowatt)
5. voltage: minute-averaged voltage (in volt)
6. global_intensity: household global minute-averaged current intensity (in ampere)
7. sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
8. sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
9. sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner

In [2]:
# Read the raw household power measurements; fields in the file are
# separated by semicolons rather than tabs.
dataset = pd.read_table('household_power_consumption.txt', sep=';')

# Preview the first few rows.
dataset.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [9]:
# Work on a fixed-size random subset of the rows. The seed makes the draw
# repeatable, and ignore_index renumbers the sampled rows from 0.
dataset = dataset.sample(n=40000, random_state=0, ignore_index=True)

In [10]:
# Preview the first five rows of the sampled frame.
dataset.head(5)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,19/2/2010,12:55:00,0.332,0.074,241.44,1.4,0.0,1.0,0.0
1,20/9/2008,01:19:00,0.624,0.128,242.91,2.6,0.0,0.0,0.0
2,28/2/2007,13:28:00,0.214,0.0,240.92,0.8,0.0,0.0,0.0
3,27/10/2010,00:27:00,0.876,0.238,246.75,3.6,0.0,0.0,1.0
4,14/7/2007,14:50:00,0.638,0.198,244.19,3.4,2.0,1.0,0.0


In [11]:
# Number of rows that exactly repeat an earlier row (the first occurrence
# of each row is not counted as a duplicate).
dataset.duplicated(keep='first').sum()

0

In [12]:
# Summarise column dtypes and non-null counts of the sampled frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   40000 non-null  object 
 1   Time                   40000 non-null  object 
 2   Global_active_power    40000 non-null  object 
 3   Global_reactive_power  40000 non-null  object 
 4   Voltage                40000 non-null  object 
 5   Global_intensity       40000 non-null  object 
 6   Sub_metering_1         40000 non-null  object 
 7   Sub_metering_2         40000 non-null  object 
 8   Sub_metering_3         39471 non-null  float64
dtypes: float64(1), object(8)
memory usage: 2.7+ MB


###### Changing the Date column from object (string) to datetime

In [13]:
# The raw file stores dates day-first (dd/mm/yyyy, e.g. 16/12/2006). Without an
# explicit format, ambiguous values such as 1/2/2007 can be read month-first,
# which silently corrupts Date and every column derived from it.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d/%m/%Y')

###### Extracting year,month and day from Date feature

In [14]:
# Add one integer column per calendar component of the parsed Date column.
for part in ('year', 'month', 'day'):
    dataset[part] = getattr(dataset['Date'].dt, part)

##### Converting the object columns to their respective data types

In [15]:
# Some rows contain the literal string '?' in one or more columns.
# Flag every row where any column equals '?' and drop those rows in place.
has_placeholder = (dataset == '?').any(axis=1)
dataset.drop(index=dataset.index[has_placeholder], inplace=True)

In [16]:
# Re-check row count, dtypes and non-null counts after removing the '?' rows.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39471 entries, 0 to 39999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   39471 non-null  datetime64[ns]
 1   Time                   39471 non-null  object        
 2   Global_active_power    39471 non-null  object        
 3   Global_reactive_power  39471 non-null  object        
 4   Voltage                39471 non-null  object        
 5   Global_intensity       39471 non-null  object        
 6   Sub_metering_1         39471 non-null  object        
 7   Sub_metering_2         39471 non-null  object        
 8   Sub_metering_3         39471 non-null  float64       
 9   year                   39471 non-null  int64         
 10  month                  39471 non-null  int64         
 11  day                    39471 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory

In [17]:
# Columns at positions 2..7 (Global_active_power through Sub_metering_2 in the
# info() listing) are still stored as objects; cast them all to float at once.
dataset = dataset.astype(dict.fromkeys(dataset.columns[2:8], 'float'))

In [18]:
# Confirm that the measurement columns are now float64.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39471 entries, 0 to 39999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   39471 non-null  datetime64[ns]
 1   Time                   39471 non-null  object        
 2   Global_active_power    39471 non-null  float64       
 3   Global_reactive_power  39471 non-null  float64       
 4   Voltage                39471 non-null  float64       
 5   Global_intensity       39471 non-null  float64       
 6   Sub_metering_1         39471 non-null  float64       
 7   Sub_metering_2         39471 non-null  float64       
 8   Sub_metering_3         39471 non-null  float64       
 9   year                   39471 non-null  int64         
 10  month                  39471 non-null  int64         
 11  day                    39471 non-null  int64         
dtypes: datetime64[ns](1), float64(7), int64(3), object(1)
memory

In [19]:
# Build the target column as the row-wise sum of the three sub-metering
# readings (documented above as watt-hours of active energy).
dataset['Total_power_consumed'] = (
    dataset['Sub_metering_1']
    .add(dataset['Sub_metering_2'])
    .add(dataset['Sub_metering_3'])
)

# Preview the frame with the new column.
dataset.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,year,month,day,Total_power_consumed
0,2010-02-19,12:55:00,0.332,0.074,241.44,1.4,0.0,1.0,0.0,2010,2,19,1.0
1,2008-09-20,01:19:00,0.624,0.128,242.91,2.6,0.0,0.0,0.0,2008,9,20,0.0
2,2007-02-28,13:28:00,0.214,0.0,240.92,0.8,0.0,0.0,0.0,2007,2,28,0.0
3,2010-10-27,00:27:00,0.876,0.238,246.75,3.6,0.0,0.0,1.0,2010,10,27,1.0
4,2007-07-14,14:50:00,0.638,0.198,244.19,3.4,2.0,1.0,0.0,2007,7,14,3.0


In [20]:
# Persist the prepared frame as CSV text. Note the target path has no file
# extension, and with pandas' default settings the row index is written as an
# extra leading column.
# NOTE(review): confirm downstream readers expect that index column.
dataset.to_csv('house_data_prepared')