# Tugas Data Cleaning

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
data = pd.read_csv('travel-times.csv')

In [85]:
data = data.drop('Comments', 1)

In [86]:
#ukuran tabel
data.shape

(205, 12)

In [87]:
#5 data teratas
data.head(n=5)

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,,39.3,36.3,No
1,1/6/2012,8:20,Friday,GSK,51.63,130.3,81.8,88.9,,37.9,34.9,No
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,,37.5,35.9,No
3,1/4/2012,7:53,Wednesday,GSK,49.17,132.3,74.2,82.9,,39.8,35.6,No
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,,36.8,34.8,No


In [88]:
#5 data terbawah
data.tail(n=5)

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
200,7/18/2011,8:09,Monday,GSK,54.52,125.6,49.9,82.4,7.89,65.5,39.7,No
201,7/14/2011,8:03,Thursday,GSK,50.9,123.7,76.2,95.1,7.89,40.1,32.1,Yes
202,7/13/2011,17:08,Wednesday,Home,51.96,132.6,57.5,76.7,,54.2,40.6,Yes
203,7/12/2011,17:51,Tuesday,Home,53.28,125.8,61.6,87.6,,51.9,36.5,Yes
204,7/11/2011,16:56,Monday,Home,51.73,125.0,62.8,92.5,,49.5,33.6,Yes


In [89]:
data.describe() #descriptive statistics buat data numerik

Unnamed: 0,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime
count,205.0,205.0,205.0,205.0,186.0,205.0,205.0
mean,50.981512,127.591707,74.477561,81.97561,8.690591,41.90439,37.871707
std,1.321205,4.12845,11.409816,10.111544,0.504971,6.849476,4.835072
min,48.32,112.2,38.1,50.3,7.81,28.2,27.1
25%,50.65,124.9,68.9,76.6,8.37,38.4,35.7
50%,51.14,127.4,73.6,81.4,8.52,41.3,37.6
75%,51.63,129.8,79.9,86.0,8.97,44.4,39.9
max,60.32,140.9,107.7,112.1,10.05,82.3,62.4


In [90]:
#untuk menghitung satu kolom
data['Take407All'].value_counts()

No     164
Yes     35
Name: Take407All, dtype: int64

In [91]:
data['DayOfWeek'].value_counts()

Tuesday      48
Wednesday    47
Thursday     44
Monday       39
Friday       27
Name: DayOfWeek, dtype: int64

In [92]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 12 columns):
Date              205 non-null object
StartTime         205 non-null object
DayOfWeek         205 non-null object
GoingTo           205 non-null object
Distance          205 non-null float64
MaxSpeed          205 non-null float64
AvgSpeed          205 non-null float64
AvgMovingSpeed    205 non-null float64
FuelEconomy       186 non-null float64
TotalTime         205 non-null float64
MovingTime        205 non-null float64
Take407All        199 non-null object
dtypes: float64(7), object(5)
memory usage: 19.3+ KB


## Imputation Missing Value

In [93]:
np.sum(data.isnull()) #number of NA for each variable

Date               0
StartTime          0
DayOfWeek          0
GoingTo            0
Distance           0
MaxSpeed           0
AvgSpeed           0
AvgMovingSpeed     0
FuelEconomy       19
TotalTime          0
MovingTime         0
Take407All         6
dtype: int64

In [94]:
data.isnull().sum().sum()

25

In [96]:
data['FuelEconomy'] = data['FuelEconomy'].fillna((data['FuelEconomy'].mean()))
data['Take407All'] = data.fillna(data['Take407All'].value_counts().index[0])
data.isnull().any()

Date              False
StartTime         False
DayOfWeek         False
GoingTo           False
Distance          False
MaxSpeed          False
AvgSpeed          False
AvgMovingSpeed    False
FuelEconomy       False
TotalTime         False
MovingTime        False
Take407All        False
dtype: bool

## Encode Labels

In [97]:
from sklearn.preprocessing import LabelEncoder

In [98]:
data.head()

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,8.690591,39.3,36.3,1/6/2012
1,1/6/2012,8:20,Friday,GSK,51.63,130.3,81.8,88.9,8.690591,37.9,34.9,1/6/2012
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,8.690591,37.5,35.9,1/4/2012
3,1/4/2012,7:53,Wednesday,GSK,49.17,132.3,74.2,82.9,8.690591,39.8,35.6,1/4/2012
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,8.690591,36.8,34.8,1/3/2012


In [99]:
data['DayOfWeek'] = LabelEncoder().fit_transform(data['DayOfWeek'])
data['GoingTo'] = LabelEncoder().fit_transform(data['GoingTo'])
data['Take407All'] = LabelEncoder().fit_transform(data['Take407All'])
data.head()

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
0,1/6/2012,16:37,0,1,51.29,127.4,78.3,84.8,8.690591,39.3,36.3,3
1,1/6/2012,8:20,0,0,51.63,130.3,81.8,88.9,8.690591,37.9,34.9,3
2,1/4/2012,16:17,4,1,51.27,127.4,82.0,85.8,8.690591,37.5,35.9,2
3,1/4/2012,7:53,4,0,49.17,132.3,74.2,82.9,8.690591,39.8,35.6,2
4,1/3/2012,18:57,3,1,51.15,136.2,83.4,88.1,8.690591,36.8,34.8,1


## Standardize

In [100]:
from sklearn.preprocessing import StandardScaler

In [101]:
#standardize hanya untuk data numerik
data['Distance'] = StandardScaler().fit(data[['Distance']]).transform(data[['Distance']])
data['MaxSpeed'] = StandardScaler().fit(data[['MaxSpeed']]).transform(data[['MaxSpeed']])
data['AvgSpeed'] = StandardScaler().fit(data[['AvgSpeed']]).transform(data[['AvgSpeed']])
data['AvgMovingSpeed'] = StandardScaler().fit(data[['AvgMovingSpeed']]).transform(data[['AvgMovingSpeed']])
data['FuelEconomy'] = StandardScaler().fit(data[['FuelEconomy']]).transform(data[['FuelEconomy']])
data.head()

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
0,1/6/2012,16:37,0,1,0.234061,-0.046549,0.335833,0.280007,-3.703009e-15,39.3,36.3,3
1,1/6/2012,8:20,0,0,0.492032,0.657613,0.643338,0.686477,-3.703009e-15,37.9,34.9,3
2,1/4/2012,16:17,4,1,0.218887,-0.046549,0.660909,0.379146,-3.703009e-15,37.5,35.9,2
3,1/4/2012,7:53,4,0,-1.374462,1.143242,-0.024386,0.091643,-3.703009e-15,39.8,35.6,2
4,1/3/2012,18:57,3,1,0.127838,2.090219,0.783911,0.607166,-3.703009e-15,36.8,34.8,1


## Normalize

In [80]:
from sklearn.preprocessing import Normalizer 

In [102]:
#normalize hanya untuk data numerik
data['Distance'] = Normalizer().fit(data[['Distance']]).transform(data[['Distance']])
data['MaxSpeed'] = Normalizer().fit(data[['MaxSpeed']]).transform(data[['MaxSpeed']])
data['AvgSpeed'] = Normalizer().fit(data[['AvgSpeed']]).transform(data[['AvgSpeed']])
data['AvgMovingSpeed'] = Normalizer().fit(data[['AvgMovingSpeed']]).transform(data[['AvgMovingSpeed']])
data['FuelEconomy'] = Normalizer().fit(data[['FuelEconomy']]).transform(data[['FuelEconomy']])
data['TotalTime'] = Normalizer().fit(data[['TotalTime']]).transform(data[['TotalTime']])
data['MovingTime'] = Normalizer().fit(data[['MovingTime']]).transform(data[['MovingTime']])
data.head()

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All
0,1/6/2012,16:37,0,1,1.0,-1.0,1.0,1.0,-1.0,1.0,1.0,3
1,1/6/2012,8:20,0,0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,3
2,1/4/2012,16:17,4,1,1.0,-1.0,1.0,1.0,-1.0,1.0,1.0,2
3,1/4/2012,7:53,4,0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,2
4,1/3/2012,18:57,3,1,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1
