# Data Cleaning and Preparation

# 7.1 Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [22]:
# Small Series of strings with one NaN entry, used below to show how pandas
# represents and detects missing values.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [23]:
# A one-element Series holding Python's None.
s1 = pd.Series([None])
# Append s1 to string_data. Each input keeps its own index labels, so the
# label 0 occurs twice in s2.
s2 = pd.concat([string_data, s1])

In [24]:
s2

0     aardvark
1    artichoke
2          NaN
3      avocado
0         None
dtype: object

In [25]:
type(s2)

pandas.core.series.Series

In [26]:
# Boolean mask of missing entries: both NaN and None are reported as True.
s2.isna()

0    False
1    False
2     True
3    False
0     True
dtype: bool

# Filtering Out Missing Data

In [27]:
# NA is a short alias for NumPy's NaN, used throughout the cells below.
from numpy import nan as NA
# Numeric Series with NaN at positions 1 and 3.
data = pd.Series([1,NA,3.5,NA,7])
# dropna() returns a new Series without the NaN entries; `data` itself is
# left unchanged and the surviving values keep their original labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [28]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [30]:
# Boolean-mask equivalent of data.dropna(): keep only the non-missing entries.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [32]:
# Rows of unequal length: the first row has four values, the others three.
# pandas pads the short rows with NaN, giving a 4-column frame.
rows = [
    [1, 6, 5, 3.],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3],
]
data = pd.DataFrame(rows)
data

Unnamed: 0,0,1,2,3
0,1.0,6.0,5.0,3.0
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [34]:
# how='all' drops only the rows in which every value is NaN (row 2 here);
# rows that are merely partially missing are kept.
cleaned = data.dropna(how='all')
cleaned

Unnamed: 0,0,1,2,3
0,1.0,6.0,5.0,3.0
1,1.0,,,
3,,6.5,3.0,


In [35]:
# Add a new column labelled 4 that is NaN in every row (mutates `data` in place).
data[4] = NA

In [36]:
data

Unnamed: 0,0,1,2,3,4
0,1.0,6.0,5.0,3.0,
1,1.0,,,,
2,,,,,
3,,6.5,3.0,,


In [37]:
# Same idea column-wise: drop only the columns that are NaN in every row.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2,3
0,1.0,6.0,5.0,3.0
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [39]:
# Seed NumPy's global generator so this random frame, and every output
# derived from it, is identical on each Restart & Run All.
np.random.seed(12345)
# 7x3 frame of standard-normal draws used for the dropna/fillna examples.
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,1.72581,0.766501,0.310638
1,-0.239889,0.351687,0.478007
2,-0.438432,0.367812,-1.002714
3,0.055252,-0.272082,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [40]:
# Overwrite rows 0-3 of column 1 with NaN (in-place mutation of df).
df.iloc[:4,1]= NA

In [42]:
# Overwrite rows 0-1 of column 2 with NaN (in-place mutation of df).
df.iloc[:2,2] = NA

In [43]:
df

Unnamed: 0,0,1,2
0,1.72581,,
1,-0.239889,,
2,-0.438432,,-1.002714
3,0.055252,,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [44]:
# With no arguments, dropna() removes every row containing at least one NaN,
# leaving only the fully populated rows.
df.dropna()

Unnamed: 0,0,1,2
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [45]:
# thresh=2 keeps a row only if it has at least two non-NaN values, so rows
# 0 and 1 (one valid value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.438432,,-1.002714
3,0.055252,,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


# Filling in Missing Data

In [46]:
df

Unnamed: 0,0,1,2
0,1.72581,,
1,-0.239889,,
2,-0.438432,,-1.002714
3,0.055252,,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [47]:
# Replace every NaN with 0; this returns a new frame and leaves df unchanged.
df.fillna(0)

Unnamed: 0,0,1,2
0,1.72581,0.0,0.0
1,-0.239889,0.0,0.0
2,-0.438432,0.0,-1.002714
3,0.055252,0.0,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [48]:
# A dict maps column label -> fill value: NaNs in column 1 become 0.5 and
# NaNs in column 2 become 22.
df.fillna({1:0.5, 2:22})

Unnamed: 0,0,1,2
0,1.72581,0.5,22.0
1,-0.239889,0.5,22.0
2,-0.438432,0.5,-1.002714
3,0.055252,0.5,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [52]:
# inplace=True modifies df directly instead of returning a filled copy; the
# call's return value is captured in `a` so it can be inspected below
# (the recorded output shows it is None).
a = df.fillna(0, inplace=True)

In [53]:
df

Unnamed: 0,0,1,2
0,1.72581,0.0,0.0
1,-0.239889,0.0,0.0
2,-0.438432,0.0,-1.002714
3,0.055252,0.0,0.807073
4,-1.301827,1.184065,0.727327
5,-1.367738,1.198419,0.258431
6,-0.477807,-0.60525,1.316633


In [55]:
print(a)

None


In [56]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
df = pd.DataFrame(np.random.randn(6, 3))
# Blank out the tail of two columns: rows 2-5 of column 1, rows 4-5 of column 2.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.854669,0.954529,0.585362
1,1.301801,0.022586,2.949496
2,-0.386537,,-1.884663
3,0.803706,,0.705416
4,0.869107,,
5,0.084765,,


In [57]:
# Forward-fill: each NaN takes the last valid value above it in the same
# column. DataFrame.ffill() is the supported spelling; the `method` keyword of
# fillna() is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.854669,0.954529,0.585362
1,1.301801,0.022586,2.949496
2,-0.386537,0.022586,-1.884663
3,0.803706,0.022586,0.705416
4,0.869107,0.022586,0.705416
5,0.084765,0.022586,0.705416


In [58]:
df


Unnamed: 0,0,1,2
0,-0.854669,0.954529,0.585362
1,1.301801,0.022586,2.949496
2,-0.386537,,-1.884663
3,0.803706,,0.705416
4,0.869107,,
5,0.084765,,


In [59]:
# Forward-fill, but fill at most two consecutive NaNs per gap; any further
# NaNs in the same run stay missing. Uses DataFrame.ffill() because the
# `method` keyword of fillna() is deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.854669,0.954529,0.585362
1,1.301801,0.022586,2.949496
2,-0.386537,0.022586,-1.884663
3,0.803706,0.022586,0.705416
4,0.869107,,0.705416
5,0.084765,,0.705416


In [60]:
# Rebind `data` to a float Series with NaN at positions 1 and 3
# (it previously referred to a DataFrame).
data = pd.Series([1., NA, 3.5, NA, 7])
data


0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [61]:
# Fill the gaps with the Series mean. mean() ignores the NaN entries, so the
# fill value is (1 + 3.5 + 7) / 3 = 3.833333.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 7.2 Data Transformation (p. 215)