# Cleaning and Preparing Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan as NA
from pandas import DataFrame, Series

In [9]:
# Start from a Series of strings that already contains one NaN, then overwrite
# the first entry with None: pandas treats both NaN and None as missing.
string_data = pd.Series(['aa', 'ar', np.nan, 'av'])
string_data[0] = None

# Boolean mask flagging the missing entries (evaluated but not shown, because
# a cell only echoes its final expression).
string_data.isnull()

# dropna() returns a new Series without the missing entries; string_data
# itself still holds its None/NaN values.
cleaned = string_data.dropna()
cleaned

1    ar
3    av
dtype: object

### Other Ways to Manage Missing Data

In [24]:
# Small 4x3 frame used by the dropna examples below: one complete row, two
# partially missing rows, and one row that is entirely missing.
# NA is numpy's nan, imported with the other imports at the top of the notebook.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, 3.5],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
2,,,
3,,6.5,3.0


In [25]:
# For dropna, axis=0 removes rows and axis=1 removes columns.
# how='all' drops a row only when every value in that row is missing
# (NaN or similar), so partially filled rows are kept.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
3,,6.5,3.0


In [36]:
# thresh sets the minimum number of non-missing values a row must have to be
# kept; rows with fewer are dropped.
data.dropna(thresh=3) # keeps only the rows with at least three non-missing values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


### Replacing Missing Data

In [70]:
# Seeded generator so the random frame, and therefore the displayed result,
# is identical on every Restart & Run All.
rng = np.random.default_rng(70)
df = pd.DataFrame(rng.standard_normal((7, 3)))

# iloc slices are positional, zero-based, and exclude the stop index.
df.iloc[1:4, 1] = NA  # rows 1, 2 and 3 of column 1
df.iloc[:2, 2] = NA   # rows 0 and 1 of column 2

# Keep rows with at least two non-missing values. The result is neither
# assigned nor displayed, so df is unchanged by this line.
df.dropna(thresh=2)
# df.fillna(0, inplace=True) # persistent: modifies df itself
# df.fillna({1:0.5, 2:0.8}) # fill values for specific columns

# Forward fill: each gap takes the last valid value above it in the same
# column. Leading gaps (column 2, rows 0-1) have nothing above and stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill()

Unnamed: 0,0,1,2
0,1.254543,-0.086288,
1,-1.379945,-0.086288,
2,0.400708,-0.086288,1.158781
3,0.285836,-0.086288,-0.416592
4,0.089175,0.314512,-0.871684
5,0.059658,0.526913,0.361291
6,-1.634001,-0.477401,1.150026


In [78]:
# Seeded generator so the random frame is reproducible across runs.
rng = np.random.default_rng(78)
df = pd.DataFrame(rng.standard_normal((7, 3)))

# iloc selects by integer position; loc selects by index/column label.
df.iloc[2:, 1] = NA  # column 1: rows 2 through 6 missing
df.iloc[4:, 2] = NA  # column 2: rows 4 through 6 missing

# Forward fill, but carry each valid value across at most two consecutive
# gaps. The result is not assigned or displayed, so df is unchanged.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill').
df.ffill(limit=2)

# Fill every gap with the mean of its own column; df.mean() ignores NaN when
# computing the per-column means.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,-0.614914,0.918626,0.029601
1,-0.188387,-0.448434,-0.023362
2,-0.414601,0.235096,0.599397
3,-0.337621,0.235096,0.409499
4,2.669036,0.235096,0.253784
5,-1.8226,0.235096,0.253784
6,-0.745447,0.235096,0.253784


In [80]:
# remove duplicates
data =pd.DataFrame({'k1':['one','two']*3 +['two'],
                    'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [88]:
# Every call below returns a new object and leaves `data` untouched; only the
# final expression is displayed by the cell.

# Boolean Series marking rows that repeat an earlier row.
data.duplicated()

# Remove duplicate rows, keeping the last occurrence of each.
data.drop_duplicates(keep='last')

# Replace a single value with NaN.
data.replace(to_replace=1, value=np.nan)

# Replace several values at once: 1 -> 9, 2 -> 8, 3 -> 7.
data.replace(to_replace=[1, 2, 3], value=[9, 8, 7])

Unnamed: 0,k1,k2
0,one,9
1,two,9
2,one,8
3,two,7
4,one,7
5,two,4
6,two,4


### Discrete Data and Bins

In [93]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 42]
bins = [18, 25, 35, 60, 100]

# pd.cut assigns each age to the interval it falls into. By default the
# intervals are open on the left and closed on the right, e.g. (18, 25].
categories = pd.cut(ages, bins)
categories.codes        # bin position of each age: 0, 1, 2 or 3
categories.categories   # the intervals themselves

# Count how many ages fall into each bin, most populated bin first.
# The top-level pd.value_counts() helper is deprecated, so wrap the
# Categorical in a Series and use Series.value_counts() instead.
bin_counts = pd.Series(categories).value_counts()
bin_counts

(18, 25]     5
(35, 60]     4
(25, 35]     2
(60, 100]    1
dtype: int64

In [98]:
# right=False flips which edge is inclusive: bins become [18, 22), [22, 36), ...
# so the left edge belongs to the bin and the right edge does not.
# (Result is not assigned or displayed.)
pd.cut(ages, [18, 22, 36, 61, 100], right=False)

# Label the bins with names instead of interval notation; labels are matched
# to the bins in order, one label per bin.
group_names = ['Youth', 'Adult', 'MiddleAge', 'Senior']
pd.cut(ages, bins=bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'Adult', 'Youth', ..., 'Adult', 'Senior', 'MiddleAge', 'MiddleAge', 'MiddleAge']
Length: 12
Categories (4, object): ['Youth' < 'Adult' < 'MiddleAge' < 'Senior']

### Grouping Data

In [112]:
# Group the rows by their k1 label and count the non-null k2 entries per group.
data.groupby('k1').count()

Unnamed: 0_level_0,k2
k1,Unnamed: 1_level_1
one,3
two,4


In [108]:
# Display the frame as-is: none of the earlier results were assigned back to
# `data`, so it still holds its original values.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4
