# Data Cleaning

In [1]:
import numpy as np
import pandas as pd


In [9]:
# A small Series of strings that starts out with one NaN entry.
string_data = pd.Series(['aa', 'ar', np.nan, 'av'])
# None is treated as missing too, so position 0 now counts as null.
string_data[0] = None

# Boolean mask of the missing entries.
string_data.isna()
# Two equivalent ways of keeping only the non-missing entries.
string_data[string_data.notna()]
string_data.dropna()

# Keep the filtered Series under its own name and display it.
cleaned = string_data.dropna()
cleaned

1    ar
3    av
dtype: object

## Ways to manage missing data

In [16]:
from numpy import nan as NA

# 4x3 frame with missing values: row 2 is entirely NaN, rows 1 and 3 each
# have a single NaN, and row 0 is complete.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, 3.5],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
2,,,
3,,6.5,3.0


In [18]:
# Drop only those columns in which every value is NaN; a column with at
# least one real value is kept.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
2,,,
3,,6.5,3.0


In [25]:
# Keep only the rows that contain at least one non-NaN value.
data.dropna(axis='index', thresh=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,3.5
3,,6.5,3.0


## Replace Missing Data

In [40]:
# 7x3 frame of random normals with NaNs punched into the top of columns 1 and 2.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

# Alternatives worth trying on this frame (each returns a new frame):
#   df.dropna(thresh=2)           -> keep rows with at least two real values
#   df.fillna(0)                  -> replace every NaN with a constant
#   df.fillna({1: 0.5, 2: 0.8})   -> a different constant per column

# Forward fill. `fillna(method='ffill')` is deprecated in recent pandas, so
# the dedicated `ffill()` method is used instead. The NaNs here sit at the
# top of their columns with no earlier value to propagate, so they stay NaN.
df.ffill()


Unnamed: 0,0,1,2
0,0.407794,,
1,-0.700467,,
2,0.755187,,0.304518
3,0.609017,,-1.813547
4,0.819418,-1.406735,0.503051
5,0.831261,2.354619,0.139204
6,-0.789028,0.263668,-1.090641


In [48]:
# 6x3 frame of random normals with NaNs at the bottom of columns 1 and 2,
# so every gap has earlier values available to fill from.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA

# Forward fill: propagate the last valid value down each column.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# direct replacement and accepts the same `limit` argument.
df.ffill()
# Same, but fill at most two consecutive NaNs per gap.
df.ffill(limit=2)
# Replace each NaN with the mean of its own column (the displayed result).
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.196263,-0.247596,0.417758
1,-1.036047,0.063574,-0.312196
2,0.139902,-0.092011,0.279284
3,-0.070768,-0.092011,-1.133628
4,-0.765307,-0.092011,-0.187195
5,-1.523567,-0.092011,-0.187195


## Removing Duplicates

In [50]:
# Two-column frame whose last two rows are exact duplicates of each other.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [59]:
# Boolean Series marking each row that repeats an earlier row.
data.duplicated()
# keep='last' retains the final occurrence of each duplicated row rather
# than the first. Duplicated rows hold identical values, so only the index
# label of the surviving row differs.
data.drop_duplicates(keep='last')
# Replace a single value with NaN (or with any other value of the same dtype).
data.replace(to_replace=1, value=np.nan)
# Element-wise mapping: 1 -> 9, 2 -> 8, 3 -> 7.
data.replace(to_replace=[1, 2, 3], value=[9, 8, 7])

Unnamed: 0,k1,k2
0,one,9
1,two,9
2,one,8
3,two,7
4,one,7
5,two,4
6,two,4


## Discrete data and bins

In [66]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 42]
bins = [18, 25, 35, 60, 100]

# Assign each age to one of the half-open intervals (left, right] defined by
# consecutive bin edges.
categories = pd.cut(ages, bins)
categories
# Integer position of the bin each age fell into.
categories.codes
# The interval index describing the bins themselves.
categories.categories
# Number of ages per bin, most populated first. The top-level
# `pd.value_counts` is deprecated; wrapping the Categorical in a Series and
# calling `.value_counts()` is the documented replacement.
pd.Series(categories).value_counts()


(18, 25]     5
(35, 60]     4
(25, 35]     2
(60, 100]    1
dtype: int64

In [69]:
# right=False closes the bins on the left instead: each bin is [left, right).
pd.cut(ages, bins=[18, 26, 36, 61, 100], right=False)

# Use human-readable labels for the bins instead of interval notation.
group_names = ['Youth', 'Adult', 'MiddleAge', 'Senior']
pd.cut(ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, Adult, Youth, ..., Adult, Senior, MiddleAge, MiddleAge, MiddleAge]
Length: 12
Categories (4, object): [Youth < Adult < MiddleAge < Senior]

In [88]:
# 20 uniform samples from [0, 1).
data = np.random.rand(20)
# An integer `bins` asks for that many equal-width bins spanning the data's
# range; `precision` limits the decimals used in the bin labels.
pd.cut(data, bins=4, precision=2)

[(0.73, 0.97], (0.73, 0.97], (0.0048, 0.25], (0.25, 0.49], (0.0048, 0.25], ..., (0.0048, 0.25], (0.0048, 0.25], (0.73, 0.97], (0.73, 0.97], (0.73, 0.97]]
Length: 20
Categories (4, interval[float64]): [(0.0048, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.97]]

In [93]:
# 1000 samples from a standard normal distribution.
data = np.random.randn(1000)
# qcut bins by sample quantiles, so 4 bins means quartiles of roughly
# equal size.
cats = pd.qcut(data, 4)
cats
# Count per quartile. The top-level `pd.value_counts` is deprecated; the
# Series method is the documented replacement.
pd.Series(cats).value_counts()

(0.682, 2.934]       250
(-0.0236, 0.682]     250
(-0.628, -0.0236]    250
(-2.905, -0.628]     250
dtype: int64

In [122]:
# 1000 rows x 4 columns of standard-normal samples.
data = pd.DataFrame(np.random.randn(1000, 4))
# Summary statistics per column.
data.describe()

# Values in a single column whose magnitude exceeds 3.
col = data[2]
col[col.abs() > 3]

# Rows with at least one value whose magnitude exceeds 3. The axis is passed
# by keyword: newer pandas releases do not accept it positionally (`.any(1)`).
data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
8,1.278848,-0.999466,-0.239978,-3.151399
221,-0.485164,0.763746,-0.780542,-3.008285
340,1.124747,0.69148,3.08183,0.789188
467,0.685334,-2.02035,3.49054,0.294088
517,-3.029596,0.5332,0.151939,-0.19642
698,0.157241,3.201615,1.485395,0.077278
735,-1.498414,-0.606198,-0.64628,-3.071002
825,-3.11669,-1.158136,0.999421,-0.891285
859,-0.965512,0.427806,-0.351376,-3.048226
