In [1]:
#Data Cleaning and Preparation 

In [2]:
import numpy as np 
import pandas as pd 
PREVIOUS_MAX_ROWS= pd.options.display.max_rows
pd.options.display.max_rows=20
np.random.seed(12345)

In [3]:
import matplotlib.pyplot as plt 
plt.rc('figure', figsize=(10,6))
np.set_printoptions(precision=4, suppress=True)

In [4]:
#handling Missing Data 

In [5]:
string_data=pd.Series(['aardvark','artuchoke','np.nan','avocado'])
string_data


0     aardvark
1    artuchoke
2       np.nan
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2    False
3    False
dtype: bool

In [7]:
string_data[0]=None
string_data.isnull()

0     True
1    False
2    False
3    False
dtype: bool

In [8]:
#filtering Out Missing Data
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data=pd.DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])
cleaned=data.dropna()
data


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [12]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [13]:
data[4]=NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [14]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
df=pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1]=NA
df.iloc[:2,2]=NA
df

Unnamed: 0,0,1,2
0,-0.204708,,
1,-0.55573,,
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [16]:
df.dropna()

Unnamed: 0,0,1,2
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [17]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [18]:
#filing in missing data

In [19]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [20]:
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.204708,0.5,0.0
1,-0.55573,0.5,0.0
2,0.092908,0.5,0.769023
3,1.246435,0.5,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [21]:
df=pd.DataFrame(np.random.randn(6,3))
df.iloc[2:,1]=NA
df.iloc[4:,2]=NA
df

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,,1.34381
3,-0.713544,,-2.370232
4,-1.860761,,
5,-1.265934,,


In [22]:
df.fillna(method='ffill')
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.124121,1.34381
3,-0.713544,0.124121,-2.370232
4,-1.860761,,-2.370232
5,-1.265934,,-2.370232


In [23]:
data=pd.Series([1,NA,3.5,NA,7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [24]:
#Removing Duplicates 

In [25]:
data=pd.DataFrame({'k1':['one','two']*3+['two'],'k2':[1,1,2,3,3,4,4,]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [26]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [27]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [28]:
data['v1']=range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [29]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [30]:
#transforming Data using a Lambda Function 
data=pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'], 'ounces':[4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [31]:
meat_to_animal = {'bacon': 'pig','pulled pork': 'pig','pastrami': 'cow','corned beef': 'cow','honey ham': 'pig','nova lox': 'salmon'}
#meat_to_animal
lowercased=data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [32]:
data['animal']=lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [33]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [34]:
#Replacing values inplace of missing values 

In [35]:
data=pd.Series([1.,-999.,2.,-999.,-1000.,3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [36]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [37]:
data.replace([-999,-100],np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [38]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [39]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [40]:
#renaming Axis Indexes
data=pd.DataFrame(np.arange(12).reshape((3,4)), index=['ohio','Colorado','New York'], columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [41]:
transform=lambda x:x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [42]:
data.index=data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [43]:
data.rename(index=str.title, columns=str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [44]:
data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'})
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [45]:
#binning or bucketizing for continuous variables
ages=[20,22,25,27,21,23,37,31,61,45,41,32]

In [46]:
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [47]:
cats.codes
cats.categories
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [48]:
pd.cut(ages,[18,26,36,61,100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [49]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,[18,26,36,61,100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [50]:
data=np.random.rand(20)
pd.cut(data,4,precision=2)

[(0.34, 0.55], (0.34, 0.55], (0.76, 0.97], (0.76, 0.97], (0.34, 0.55], ..., (0.34, 0.55], (0.34, 0.55], (0.55, 0.76], (0.34, 0.55], (0.12, 0.34]]
Length: 20
Categories (4, interval[float64]): [(0.12, 0.34] < (0.34, 0.55] < (0.55, 0.76] < (0.76, 0.97]]

In [51]:
data=np.random.randn(1000) #normally distributed
cats=pd.qcut(data,4) #cut into quartiles
cats

[(-0.0265, 0.62], (0.62, 3.928], (-0.68, -0.0265], (0.62, 3.928], (-0.0265, 0.62], ..., (-0.68, -0.0265], (-0.68, -0.0265], (-2.9499999999999997, -0.68], (0.62, 3.928], (-0.68, -0.0265]]
Length: 1000
Categories (4, interval[float64]): [(-2.9499999999999997, -0.68] < (-0.68, -0.0265] < (-0.0265, 0.62] < (0.62, 3.928]]

In [52]:
pd.value_counts(cats)

(0.62, 3.928]                   250
(-0.0265, 0.62]                 250
(-0.68, -0.0265]                250
(-2.9499999999999997, -0.68]    250
dtype: int64

In [53]:
pd.qcut(data,[0,0.1,0.5,0.9,1.])

[(-0.0265, 1.286], (-0.0265, 1.286], (-1.187, -0.0265], (-0.0265, 1.286], (-0.0265, 1.286], ..., (-1.187, -0.0265], (-1.187, -0.0265], (-2.9499999999999997, -1.187], (-0.0265, 1.286], (-1.187, -0.0265]]
Length: 1000
Categories (4, interval[float64]): [(-2.9499999999999997, -1.187] < (-1.187, -0.0265] < (-0.0265, 1.286] < (1.286, 3.928]]

In [54]:
#filter out the outliers

In [55]:
data=pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.049091,0.026112,-0.002544,-0.051827
std,0.996947,1.007458,0.995232,0.998311
min,-3.64586,-3.184377,-3.745356,-3.428254
25%,-0.599807,-0.612162,-0.687373,-0.747478
50%,0.047101,-0.013609,-0.022158,-0.088274
75%,0.756646,0.695298,0.699046,0.623331
max,2.653656,3.525865,2.735527,3.366626


In [56]:
col=data[2]
col[np.abs(col)>3]

41    -3.399312
136   -3.745356
Name: 2, dtype: float64

In [57]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
41,0.457246,-0.025907,-3.399312,-0.974657
60,1.951312,3.260383,0.963301,1.201206
136,0.508391,-0.196713,-3.745356,-1.520113
235,-0.242459,-3.05699,1.918403,-0.578828
258,0.682841,0.326045,0.425384,-3.428254
322,1.179227,-3.184377,1.369891,-1.074833
544,-3.548824,1.553205,-2.186301,1.277104
635,-0.578093,0.193299,1.397822,3.366626
782,-0.207434,3.525865,0.28307,0.544635
803,-3.64586,0.255475,-0.549574,-1.907459


In [58]:
data[np.abs(data)>3]=np.sign(data)+3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.060286,0.037567,0.008601,-0.045765
std,0.987306,1.005829,0.986316,0.997059
min,-2.989741,-2.970822,-2.881858,-2.969411
25%,-0.596286,-0.609704,-0.679089,-0.743886
50%,0.056212,-0.008168,-0.020077,-0.086309
75%,0.77028,0.715334,0.705783,0.624413
max,2.653656,4.0,2.735527,4.0


In [59]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,1.0,1.0,-1.0
3,-1.0,-1.0,1.0,-1.0
4,-1.0,1.0,-1.0,-1.0


In [60]:
#reshaping data
pd.options.display.max_rows=20
np.random.seed(12345)
import matplotlib.pyplot as plt 
plt.rc('figure', figsize=(10,6))
np.set_printoptions(precision=4, suppress=True)

In [61]:
data=pd.Series(np.random.randn(9), index=[['a','a','a','b','b','c','c','d','d'], [1,2,3,1,3,1,2,2,3]])
data

a  1   -0.204708
   2    0.478943
   3   -0.519439
b  1   -0.555730
   3    1.965781
c  1    1.393406
   2    0.092908
d  2    0.281746
   3    0.769023
dtype: float64

In [62]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [63]:
data['b']
data['b':'c']
data.loc[['b','d']]

b  1   -0.555730
   3    1.965781
d  2    0.281746
   3    0.769023
dtype: float64

In [64]:
data.loc[:,2]

a    0.478943
c    0.092908
d    0.281746
dtype: float64

In [65]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.204708,0.478943,-0.519439
b,-0.55573,,1.965781
c,1.393406,0.092908,
d,,0.281746,0.769023


In [66]:
data.unstack().stack()

a  1   -0.204708
   2    0.478943
   3   -0.519439
b  1   -0.555730
   3    1.965781
c  1    1.393406
   2    0.092908
d  2    0.281746
   3    0.769023
dtype: float64

In [67]:
frame=pd.DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'],[1,2,1,2]], columns=[['ohio','ohio','Colorado'],['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [68]:
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame

Unnamed: 0_level_0,state,ohio,ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [69]:
frame['ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [70]:
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,ohio,ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [71]:
frame.sort_index(level=1)
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,ohio,ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11
