## Task
Explore cleaning, transforming, merging, re-shaping series and dataframes

## Notebook summary
* Merging 
* Concatenating 
* Combining dataframes to get missing values
* Reshaping
* Pivoting
* Removing duplicates, map, replace
* Binning
* Outliers
* Permutations / random sampling
* Indicator/dummy variables
* String manipulation, regex (findall, search, match, split, sub)

## References
* *Python for Data Analysis*, Wes McKinney, O'Reilly, 2012
* *Numerical Python*, Robert Johansson, APress, 2015
* *Python Data Science Handbook*, Jake VanderPlas, O'Reilly, 2016


In [1]:
# Display the value of every top-level expression in a cell (not only the
# last one), just like the plain Python shell. The cells below rely on this
# to show several intermediate results each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Report the interpreter and library versions the notebook is running on.
# Every print() call receives exactly one pre-formatted string, so the line
# parses -- and prints the same text -- on both Python 2 and Python 3.
import platform
print('python.version =  {0}'.format(platform.python_version()))
import IPython
print('ipython.version = {0}'.format(IPython.version_info))

import numpy as np
print('numpy.version =  {0}'.format(np.__version__))
import pandas as pd
print('pandas.version =  {0}'.format(pd.__version__))
from pandas import Series, DataFrame


python.version =  2.7.10
ipython.version = (5, 1, 0, '')
numpy.version =  1.11.2
pandas.version =  0.19.1


In [10]:

# Left table: the keys 'a' and 'b' each occur twice. The frame is built from
# a single NumPy array holding both the letters and the numbers, transposed
# so that each original row becomes a column.
df1 = DataFrame(
    np.array([['a', 'b', 'c', 'a', 'b', 'd', 'f'], range(7)]).T,
    columns=['Key1', 'Val1'],
)
df1

# Right table: every key occurs exactly once.
df2 = DataFrame(
    np.array([['a', 'b', 'c', 'd', 'e'], range(5)]).T,
    columns=['Key2', 'Val2'],
)
df2

# Many-to-one joins: each left row is paired with the single matching right
# row (a many-to-many join would give the Cartesian product of the matches).
df1.merge(df2, left_on='Key1', right_on='Key2')               # inner join is the default
df1.merge(df2, how='left', left_on='Key1', right_on='Key2')   # keep every left row
df1.merge(df2, how='right', left_on='Key1', right_on='Key2')  # keep every right row
df1.merge(df2, how='outer', left_on='Key1', right_on='Key2')  # keep rows from both sides


Unnamed: 0,Key1,Val1
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,d,5
6,f,6


Unnamed: 0,Key2,Val2
0,a,0
1,b,1
2,c,2
3,d,3
4,e,4


Unnamed: 0,Key1,Val1,Key2,Val2
0,a,0,a,0
1,a,3,a,0
2,b,1,b,1
3,b,4,b,1
4,c,2,c,2
5,d,5,d,3


Unnamed: 0,Key1,Val1,Key2,Val2
0,a,0,a,0.0
1,b,1,b,1.0
2,c,2,c,2.0
3,a,3,a,0.0
4,b,4,b,1.0
5,d,5,d,3.0
6,f,6,,


Unnamed: 0,Key1,Val1,Key2,Val2
0,a,0.0,a,0
1,a,3.0,a,0
2,b,1.0,b,1
3,b,4.0,b,1
4,c,2.0,c,2
5,d,5.0,d,3
6,,,e,4


Unnamed: 0,Key1,Val1,Key2,Val2
0,a,0.0,a,0.0
1,a,3.0,a,0.0
2,b,1.0,b,1.0
3,b,4.0,b,1.0
4,c,2.0,c,2.0
5,d,5.0,d,3.0
6,f,6.0,,
7,,,e,4.0


In [11]:

# Left frame: repeated keys held in an ordinary column (default integer index).
df1 = DataFrame(np.array([['a','b','a','b','c'],range(5)]))
df1 = df1.T
df1.columns = ['Key1', 'Val1']
df1

# Right frame: the key values live in the index instead of in a column.
df2 = DataFrame([101,102], index=['a','b'], columns=['Val2'])
df2

# Match the left frame's Key1 column against the right frame's index.
pd.merge(df1, df2, left_on='Key1', right_index=True)
pd.merge(df1, df2, left_on='Key1', right_index=True, how='outer')

# join() aligns index with index. df1 is indexed 0..4 and df2 by 'a'/'b',
# so no labels match and the joined-in columns come back empty (NaN).
df1.join(df2)
df2.join(df1)

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

# join works on indexes; merge works on columns
df1 = DataFrame([[1,2],[3,4]], columns=['Key1','Val1'], index=['a','b'])
df1

df2 = DataFrame([[1,9],[5,6],[7,8]], columns=['Key2', 'Val2'], index=['a','b','c'])
df2

df1.join(df2)                # left join (default): only df1's labels survive
df1.join(df2, how='outer')   # union of both indexes
df2.join(df1)                # left join from df2's side

df3 = DataFrame([[11],[12],[13]], columns=['Val3'], index=['c','d','e'])
df3

# Passing a list joins several frames on their indexes in one call.
df1.join([df2,df3])
df1.join([df2,df3], how='outer')


Unnamed: 0,Key1,Val1
0,a,0
1,b,1
2,a,2
3,b,3
4,c,4


Unnamed: 0,Val2
a,101
b,102


Unnamed: 0,Key1,Val1,Val2
0,a,0,101
2,a,2,101
1,b,1,102
3,b,3,102


Unnamed: 0,Key1,Val1,Val2
0,a,0,101.0
2,a,2,101.0
1,b,1,102.0
3,b,3,102.0
4,c,4,


Unnamed: 0,Key1,Val1,Val2
0,a,0,
1,b,1,
2,a,2,
3,b,3,
4,c,4,


Unnamed: 0,Val2,Key1,Val1
a,101,,
b,102,,


---


Unnamed: 0,Key1,Val1
a,1,2
b,3,4


Unnamed: 0,Key2,Val2
a,1,9
b,5,6
c,7,8


Unnamed: 0,Key1,Val1,Key2,Val2
a,1,2,1,9
b,3,4,5,6


Unnamed: 0,Key1,Val1,Key2,Val2
a,1.0,2.0,1,9
b,3.0,4.0,5,6
c,,,7,8


Unnamed: 0,Key2,Val2,Key1,Val1
a,1,9,1.0,2.0
b,5,6,3.0,4.0
c,7,8,,


Unnamed: 0,Val3
c,11
d,12
e,13


Unnamed: 0,Key1,Val1,Key2,Val2,Val3
a,1,2,1,9,
b,3,4,5,6,


Unnamed: 0,Key1,Val1,Key2,Val2,Val3
a,1.0,2.0,1.0,9.0,
b,3.0,4.0,5.0,6.0,
c,,,7.0,8.0,11.0
d,,,,,12.0
e,,,,,13.0


In [12]:
# Concatenating arrays using NumPy

myarr = np.arange(6).reshape(2,3)
myarr

np.concatenate([myarr,myarr])           # default axis=0: rows are appended
np.concatenate([myarr,myarr], axis=1)   # axis=1: columns are appended

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

# pd.concat - Series

s1 = Series(range(5), index=['a','b','c','d','e'])
s2 = Series([11,12,13], index=['a','f','g'])
pd.concat([s1,s2])                      # end to end; the label 'a' appears twice

pd.concat([s1,s2], axis=1)                 # side by side, union of labels
pd.concat([s1,s2], axis=1, join='inner')   # side by side, shared labels only
# Side by side, restricted to a chosen set of labels. Each input is reindexed
# first; this replaces the `join_axes` argument, which newer pandas releases
# no longer accept, and gives the same table.
pd.concat([s1.reindex(['a','b','c']), s2.reindex(['a','b','c'])], axis=1)

# `keys` labels each input, so it carries one entry per concatenated object.
pd.concat([s1,s2], keys=['A','B'])
pd.concat([s1,s2], keys=['A','B']).unstack()

pd.concat([s1,s2], keys=['A','B'], axis=1)             # keys become the column labels
pd.concat([s1,s2], keys=['A','B'], axis=1).unstack()

print('---')

# pd.concat DataFrames

df1 = DataFrame(np.arange(6).reshape(2,3), index=['a','b'], columns=['Col1', 'Col2', 'Col3'])
df1

df2 = DataFrame(np.arange(6).reshape(2,3) + 10, index=['c','d'], columns=['Col11', 'Col12', 'Col13'])
df2


# The two frames share neither row labels nor column names, so every
# combination below is padded with NaN where one frame has no data.
pd.concat([df1,df2], axis=0)
pd.concat([df1,df2], axis=0, keys=['A','B'], names=['outer','inner'])
pd.concat([df1,df2], axis=1)
pd.concat([df1,df2], axis=1, keys=['A','B'], names=['upper','lower'])


array([[0, 1, 2],
       [3, 4, 5]])

array([[0, 1, 2],
       [3, 4, 5],
       [0, 1, 2],
       [3, 4, 5]])

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5]])

---


a     0
b     1
c     2
d     3
e     4
a    11
f    12
g    13
dtype: int64

Unnamed: 0,0,1
a,0.0,11.0
b,1.0,
c,2.0,
d,3.0,
e,4.0,
f,,12.0
g,,13.0


Unnamed: 0,0,1
a,0,11


Unnamed: 0,0,1
a,0,11.0
b,1,
c,2,


A  a     0
   b     1
   c     2
   d     3
   e     4
B  a    11
   f    12
   g    13
dtype: int64

Unnamed: 0,a,b,c,d,e,f,g
A,0.0,1.0,2.0,3.0,4.0,,
B,11.0,,,,,12.0,13.0


Unnamed: 0,A,B
a,0.0,11.0
b,1.0,
c,2.0,
d,3.0,
e,4.0,
f,,12.0
g,,13.0


A  a     0.0
   b     1.0
   c     2.0
   d     3.0
   e     4.0
   f     NaN
   g     NaN
B  a    11.0
   b     NaN
   c     NaN
   d     NaN
   e     NaN
   f    12.0
   g    13.0
dtype: float64

---


Unnamed: 0,Col1,Col2,Col3
a,0,1,2
b,3,4,5


Unnamed: 0,Col11,Col12,Col13
c,10,11,12
d,13,14,15


Unnamed: 0,Col1,Col11,Col12,Col13,Col2,Col3
a,0.0,,,,1.0,2.0
b,3.0,,,,4.0,5.0
c,,10.0,11.0,12.0,,
d,,13.0,14.0,15.0,,


Unnamed: 0_level_0,Unnamed: 1_level_0,Col1,Col11,Col12,Col13,Col2,Col3
outer,inner,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,a,0.0,,,,1.0,2.0
A,b,3.0,,,,4.0,5.0
B,c,,10.0,11.0,12.0,,
B,d,,13.0,14.0,15.0,,


Unnamed: 0,Col1,Col2,Col3,Col11,Col12,Col13
a,0.0,1.0,2.0,,,
b,3.0,4.0,5.0,,,
c,,,,10.0,11.0,12.0
d,,,,13.0,14.0,15.0


upper,A,A,A,B,B,B
lower,Col1,Col2,Col3,Col11,Col12,Col13
a,0.0,1.0,2.0,,,
b,3.0,4.0,5.0,,,
c,,,,10.0,11.0,12.0
d,,,,13.0,14.0,15.0


In [13]:
# combine_first: fill the missing entries of one object from another

s1 = Series(range(3), index=['a','b','c'])
s2 = s1.add(10)

# NaN is a float, so convert the integer Series explicitly rather than
# relying on an implicit upcast while assigning the missing value.
s1 = s1.astype(float)
s1['b'] = np.nan
s1
s2

s1.combine_first(s2)   # the hole at 'b' is filled with s2's value

# print() with a single argument behaves identically on Python 2 and 3.
print('---')
# NOTE: df1 and df2 are the frames built by the preceding concat cell; this
# cell must run after it. df1 is modified in place below.
df1
df1['Col11'] = np.nan
# Add two all-NaN rows by assigning to new labels. `.loc` is used for this
# label-based enlargement in place of the `.ix` indexer, which newer pandas
# releases no longer provide.
df1.loc['c'] = np.nan
df1.loc['d'] = np.nan
df1

# Missing cells in df1 are filled from df2 wherever df2 has the same
# row label and column name; the result covers the union of both.
df1.combine_first(df2)


a    0.0
b    NaN
c    2.0
dtype: float64

a    10
b    11
c    12
dtype: int64

a     0.0
b    11.0
c     2.0
dtype: float64

---


Unnamed: 0,Col1,Col2,Col3
a,0,1,2
b,3,4,5


Unnamed: 0,Col1,Col2,Col3,Col11
a,0.0,1.0,2.0,
b,3.0,4.0,5.0,
c,,,,
d,,,,


Unnamed: 0,Col1,Col11,Col12,Col13,Col2,Col3
a,0.0,,,,1.0,2.0
b,3.0,,,,4.0,5.0
c,,10.0,11.0,12.0,,
d,,13.0,14.0,15.0,,


In [49]:
# stacking and unstacking

# Two Series with a two-level index: letters on the first level and a
# constant 'Outer1' / 'Outer2' label on the second level.
s1 = Series(np.arange(3), index=[['a','b','c'], ['Outer1','Outer1','Outer1']])
s1.name = 'Series1'
s1

s2 = s1 + 10
s2.index = [['d','e','f'],['Outer2','Outer2','Outer2']]
s2.name = 'Series2'
s2


s3 = pd.concat([s1,s2])
s3
s3.unstack()                       # last index level becomes the columns; gaps are NaN
s3.unstack().stack()               # back to a Series; the NaN cells are dropped
s3.unstack().stack(dropna=False)   # back to a Series, keeping the NaN cells

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

# Name both axes so the levels can be referred to by name.
df = s3.unstack()
df.columns.name = 'MyCol'
df.index.name = 'MyIndex'
df
df.stack()              # columns ('MyCol') move into the row index
df.unstack('MyIndex')   # rows ('MyIndex') move under the column level


a  Outer1    0
b  Outer1    1
c  Outer1    2
Name: Series1, dtype: int64

d  Outer2    10
e  Outer2    11
f  Outer2    12
Name: Series2, dtype: int64

a  Outer1     0
b  Outer1     1
c  Outer1     2
d  Outer2    10
e  Outer2    11
f  Outer2    12
dtype: int64

Unnamed: 0,Outer1,Outer2
a,0.0,
b,1.0,
c,2.0,
d,,10.0
e,,11.0
f,,12.0


a  Outer1     0.0
b  Outer1     1.0
c  Outer1     2.0
d  Outer2    10.0
e  Outer2    11.0
f  Outer2    12.0
dtype: float64

a  Outer1     0.0
   Outer2     NaN
b  Outer1     1.0
   Outer2     NaN
c  Outer1     2.0
   Outer2     NaN
d  Outer1     NaN
   Outer2    10.0
e  Outer1     NaN
   Outer2    11.0
f  Outer1     NaN
   Outer2    12.0
dtype: float64

---


MyCol,Outer1,Outer2
MyIndex,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,
b,1.0,
c,2.0,
d,,10.0
e,,11.0
f,,12.0


MyIndex  MyCol 
a        Outer1     0.0
b        Outer1     1.0
c        Outer1     2.0
d        Outer2    10.0
e        Outer2    11.0
f        Outer2    12.0
dtype: float64

MyIndex  MyCol 
a        Outer1     0.0
b        Outer1     1.0
c        Outer1     2.0
d        Outer2    10.0
e        Outer2    11.0
f        Outer2    12.0
dtype: float64

MyCol   MyIndex
Outer1  a           0.0
        b           1.0
        c           2.0
        d           NaN
        e           NaN
        f           NaN
Outer2  a           NaN
        b           NaN
        c           NaN
        d          10.0
        e          11.0
        f          12.0
dtype: float64

In [87]:
# pivoting long to wide and vice versa

idx = pd.date_range('1/1/2016', periods=10, name='Date')
s1 = Series(np.arange(10), index=idx)
s1

s2 = Series(np.random.randn(10), index=idx)
s2

# Place the two Series side by side as named columns, then flip the table.
s3 = pd.concat([s1,s2], axis=1)
s3.columns = ['item_id', 'value']
s3
s3.T

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

# Long format: one row per (date, item_id) pair. 'item4' only has a row
# for 2016-01-03, so the wide tables below hold NaN for it on other dates.
df = DataFrame({
        'date':['2016-01-01','2016-01-01','2016-01-01','2016-01-02','2016-01-02','2016-01-02','2016-01-03','2016-01-03','2016-01-03','2016-01-03'], 
        'item_id':['item1','item2','item3','item1','item2','item3','item1','item2','item3','item4'], 
        'value1':np.random.randn(10),
        'value2':np.random.randn(10) + 10,
})
df
# pivot() arguments are passed by keyword: newer pandas releases reject the
# positional form, and keywords work on older releases too.
df.pivot(index='date', columns='item_id', values='value1')
df.pivot(index='date', columns='item_id')                      # no `values`: all remaining columns
df.pivot(index='date', columns='item_id')['value1']            # select one value column ...
df.pivot(index='date', columns='item_id')['value1']['item2']   # ... then one item

print('---')

# The same reshaping done with set_index + unstack.
df.set_index(['date', 'item_id'])
df.set_index(['date', 'item_id']).unstack()
df.set_index(['date', 'item_id']).unstack('item_id')


Date
2016-01-01    0
2016-01-02    1
2016-01-03    2
2016-01-04    3
2016-01-05    4
2016-01-06    5
2016-01-07    6
2016-01-08    7
2016-01-09    8
2016-01-10    9
Freq: D, dtype: int64

Date
2016-01-01    0.129039
2016-01-02    0.203448
2016-01-03   -0.603091
2016-01-04   -0.892362
2016-01-05   -0.408186
2016-01-06    0.257805
2016-01-07    1.347035
2016-01-08   -0.118249
2016-01-09   -0.549134
2016-01-10    0.886883
Freq: D, dtype: float64

Unnamed: 0_level_0,item_id,value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01,0,0.129039
2016-01-02,1,0.203448
2016-01-03,2,-0.603091
2016-01-04,3,-0.892362
2016-01-05,4,-0.408186
2016-01-06,5,0.257805
2016-01-07,6,1.347035
2016-01-08,7,-0.118249
2016-01-09,8,-0.549134
2016-01-10,9,0.886883


Date,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00,2016-01-07 00:00:00,2016-01-08 00:00:00,2016-01-09 00:00:00,2016-01-10 00:00:00
item_id,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
value,0.129039,0.203448,-0.603091,-0.892362,-0.408186,0.257805,1.347035,-0.118249,-0.549134,0.886883


---


Unnamed: 0,date,item_id,value1,value2
0,2016-01-01,item1,1.647467,8.711448
1,2016-01-01,item2,1.516599,9.13162
2,2016-01-01,item3,0.598081,10.590419
3,2016-01-02,item1,-0.17374,9.587855
4,2016-01-02,item2,0.820445,9.609502
5,2016-01-02,item3,-0.268965,10.913042
6,2016-01-03,item1,0.910826,9.9208
7,2016-01-03,item2,-0.11969,10.532543
8,2016-01-03,item3,2.819035,10.732147
9,2016-01-03,item4,-0.2684,9.568307


item_id,item1,item2,item3,item4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,1.647467,1.516599,0.598081,
2016-01-02,-0.17374,0.820445,-0.268965,
2016-01-03,0.910826,-0.11969,2.819035,-0.2684


Unnamed: 0_level_0,value1,value1,value1,value1,value2,value2,value2,value2
item_id,item1,item2,item3,item4,item1,item2,item3,item4
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2016-01-01,1.647467,1.516599,0.598081,,8.711448,9.13162,10.590419,
2016-01-02,-0.17374,0.820445,-0.268965,,9.587855,9.609502,10.913042,
2016-01-03,0.910826,-0.11969,2.819035,-0.2684,9.9208,10.532543,10.732147,9.568307


item_id,item1,item2,item3,item4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,1.647467,1.516599,0.598081,
2016-01-02,-0.17374,0.820445,-0.268965,
2016-01-03,0.910826,-0.11969,2.819035,-0.2684


date
2016-01-01    1.516599
2016-01-02    0.820445
2016-01-03   -0.119690
Name: item2, dtype: float64

---


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
date,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01,item1,1.647467,8.711448
2016-01-01,item2,1.516599,9.13162
2016-01-01,item3,0.598081,10.590419
2016-01-02,item1,-0.17374,9.587855
2016-01-02,item2,0.820445,9.609502
2016-01-02,item3,-0.268965,10.913042
2016-01-03,item1,0.910826,9.9208
2016-01-03,item2,-0.11969,10.532543
2016-01-03,item3,2.819035,10.732147
2016-01-03,item4,-0.2684,9.568307


Unnamed: 0_level_0,value1,value1,value1,value1,value2,value2,value2,value2
item_id,item1,item2,item3,item4,item1,item2,item3,item4
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2016-01-01,1.647467,1.516599,0.598081,,8.711448,9.13162,10.590419,
2016-01-02,-0.17374,0.820445,-0.268965,,9.587855,9.609502,10.913042,
2016-01-03,0.910826,-0.11969,2.819035,-0.2684,9.9208,10.532543,10.732147,9.568307


Unnamed: 0_level_0,value1,value1,value1,value1,value2,value2,value2,value2
item_id,item1,item2,item3,item4,item1,item2,item3,item4
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2016-01-01,1.647467,1.516599,0.598081,,8.711448,9.13162,10.590419,
2016-01-02,-0.17374,0.820445,-0.268965,,9.587855,9.609502,10.913042,
2016-01-03,0.910826,-0.11969,2.819035,-0.2684,9.9208,10.532543,10.732147,9.568307


In [107]:
# removing duplicates - Series

s = Series([1,1,2,2,3,3,4,4])
s

s.duplicated()          # True for every repeat of an earlier value
s[~s.duplicated()]      # keep first occurrences via the boolean mask
s.drop_duplicates()     # same result in a single call

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

# removing duplicates - DataFrame
df = DataFrame({
        'Col1': [1,1,1,2,2,2,3,3,3],
        'Col2': ['a','a','b','b','b','c','c','c','d']
    })
df
df.duplicated()         # a row is a duplicate only if ALL columns repeat
df.drop_duplicates()

# Same key columns plus a random Col3. Only `df2` is bound here, so the
# two-column `df` above is not overwritten as a side effect.
df2 = DataFrame({
        'Col1': [1,1,1,2,2,2,3,3,3],
        'Col2': ['a','a','b','b','b','c','c','c','d'],
        'Col3': np.random.randn(9)
    })
df2
df2.duplicated()                                    # all columns are compared, Col3 included
df2.duplicated(['Col1','Col2'])                     # compare the key columns only
df2.drop_duplicates(['Col1','Col2'], keep='last')   # keep the last row of each key pair


0    1
1    1
2    2
3    2
4    3
5    3
6    4
7    4
dtype: int64

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
dtype: bool

0    1
2    2
4    3
6    4
dtype: int64

0    1
2    2
4    3
6    4
dtype: int64

---


Unnamed: 0,Col1,Col2
0,1,a
1,1,a
2,1,b
3,2,b
4,2,b
5,2,c
6,3,c
7,3,c
8,3,d


0    False
1     True
2    False
3    False
4     True
5    False
6    False
7     True
8    False
dtype: bool

Unnamed: 0,Col1,Col2
0,1,a
2,1,b
3,2,b
5,2,c
6,3,c
8,3,d


Unnamed: 0,Col1,Col2,Col3
0,1,a,0.383914
1,1,a,0.119964
2,1,b,0.671659
3,2,b,0.463517
4,2,b,-0.128094
5,2,c,-0.161958
6,3,c,0.311371
7,3,c,1.083029
8,3,d,-0.488031


0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

0    False
1     True
2    False
3    False
4     True
5    False
6    False
7     True
8    False
dtype: bool

Unnamed: 0,Col1,Col2,Col3
1,1,a,0.119964
2,1,b,0.671659
4,2,b,-0.128094
5,2,c,-0.161958
7,3,c,1.083029
8,3,d,-0.488031


In [113]:
# map: derive a new column by translating each value of an existing one

df = DataFrame({
    'SubItem': 'Sub1.1 Sub1.2 Sub1.3 Sub2.1 Sub2.2 Sub2.3 Sub3.1 Sub3.2 Sub3.3'.split(),
    'Item': 'Item1 Item1 Item1 Item2 Item2 Item2 Item3 Item3 Item3'.split(),
    })
df

# Lookup table: item name -> category name.
mapping = dict(Item1='Category1', Item2='Category2', Item3='Category3')
mapping

# Passing the dict itself to map() translates every element.
df['Category'] = df.Item.map(mapping)
df

# A callable works as well; here it is the dict's own item lookup.
df.Item.map(mapping.__getitem__)


Unnamed: 0,Item,SubItem
0,Item1,Sub1.1
1,Item1,Sub1.2
2,Item1,Sub1.3
3,Item2,Sub2.1
4,Item2,Sub2.2
5,Item2,Sub2.3
6,Item3,Sub3.1
7,Item3,Sub3.2
8,Item3,Sub3.3


{'Item1': 'Category1', 'Item2': 'Category2', 'Item3': 'Category3'}

Unnamed: 0,Item,SubItem,Category
0,Item1,Sub1.1,Category1
1,Item1,Sub1.2,Category1
2,Item1,Sub1.3,Category1
3,Item2,Sub2.1,Category2
4,Item2,Sub2.2,Category2
5,Item2,Sub2.3,Category2
6,Item3,Sub3.1,Category3
7,Item3,Sub3.2,Category3
8,Item3,Sub3.3,Category3


0    Category1
1    Category1
2    Category1
3    Category2
4    Category2
5    Category2
6    Category3
7    Category3
8    Category3
Name: Item, dtype: object

In [141]:
# replace

s = Series(['a','b','blank','Blank','e','BLANK','f'])
s

# Exact matches only: 'Blank' (capital B only) is untouched by the first two.
s.replace(['blank','BLANK'], [np.nan, -999])            # parallel lists: old -> new
s.replace({'blank':np.nan, 'BLANK':-999})               # the same mapping as a dict
s.replace('[Bb][Ll][Aa][Nn][Kk]', np.nan, regex=True)   # pattern matches any capitalisation

# print() with a single argument behaves identically on Python 2 and 3.
print('---')

df = DataFrame({
        'Col1': np.arange(5),
        'Col2': ['a','blank','c','BLANK','e']
}, index = ['a','b','c','d','e'])
df.index.name = 'MyIdx'
df.columns.name = 'MyCols'
df
df.replace(['blank','BLANK'], [np.nan, -999])
df.replace('[Bb][Ll][Aa][Nn][Kk]', np.nan, regex=True)


# modify index

# Replace both axes with upper-cased labels; the axis names are assigned
# again afterwards so that they are present on the new axes.
df.index = df.index.map(str.upper)
df.index.name = 'MyIdx'
df.columns = df.columns.map(str.upper)
df.columns.name = 'MyCols'
df

print('---')

# rename() returns a modified copy: `df` itself is unchanged, as the plain
# `df` display after these two calls shows.
df.rename(index=str.upper, columns=str.lower)
df.rename(index={'A':'XYZ'}, columns={'COL2':'Column2'})
df

# With inplace=True the same renaming is applied to `df` directly.
df.rename(index={'A':'XYZ'}, columns={'COL2':'Column2'}, inplace=True)
df


0        a
1        b
2    blank
3    Blank
4        e
5    BLANK
6        f
dtype: object

0        a
1        b
2      NaN
3    Blank
4        e
5     -999
6        f
dtype: object

0        a
1        b
2      NaN
3    Blank
4        e
5     -999
6        f
dtype: object

0      a
1      b
2    NaN
3    NaN
4      e
5    NaN
6      f
dtype: object

---


MyCols,Col1,Col2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,a
b,1,blank
c,2,c
d,3,BLANK
e,4,e


MyCols,Col1,Col2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,a
b,1,
c,2,c
d,3,-999
e,4,e


MyCols,Col1,Col2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,a
b,1,
c,2,c
d,3,
e,4,e


MyCols,COL1,COL2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,a
B,1,blank
C,2,c
D,3,BLANK
E,4,e


---


MyCols,col1,col2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,a
B,1,blank
C,2,c
D,3,BLANK
E,4,e


MyCols,COL1,Column2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
XYZ,0,a
B,1,blank
C,2,c
D,3,BLANK
E,4,e


MyCols,COL1,COL2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,a
B,1,blank
C,2,c
D,3,BLANK
E,4,e


MyCols,COL1,Column2
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1
XYZ,0,a
B,1,blank
C,2,c
D,3,BLANK
E,4,e


In [166]:
# Binning

data = np.arange(50)
bin_defs = [0,18,25,35,50]
# Explicit bin edges with a label per interval. The first element of `data`
# (the value 0) ends up in no bin: it is shown as NaN / code -1 in the
# displays of `bins` and `bins.codes`.
bins = pd.cut(data, bin_defs, labels=['Group1','Group2','Group3','Group4'])
bins
bins.describe()

bins.codes
# Count members per bin through the Series method; the top-level
# pd.value_counts() function is deprecated in newer pandas releases.
Series(bins).value_counts()

# An integer instead of edges: that many equal-width bins.
bins = pd.cut(data,4)
Series(bins).value_counts()
bins

# qcut: that many quantile-based bins.
bins = pd.qcut(data, 4)
Series(bins).value_counts()
bins


[NaN, Group1, Group1, Group1, Group1, ..., Group4, Group4, Group4, Group4, Group4]
Length: 50
Categories (4, object): [Group1 < Group2 < Group3 < Group4]

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Group1,18,0.36
Group2,7,0.14
Group3,10,0.2
Group4,14,0.28
,1,0.02


array([-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3], dtype=int8)

Group1    18
Group4    14
Group3    10
Group2     7
dtype: int64

(36.75, 49]        13
(-0.049, 12.25]    13
(24.5, 36.75]      12
(12.25, 24.5]      12
dtype: int64

[(-0.049, 12.25], (-0.049, 12.25], (-0.049, 12.25], (-0.049, 12.25], (-0.049, 12.25], ..., (36.75, 49], (36.75, 49], (36.75, 49], (36.75, 49], (36.75, 49]]
Length: 50
Categories (4, object): [(-0.049, 12.25] < (12.25, 24.5] < (24.5, 36.75] < (36.75, 49]]

(36.75, 49]      13
[0, 12.25]       13
(24.5, 36.75]    12
(12.25, 24.5]    12
dtype: int64

[[0, 12.25], [0, 12.25], [0, 12.25], [0, 12.25], [0, 12.25], ..., (36.75, 49], (36.75, 49], (36.75, 49], (36.75, 49], (36.75, 49]]
Length: 50
Categories (4, object): [[0, 12.25] < (12.25, 24.5] < (24.5, 36.75] < (36.75, 49]]

In [182]:
# Outliers: cap every value whose magnitude exceeds 1.5

df = DataFrame(np.random.randn(10, 4)).round(2)
df
df.describe()
# Overwrite the out-of-range cells with +/-1.5, keeping each cell's sign.
df[df.abs() > 1.5] = 1.5 * np.sign(df)
df.describe()


Unnamed: 0,0,1,2,3
0,-0.08,-1.63,1.43,-0.04
1,-1.02,-1.16,0.12,-0.31
2,-1.26,0.99,-1.25,0.42
3,-0.97,1.07,-0.21,0.58
4,-1.14,0.05,-0.61,0.19
5,1.0,0.44,0.28,1.22
6,-0.16,-0.31,-1.37,0.65
7,-0.13,0.0,0.2,0.56
8,-1.75,-0.13,-1.98,-0.84
9,1.83,-0.03,-0.45,-0.58


Unnamed: 0,0,1,2,3
count,10.0,10.0,10.0,10.0
mean,-0.368,-0.071,-0.384,0.185
std,1.104262,0.842423,0.982154,0.62989
min,-1.75,-1.63,-1.98,-0.84
25%,-1.11,-0.265,-1.09,-0.2425
50%,-0.565,-0.015,-0.33,0.305
75%,-0.0925,0.3425,0.18,0.575
max,1.83,1.07,1.43,1.22


Unnamed: 0,0,1,2,3
count,10.0,10.0,10.0,10.0
mean,-0.376,-0.058,-0.336,0.185
std,1.000202,0.81629,0.904117,0.62989
min,-1.5,-1.5,-1.5,-0.84
25%,-1.11,-0.265,-1.09,-0.2425
50%,-0.565,-0.015,-0.33,0.305
75%,-0.0925,0.3425,0.18,0.575
max,1.5,1.07,1.43,1.22


In [201]:
# Permutation, random sampling

df = DataFrame(np.random.rand(5,3))
df = df.round(2)
df

# A random ordering of the row positions 0 .. len(df)-1.
ridx = np.random.permutation(df.shape[0])
ridx

# randomly sample 3 rows without replacement
# (.loc selects by label; with the default 0..n-1 index the labels equal the
#  positions, so it picks the same rows as the positional take() below)
df.loc[ridx[:3]]
df.take(ridx[:3])

# sample 10 rows with replacement
df
df.shape
len(df)

# randint() excludes its upper bound, so the bound has to be len(df) -- not
# len(df) - 1 -- for the last row to be eligible as well.
ridx = np.random.randint(0, len(df), 10)
ridx
df.take(ridx)


Unnamed: 0,0,1,2
0,0.3,0.03,0.23
1,0.92,0.75,0.93
2,0.33,0.84,0.46
3,0.24,0.68,0.86
4,0.05,0.1,0.82


array([3, 0, 4, 1, 2])

Unnamed: 0,0,1,2
3,0.24,0.68,0.86
0,0.3,0.03,0.23
4,0.05,0.1,0.82


Unnamed: 0,0,1,2
3,0.24,0.68,0.86
0,0.3,0.03,0.23
4,0.05,0.1,0.82


Unnamed: 0,0,1,2
0,0.3,0.03,0.23
1,0.92,0.75,0.93
2,0.33,0.84,0.46
3,0.24,0.68,0.86
4,0.05,0.1,0.82


(5, 3)

5

array([2, 0, 1, 3, 1, 3, 2, 0, 2, 3])

Unnamed: 0,0,1,2
2,0.33,0.84,0.46
0,0.3,0.03,0.23
1,0.92,0.75,0.93
3,0.24,0.68,0.86
1,0.92,0.75,0.93
3,0.24,0.68,0.86
2,0.33,0.84,0.46
0,0.3,0.03,0.23
2,0.33,0.84,0.46
3,0.24,0.68,0.86


In [218]:
# Indicator variables - get_dummies()

df = DataFrame({
        'item': ['a','b','c','a','b','c'],
        'value': np.arange(6)
    })
df

pd.get_dummies(df)                          # 'item' is expanded, 'value' is passed through
pd.get_dummies(df['item'], prefix='item')   # one column only, with an explicit prefix

# Put the indicator columns next to the untouched numeric column.
df[['value']].join(pd.get_dummies(df['item'], prefix='item'))


# print() with a single argument behaves identically on Python 2 and 3.
print('---')

val = np.random.randn(20)
val = np.round(val,2)
val

# Bin the continuous values first, then build one indicator column per bin.
bins = [-10,0,0.25,0.5,0.75,1,10]
b = pd.cut(val, bins)
pd.get_dummies(b)


Unnamed: 0,item,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5


Unnamed: 0,value,item_a,item_b,item_c
0,0,1,0,0
1,1,0,1,0
2,2,0,0,1
3,3,1,0,0
4,4,0,1,0
5,5,0,0,1


Unnamed: 0,item_a,item_b,item_c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0
5,0,0,1


Unnamed: 0,value,item_a,item_b,item_c
0,0,1,0,0
1,1,0,1,0
2,2,0,0,1
3,3,1,0,0
4,4,0,1,0
5,5,0,0,1


---


array([ 0.1 , -0.76, -1.74,  1.65,  0.79,  0.16, -0.81,  0.57, -1.37,
        1.52,  0.34,  0.47,  0.33, -0.68,  0.58,  1.59,  0.7 ,  0.72,
        0.58, -1.75])

Unnamed: 0,"(-10, 0]","(0, 0.25]","(0.25, 0.5]","(0.5, 0.75]","(0.75, 1]","(1, 10]"
0,0,1,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,0,0,0,0,0,1
4,0,0,0,0,1,0
5,0,1,0,0,0,0
6,1,0,0,0,0,0
7,0,0,0,1,0,0
8,1,0,0,0,0,0
9,0,0,0,0,0,1


In [236]:
# string manipulation

mystr = 'a , b , c , d , e '
# Cut at the commas, then trim the padding around every piece.
fields = list(map(str.strip, mystr.split(',')))
type(fields)
fields

# Glue the pieces back together with a different separator.
'|'.join(fields)

# Looking for a substring that is present ...
'a' in mystr        # membership test
mystr.find('a')     # position of the first match
mystr.index('a')    # same position for a present substring
mystr.count('a')    # number of occurrences

# ... and for one that is absent.
'x' in mystr
mystr.find('x')     # -1 signals "not found"
# mystr.index('x') - will throw ValueError
mystr.count('x')

# Substitution; replacing with an empty string deletes the matches.
mystr.replace(',', '|')
mystr.replace(' , ','')


list

['a', 'b', 'c', 'd', 'e']

'a|b|c|d|e'

True

0

0

1

False

-1

0

'a | b | c | d | e '

'abcde '

In [None]:
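# RegEx
# TODO: this section is a placeholder; no examples have been written yet.
# Topics still to be covered:

# findall, finditer
# search
# match
# split
# sub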
