## Task
Explore Pandas basics

## Notebook Summary
* Creation
 * Series - from scalar, list, NumPy array, dict
 * DataFrame - from Series, list of dicts, dict of Series, 2d NumPy array, NumPy structured array
* Access
 * Series - as array (via implicit index) & as dict (via explicit index)
 * DataFrame - as dictionary, as 2d array
* Missing Data
* Summary & descriptive statistics
* drop, mask, get/set
* Index alignment / function application

## References
* *Python for Data Analysis*, Wes McKinney, O'Reilly, 2012
* *Numerical Python*, Robert Johansson, APress, 2015
* *Python Data Science Handbook*, O'Reilly, 2016


In [2]:
# display output from all cmds just like Python shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import platform
print 'python.version = ', platform.python_version()
import IPython
print 'ipython.version =', IPython.version_info

import numpy as np
print 'numpy.version = ', np.__version__
import pandas as pd
print 'pandas.version = ', pd.__version__
from pandas import Series, DataFrame


python.version =  2.7.10
ipython.version = (5, 1, 0, '')
numpy.version =  1.11.2
pandas.version =  0.19.1


In [9]:
# Series - Creation

print 'create from scalar - item is repeated to fill the index'
Series('MyItem', index=['a','b','c','d'])

print 'create from list'
Series(['First', 'Second', 'Third', 'Fourth'], index=['a','b','c','d'])

print 'create from NumPy array'
Series(np.array(['First', 'Second', 'Third', 'Fourth']), index=['a','b','c','d'])

print 'create from dict - index has keys in sorted order'
Series({'d':'Fourth', 'a': 'First', 'c':'Third', 'b':'Second'})

print 'create from dict - with index explicitly specified; dict keys/values not in index are ignored'
Series({'d':'Fourth', 'a': 'First', 'c':'Third', 'b':'Second'}, index=['a','b'])


create from scalar - item is repeated to fill the index


a    MyItem
b    MyItem
c    MyItem
d    MyItem
dtype: object

create from list


a     First
b    Second
c     Third
d    Fourth
dtype: object

create from NumPy array


a     First
b    Second
c     Third
d    Fourth
dtype: object

create from dict - index has keys in sorted order


a     First
b    Second
c     Third
d    Fourth
dtype: object

create from dict - with index explicitly specified; dict keys/values not in index are ignored


a     First
b    Second
dtype: object

In [34]:
# DataFrame - Creation

print 'from a Series'
s = Series(['First', 'Second', 'Third', 'Fourth'], index=['a','b','c','d'])
DataFrame(s, columns=['Col1'])

print 'from a list of dicts - missing keys have NaN'
DataFrame([{'Col1': 'First', 'Col2':'Item1'}, {'Col1': 'Second', 'Col3':'Item2.2'}, {'Col1': 'Third', 'Col2':'Item3'}], index=['x','y','z'])

print 'from a dict of Series'
s1 = Series(['First', 'Second', 'Third', 'Fourth'])
s2 = Series(['Item1','Item2','Item3','Item4'])
DataFrame({'Col1': s1, 'Col2':s2})

print 'from a 2d NumPy array'
arr = np.array([['First', 'Item1'],['Second', 'Item2'], ['Third', 'Item3']])
DataFrame(arr, index=['a','b','c'], columns=['Col1', 'Col2'])

print 'from a NumPy structured array'
arr = np.zeros(3, dtype=[('Col1', 'U10'),('Col2', 'f8')])
DataFrame(arr, columns=['Col1', 'Col2'], index=['a', 'b', 'c'])


from a Series


Unnamed: 0,Col1
a,First
b,Second
c,Third
d,Fourth


from a list of dicts - missing keys have NaN


Unnamed: 0,Col1,Col2,Col3
x,First,Item1,
y,Second,,Item2.2
z,Third,Item3,


from a dict of Series


Unnamed: 0,Col1,Col2
0,First,Item1
1,Second,Item2
2,Third,Item3
3,Fourth,Item4


from a 2d NumPy array


Unnamed: 0,Col1,Col2
a,First,Item1
b,Second,Item2
c,Third,Item3


from a NumPy structured array


Unnamed: 0,Col1,Col2
a,,0.0
b,,0.0
c,,0.0


In [52]:
# Series - Access

print 'values = ', s.values, type(s.values)
print 'index = ', s.index, type(s.index)

print 'dict-like access via explicit index'
s['a':'c']
s[['c','a','d','b']] # fancy indexing with non-sequential indices
'a' in s 
s.keys()
s['x'] = 'Xth'
s

# array-like access
print 'array-like access via implicit integer index'
s[0:3] # slices
s[[2,0,3,1]] # fancy indexing
s[(s == 'First') | (s == 'Third')] # masking


values =  ['First' 'Second' 'Third' 'Fourth' 'Xth'] <type 'numpy.ndarray'>
index =  Index([u'a', u'b', u'c', u'd', u'x'], dtype='object') <class 'pandas.indexes.base.Index'>
dict-like access via explicit index


a     First
b    Second
c     Third
dtype: object

c     Third
a     First
d    Fourth
b    Second
dtype: object

True

Index([u'a', u'b', u'c', u'd', u'x'], dtype='object')

a     First
b    Second
c     Third
d    Fourth
x       Xth
dtype: object

array-like access via implicit integer index


a     First
b    Second
c     Third
dtype: object

c     Third
a     First
d    Fourth
b    Second
dtype: object

a    First
c    Third
dtype: object

In [None]:

s = Series([1,2,3,4])
s
s[s%2 == 0]
s**2

0 in s
1 in s
s.values
s.isnull()
s.notnull()

s.name = 'SeriesName'
s.index.name = 'IndexName'
s

s.index = ['aa', 'bb', 'cc', 'dd']
s


In [57]:
# DataFrame

mydict = {
    'key1' : 'val1', 
    'key2' : 'val2',
    'key3' : 'val3'
}

DataFrame(mydict, index=['a'])

print '-----'

mydict = {
    'key1' : ['val1'], 
    'key2' : ['val2'],
    'key3' : ['val3']
}

DataFrame(mydict)
df = DataFrame(mydict, columns=['key4', 'key2', 'key3', 'key1'])
df['key4']
df.key3
df.key3 = 5
df

df.key3 = Series([1,2,3], index=[0, 1, 2])
df

df['key5'] = 5
df

del df['key5']
df


df.index.name = 'MyIndexName'
df.columns.name = 'MyColumnName'
df

df.index
df.columns
df.values

df.T


Unnamed: 0,key1,key2,key3
a,val1,val2,val3


-----


Unnamed: 0,key1,key2,key3
0,val1,val2,val3


0    NaN
Name: key4, dtype: object

0    val3
Name: key3, dtype: object

Unnamed: 0,key4,key2,key3,key1
0,,val2,5,val1


Unnamed: 0,key4,key2,key3,key1
0,,val2,1,val1


Unnamed: 0,key4,key2,key3,key1,key5
0,,val2,1,val1,5


Unnamed: 0,key4,key2,key3,key1
0,,val2,1,val1


MyColumnName,key4,key2,key3,key1
MyIndexName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,val2,1,val1


RangeIndex(start=0, stop=1, step=1, name=u'MyIndexName')

Index([u'key4', u'key2', u'key3', u'key1'], dtype='object', name=u'MyColumnName')

array([[nan, 'val2', 1, 'val1']], dtype=object)

MyIndexName,0
MyColumnName,Unnamed: 1_level_1
key4,
key2,val2
key3,1
key1,val1


### Missing data - Series

NumPy
* NumPy uses sentinel values NaN and None to indicate missing values
* Since None is an object, dtype of arrays containing None will be object
* NaN is only for floating point values; there is no NaN for other data types; 
* If NaN is present, int dtype will be upcast to float
* Any operation with Nan results in NaN

### Pandas
* pandas will convert None to NaN


### Operations on null values
* `isnull`
* `notnull`
* `dropna`
* `fillna`


In [72]:
# Missing data - Series

np.array([0, 1, None, 3, None, 5])
# np.array([0, 1, None, 3, None, 5]).sum() - TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
np.array([0, 1, np.nan, 3, np.nan, 5])
_.dtype


print '-----'
s = Series([0, 1, np.nan, np.nan, np.nan, np.nan, 3, np.nan, np.nan, 4])
s

print 'isnull'
s.isnull()

print 'ffill'
s.ffill()

print 'ffill, limit=2'
s.ffill(limit=2)

print 'ffill, limit=1'
s.bfill(limit=1)

print 'dropna'
s.dropna()


array([0, 1, None, 3, None, 5], dtype=object)

array([  0.,   1.,  nan,   3.,  nan,   5.])

dtype('float64')

-----


0    0.0
1    1.0
2    NaN
3    NaN
4    NaN
5    NaN
6    3.0
7    NaN
8    NaN
9    4.0
dtype: float64

isnull


0    False
1    False
2     True
3     True
4     True
5     True
6    False
7     True
8     True
9    False
dtype: bool

ffill


0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    3.0
7    3.0
8    3.0
9    4.0
dtype: float64

ffill, limit=2


0    0.0
1    1.0
2    1.0
3    1.0
4    NaN
5    NaN
6    3.0
7    3.0
8    3.0
9    4.0
dtype: float64

ffill, limit=1


0    0.0
1    1.0
2    NaN
3    NaN
4    NaN
5    3.0
6    3.0
7    NaN
8    4.0
9    4.0
dtype: float64

dropna


0    0.0
1    1.0
6    3.0
9    4.0
dtype: float64

In [85]:
# Missing data - DataFrame

df = DataFrame(np.arange(12).reshape(4,3))
df.ix[0,1] = df.ix[1,2] = np.nan
df[3] = np.nan
df

print '\n----- dropna'
df.dropna()
df.dropna(how='all') # drop rows with all NaN
df.dropna(how='all', axis=1) # drop columns with all NaN
df.dropna(thresh=3) # keep only rows with at least 3 non-NaN

print '\n----- fillna'

df.fillna(-99)
df.mean()
df.fillna(df.mean())
df.fillna({1:-1,2:-2,3:-3}, inplace=True)
df


Unnamed: 0,0,1,2,3
0,0,,2.0,
1,3,4.0,,
2,6,7.0,8.0,
3,9,10.0,11.0,



----- dropna


Unnamed: 0,0,1,2,3


Unnamed: 0,0,1,2,3
0,0,,2.0,
1,3,4.0,,
2,6,7.0,8.0,
3,9,10.0,11.0,


Unnamed: 0,0,1,2
0,0,,2.0
1,3,4.0,
2,6,7.0,8.0
3,9,10.0,11.0


Unnamed: 0,0,1,2,3
2,6,7.0,8.0,
3,9,10.0,11.0,



----- fillna


Unnamed: 0,0,1,2,3
0,0,-99.0,2.0,-99.0
1,3,4.0,-99.0,-99.0
2,6,7.0,8.0,-99.0
3,9,10.0,11.0,-99.0


0    4.5
1    7.0
2    7.0
3    NaN
dtype: float64

Unnamed: 0,0,1,2,3
0,0,7.0,2.0,
1,3,4.0,7.0,
2,6,7.0,8.0,
3,9,10.0,11.0,


Unnamed: 0,0,1,2,3
0,0,-1.0,2.0,-3.0
1,3,4.0,-2.0,-3.0
2,6,7.0,8.0,-3.0
3,9,10.0,11.0,-3.0


Unnamed: 0,0,1,2,3
0,0,-1.0,2.0,-3.0
1,3,4.0,-2.0,-3.0
2,6,7.0,8.0,-3.0
3,9,10.0,11.0,-3.0


In [10]:
# Summary & descriptive statistics

s = Series(np.arange(4), index=['Col1','Col2','Col3','Col4'])
s
s.mean()
s.idxmax()
s.cumsum()

s.describe()

print '---'

df = DataFrame(np.arange(12).reshape(3,4), index=['Row1', 'Row2', 'Row3'], columns=['Col1','Col2','Col3','Col4'])
df

df.sum()
df.mean(axis=1)
df.idxmax()
df.idxmin()
df.cumsum()
df.cumsum(axis=1)
df.describe()


df.ix['Row1']
s
df.ix['Row1'].corr(s)

df.corr()
df.cov()
df.corrwith(s, axis=1)


Col1    0
Col2    1
Col3    2
Col4    3
dtype: int64

1.5

'Col4'

Col1    0
Col2    1
Col3    3
Col4    6
dtype: int64

count    4.000000
mean     1.500000
std      1.290994
min      0.000000
25%      0.750000
50%      1.500000
75%      2.250000
max      3.000000
dtype: float64

---


Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11


Col1    12
Col2    15
Col3    18
Col4    21
dtype: int64

Row1    1.5
Row2    5.5
Row3    9.5
dtype: float64

Col1    Row3
Col2    Row3
Col3    Row3
Col4    Row3
dtype: object

Col1    Row1
Col2    Row1
Col3    Row1
Col4    Row1
dtype: object

Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,2,3
Row2,4,6,8,10
Row3,12,15,18,21


Unnamed: 0,Col1,Col2,Col3,Col4
Row1,0,1,3,6
Row2,4,9,15,22
Row3,8,17,27,38


Unnamed: 0,Col1,Col2,Col3,Col4
count,3.0,3.0,3.0,3.0
mean,4.0,5.0,6.0,7.0
std,4.0,4.0,4.0,4.0
min,0.0,1.0,2.0,3.0
25%,2.0,3.0,4.0,5.0
50%,4.0,5.0,6.0,7.0
75%,6.0,7.0,8.0,9.0
max,8.0,9.0,10.0,11.0


Col1    0
Col2    1
Col3    2
Col4    3
Name: Row1, dtype: int64

Col1    0
Col2    1
Col3    2
Col4    3
dtype: int64

1.0

Unnamed: 0,Col1,Col2,Col3,Col4
Col1,1.0,1.0,1.0,1.0
Col2,1.0,1.0,1.0,1.0
Col3,1.0,1.0,1.0,1.0
Col4,1.0,1.0,1.0,1.0


Unnamed: 0,Col1,Col2,Col3,Col4
Col1,16.0,16.0,16.0,16.0
Col2,16.0,16.0,16.0,16.0
Col3,16.0,16.0,16.0,16.0
Col4,16.0,16.0,16.0,16.0


Row1    1.0
Row2    1.0
Row3    1.0
dtype: float64

In [86]:
# drop, mask, get/set value

df = DataFrame(np.arange(12).reshape(4,3), index=['a', 'b', 'c', 'd'], columns=['Col1', 'Col2', 'Col3'])
df
print '-----'

print 'Drop rows & columns by index'
df.drop('a')
df.drop(['a','b'])
df.drop('Col1', axis=1)
df.drop(['Col1', 'Col2', 'Col3'], axis=1)

print '-----'
df<5
df[df<5]

print '-----'
df.set_value('a', 'Col1', 99)
df.get_value('a', 'Col1')


Unnamed: 0,Col1,Col2,Col3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


-----
Drop rows & columns by index


Unnamed: 0,Col1,Col2,Col3
b,3,4,5
c,6,7,8
d,9,10,11


Unnamed: 0,Col1,Col2,Col3
c,6,7,8
d,9,10,11


Unnamed: 0,Col2,Col3
a,1,2
b,4,5
c,7,8
d,10,11


a
b
c
d


-----


Unnamed: 0,Col1,Col2,Col3
a,True,True,True
b,True,True,False
c,False,False,False
d,False,False,False


Unnamed: 0,Col1,Col2,Col3
a,0.0,1.0,2.0
b,3.0,4.0,
c,,,
d,,,


-----


Unnamed: 0,Col1,Col2,Col3
a,99,1,2
b,3,4,5
c,6,7,8
d,9,10,11


99

In [59]:
# Index alignment - Series

s1 = Series(range(3), index=['Val1','Val2','Val3'])
s1
s2 = Series([1,11,12], index=['Val1','Val11','Val12'])
s2
s1 + s2

s1.add(s2, fill_value=0)


print '---'

# Index Alignment - DataFrame

df = DataFrame(np.arange(9).reshape(3,3), index=['Row1','Row2','Row3'], columns=['Col1','Col2','Col3'])
df

df2 = DataFrame([1,2,3], index=['Row1', 'Row2', 'Row4'], columns=['Col1'])
df2

print 'add df'
df + df2
print 'add df, fill_value = 0'
df.add(df2, fill_value=0)

print '---'

df.ix['Row1']
df - df.ix['Row1']

df2.ix['Row4']
df - df2.ix['Row4']

df.sub(df['Col1'], axis=0)


Val1    0
Val2    1
Val3    2
dtype: int64

Val1      1
Val11    11
Val12    12
dtype: int64

Val1     1.0
Val11    NaN
Val12    NaN
Val2     NaN
Val3     NaN
dtype: float64

Val1      1.0
Val11    11.0
Val12    12.0
Val2      1.0
Val3      2.0
dtype: float64

---


Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,3,4,5
Row3,6,7,8


Unnamed: 0,Col1
Row1,1
Row2,2
Row4,3


add df


Unnamed: 0,Col1,Col2,Col3
Row1,1.0,,
Row2,5.0,,
Row3,,,
Row4,,,


add df, fill_value = 0


Unnamed: 0,Col1,Col2,Col3
Row1,1.0,1.0,2.0
Row2,5.0,4.0,5.0
Row3,6.0,7.0,8.0
Row4,3.0,,


---


Col1    0
Col2    1
Col3    2
Name: Row1, dtype: int64

Unnamed: 0,Col1,Col2,Col3
Row1,0,0,0
Row2,3,3,3
Row3,6,6,6


Col1    3
Name: Row4, dtype: int64

Unnamed: 0,Col1,Col2,Col3
Row1,-3.0,,
Row2,0.0,,
Row3,3.0,,


Unnamed: 0,Col1,Col2,Col3
Row1,0,1,2
Row2,0,1,2
Row3,0,1,2
