In [6]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Build a Series from a plain list; because no explicit index is given,
# a default integer index 0..N-1 is created.
# print is written in its single-argument call form so the cell runs on
# both Python 2 and Python 3 with the same output.
obj = Series([4, 7, -5, 3])
print(obj.values)
print(obj.index)

# Same values, but with explicit string labels as the index.
obj2 = Series(obj.values, index=['d', 'b', 'a', 'c'])
print(obj2)
# Positional access: use .iloc explicitly. Plain obj2[1] on a string-labelled
# index relies on a positional fallback that newer pandas deprecates.
print(obj2.iloc[1])
# Label-based access, with a list of labels or a single label.
print('printing obj2 indices b, c, a')
print(obj2[['b', 'c', 'a']])
print('printing obj2 index A {0}'.format(obj2['a']))

# NumPy-style operations (boolean filtering, scalar multiplication,
# element-wise math functions) keep the index attached to the values.
print(obj2[obj2 > 0])
print(obj2 * 2)
print(np.exp(obj2))

[ 4  7 -5  3]
Int64Index([0, 1, 2, 3], dtype='int64')
d    4
b    7
a   -5
c    3
dtype: int64
7
printing obj2 indices b, c, a
b    7
c    3
a   -5
dtype: int64
printing obj2 index A -5
d    4
b    7
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [2]:
# A Series can be thought of as a fixed-length, ordered dict: it maps index
# values to data values, so it can be used in many places that expect a dict.
# (print uses the single-argument call form so it runs on Python 2 and 3.)
print('b' in obj2)
# A dict can also be passed straight to Series; its keys become the index.
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
print(obj3)
print(obj3['Texas'])

True
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64
71000


In [3]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
print(obj4)
# No value is found in sdata for the label 'California', so it shows up as
# NaN (not a number). NaNs can be handled with fillna(), dropna(), etc.
# Check for NaNs with the top-level functions pd.isnull(obj) / pd.notnull(obj).
# The text is assembled with str() + concatenation so that the output is the
# same on Python 2 and Python 3.
print(str(pd.isnull(obj4)) + ' in static class method')
# ...or with the equivalent methods on the Series itself.
print(str(obj4.isnull()) + ' in series object method')

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool in static class method
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool in series object method


In [None]:
# Series with differently ordered indexes are aligned by label automatically
# in arithmetic operations.
obj5 = Series([20, 50, 50, 100], index=['Texas', 'California', 'Utah', 'Wyoming'])
obj6 = Series([20, 50, 50, 100], index=['Texas', 'California', 'Wyoming', 'Utah'])
# Values are matched by label, not position, so Wyoming is 100 + 50 = 150.
# (print uses the single-argument call form so it runs on Python 2 and 3.)
print(obj5 + obj6)

In [None]:
# A Series carries two independent name attributes: one on its index ...
obj5.index.name = 'State'
# ... and one on the Series object itself.
obj5.name = 'Awesomeness'
obj5

In [None]:
# Assigning to .index replaces the labels of the Series in place.
person_labels = ['Bob', 'Jeff', 'Steve', 'Mark']
obj5.index = person_labels
# The freshly assigned index can then be given a name of its own.
obj5.index.name = 'Person'
obj5

## Pandas DataFrames

In [8]:
# A DataFrame is a tabular, spreadsheet-like structure: an ordered collection
# of columns, each of which may hold a different value type. It has both a
# row index and a column index, and hierarchical indexing lets it represent
# data with more than two dimensions.
# Build one from a dict of equal-length lists (one list per column).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
# The row index is assigned automatically; in the recorded output below the
# columns come out in sorted order.
df = pd.DataFrame(data)
df

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [9]:
# Passing `columns` fixes the order in which the columns are laid out.
ordered_columns = ['year', 'state', 'pop']
DataFrame(data, columns=ordered_columns)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [12]:
# 'debt' is requested as a column but `data` holds no values for it, so the
# whole column comes back as NaN.
row_labels = ['one', 'two', 'three', 'four', 'five']
column_order = ['year', 'state', 'pop', 'debt']
frame2 = DataFrame(data, index=row_labels, columns=column_order)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [15]:
# Retrieve a column by attribute access or by dict-like notation.
# The returned Series have the same index as the DataFrame.
# The two columns are formatted into one string so the printed text is the
# same on Python 2 and Python 3 (first column, a space, newline, second column).
print('%s \n%s' % (frame2.year, frame2['state']))

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64 
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object


In [17]:
# Rows can be retrieved individually by label with df.loc (the older df.ix
# indexer has been removed from pandas).
print(frame2.loc['two'])
# A whole column can be set directly by indexing and assigning; a scalar is
# broadcast to every row. Note this modifies frame2 in place.
frame2['debt'] = 16.5
print(frame2)

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: two, dtype: object
       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5


In [19]:
# Assigning an array creates a new column; np.arange(5.) supplies one float
# value per row (0.0 through 4.0). Note this modifies frame2 in place.
frame2['moredebt'] = np.arange(5.)
# Single-argument call form of print runs on both Python 2 and Python 3.
print(frame2)

       year   state  pop  debt  moredebt
one    2000    Ohio  1.5  16.5         0
two    2001    Ohio  1.7  16.5         1
three  2002    Ohio  3.6  16.5         2
four   2001  Nevada  2.4  16.5         3
five   2002  Nevada  2.9  16.5         4


In [22]:
# Assigning a Series aligns it to the frame's index by label: rows 'two',
# 'three' and 'five' receive values, the remaining rows show NaN in the
# recorded output.
lister = Series([1, 2, 3], index=['two', 'three', 'five'])
frame2['etc'] = lister
# Single-argument call form of print runs on both Python 2 and Python 3.
print(frame2)

       year   state  pop  debt  moredebt  etc
one    2000    Ohio  1.5  16.5         0  NaN
two    2001    Ohio  1.7  16.5         1    1
three  2002    Ohio  3.6  16.5         2    2
four   2001  Nevada  2.4  16.5         3  NaN
five   2002  Nevada  2.9  16.5         4    3


In [25]:
# The column returned when indexing a DataFrame is described here as a view on
# the underlying data, not a copy, so in-place modifications to that Series
# would be reflected in the DataFrame; Series.copy() gives an independent copy.
# NOTE(review): view-vs-copy behaviour depends on the pandas version in use -
# confirm before relying on it.
# Assigning to a column name that does not exist yet creates a new column.
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)
# `del` removes a column, just as it removes a key from a dict.
del frame2['eastern']
print(frame2.columns)

       year   state  pop  debt  moredebt  etc eastern
one    2000    Ohio  1.5  16.5         0  NaN    True
two    2001    Ohio  1.7  16.5         1    1    True
three  2002    Ohio  3.6  16.5         2    2    True
four   2001  Nevada  2.4  16.5         3  NaN   False
five   2002  Nevada  2.9  16.5         4    3   False
Index([u'year', u'state', u'pop', u'debt', u'moredebt', u'etc'], dtype='object')


In [27]:
# A nested dict of dicts can be passed to DataFrame: the outer keys are
# interpreted as the columns and the inner keys as the row index.
# Nevada has no entry for 2000, so that cell is NaN.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
# Single-argument call form of print runs on both Python 2 and Python 3.
print(frame3)
# .T transposes the DataFrame (rows become columns and vice versa).
print(frame3.T)


      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6


In [34]:
# Passing a dict of Series works much the same way as a nested dict of dicts.
# The slices are positional, so .iloc is used to make that explicit.
# The intermediate dict gets its own name so that `pdata` only ever refers to
# the resulting DataFrame.
state_series = {'Ohio': frame3['Ohio'].iloc[:-1],
                'Nevada': frame3['Nevada'].iloc[:2]}
pdata = DataFrame(state_series)
# Both the row index and the column index can carry a name; they are shown
# in the printed frame.
pdata.index.name = 'year'
pdata.columns.name = 'state'
print(pdata)

state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7


In [35]:
# The `values` attribute of a DataFrame returns its data as a 2D ndarray;
# the index and column labels are not part of the returned array.
pdata.values

array([[ nan,  1.5],
       [ 2.4,  1.7]])

In [36]:
# If a frame's columns have different dtypes, the dtype of the values array is
# chosen to accommodate all of the columns (here it is `object`, because the
# frame mixes numbers and strings).
frame2.values

array([[2000, 'Ohio', 1.5, 16.5, 0, nan],
       [2001, 'Ohio', 1.7, 16.5, 1, 1.0],
       [2002, 'Ohio', 3.6, 16.5, 2, 2.0],
       [2001, 'Nevada', 2.4, 16.5, 3, nan],
       [2002, 'Nevada', 2.9, 16.5, 4, 3.0]], dtype=object)

## Index Objects

In [40]:
# pandas Index objects are responsible for holding the axis labels and other
# metadata such as the axis name(s).
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
# Single-argument call form of print runs on both Python 2 and Python 3.
print(index)
# Index objects are immutable and cannot be modified; uncommenting the next
# line demonstrates the failure:
#index['a'] = 'test'
# Immutability is what allows Index objects to be shared safely among data
# structures: the Series below holds the very same Index object it was given.
index = pd.Index(np.arange(3))
obj2 = Series([1.5, 2.5, 0], index=index)
obj2.index is index

Index([u'a', u'b', u'c'], dtype='object')


True

In [None]:
# main index objects in pandas: Index, Int64Index, MultiIndex(hierarchical index object, array of tuples)
# DatetimeIndex - stores nanosecond timestamps via NumPy datetime64 dtype
# PeriodIndex - specialized index for Period data (timespans)
# each index has methods and properties for set logic and answering other questions about data it contains
# commonly used Index methods and properties: append, diff, intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, unique

## Essential Functionality

In [41]:
# Reindexing: build a new object whose data is conformed to a new index.
# Start from a small float Series labelled 'a' through 'd'.
labels = list('abcd')
obj = Series([4.5, 7.2, 5.3, 3.6], index=labels)
obj

a    4.5
b    7.2
c    5.3
d    3.6
dtype: float64

In [46]:
# Calling reindex rearranges the data according to the new index and
# introduces missing values (NaN) for any labels that were not already present.
# (print uses the single-argument call form so it runs on Python 2 and 3.)
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)
# fill_value replaces the NaN that would otherwise be introduced for 'e'.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
print(obj2)
# method='ffill' forward-fills: each new label takes the value of the
# preceding existing label (1 <- 0, 3 <- 2, 5 <- 4).
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3 = obj3.reindex(range(6), method='ffill')
print(obj3)

a    4.5
b    7.2
c    5.3
d    3.6
e    NaN
dtype: float64
a    4.5
b    7.2
c    5.3
d    3.6
e    0.0
dtype: float64
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [50]:
# reindex can alter the row index, the columns, or both.
frame = DataFrame(np.arange(9).reshape((3, 3)),
                  index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
# Single-argument call form of print runs on both Python 2 and Python 3.
print(frame)
# Passing only a sequence reindexes the rows; the new label 'b' has no data,
# so its row is filled with NaN.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a     0      1           2
b   NaN    NaN         NaN
c     3      4           5
d     6      7           8


In [51]:
# The `columns` keyword reindexes along the column axis instead of the rows.
# 'Utah' is not part of the original data set, so its column is all NaN.
states = [
    'Texas',
    'Utah',
    'California',
]
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [55]:
# Rows and columns can both be reindexed in a single expression.
# The forward fill is applied to the row reindex only and the columns are
# selected in a second step: passing method='ffill' together with `columns`
# makes pandas try to fill along the columns too, which newer versions reject
# because the frame's column labels are not monotonic.
# (print uses the single-argument call form so it runs on Python 2 and 3.)
print(frame)
print(frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill').reindex(columns=states))

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Texas  Utah  California
a      1   NaN           2
b      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [56]:
# Reindexing rows and columns by label in one call.
# This used to be written with the .ix indexer, which has been removed from
# pandas; .loc is not a drop-in replacement here because it raises on labels
# that are missing ('b', 'Utah'), so reindex is used to get the NaN-filled
# result.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [None]:
# reindex function args
# index - new sequence to use as index
# method - interpolation or fill method (ffill, bfill)
# fill_value - substitute value to use when introducing missing data by reindexing
# limit - when forward or backfilling, maximum size gap to fill
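# level - match a simple index on one level of a MultiIndex, otherwise select a subset of it
# copy - if True (the default), always return a new object, even when the new index is equivalent to the old one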

