## Pandas Arithmetic and Data Alignment
### Cory Nichols - MSDS Data Mining

In [1]:
import pandas as pd
from pandas import Series
from pandas import DataFrame
import numpy as np

In [5]:
# lets take two series
# behavior of arithmetic between objects with different indexes
# when adding together objects, if any index pairs are not the same, the respective index in result will be union of
# index pairs
s1 = Series([7.3, -2.5, 3.4, 1.5], index = list('acde'))
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index = list('acefg'))
print s1+s2
# internal data alignment introduces NA values in indices that dont overlap

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


In [7]:
# in the case of a dataframe, alignment is performed on both rows and columns
df1 = DataFrame(np.arange(9.).reshape((3,3)), columns = list('bcd'), index = ['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'), index = ['Utah','Ohio','Texas','Oregon'])
print df1 +df2

           b   c   d   e
Colorado NaN NaN NaN NaN
Ohio       3 NaN   6 NaN
Oregon   NaN NaN NaN NaN
Texas      9 NaN  12 NaN
Utah     NaN NaN NaN NaN


In [29]:
# fill differently indexed object with values, like 0 when an axis label is found in one but not other
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)), columns = list('abcde'))
df1 + df2
# instead, to pass a default value where indices dont align, use .add() method
df1.add(df2, fill_value = 0)
# fill value 0 will take sequential positioning to fill in nans with current position of iteration
# flexible arithmetic methods:
# add
# sub
# div
# mul

Unnamed: 0,a,b,c,d,e
0,0,2,4,6,-6
1,9,11,13,15,-1
2,18,20,22,24,4
3,5,6,7,8,9


## Operations Between DataFrames and Series

In [32]:
# broadcasting, when one position in array is broadcasted across all other arrays
arr = np.arange(12.).reshape((3,4))
arr
print arr[0]
arr - arr[0]

[ 0.  1.  2.  3.]


array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [45]:
# more broadcasting
frame = DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'), index=['Utah','Ohio','Texas', 'Oregon'])
# get first row of DataFrame with index of column names
series = frame.ix[0]
# by default, arithmetic between DF and series matches the index of the series on the DataFrame's columns, broadcasting
# down the rows
print series
# in this case the series indices are 'bde' and the frame's columns are b, d, e, all rows will be affected
frame - series

b    0
d    1
e    2
Name: Utah, dtype: int64


Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [39]:
# if an index value is not found in either the DF's columns or the Series' index, the objects will be reindexed
# to form the union
series2 = Series(range(3), index = list('bef'))
print frame + series2

        b   d   e   f
Utah    0 NaN   3 NaN
Ohio    3 NaN   6 NaN
Texas   6 NaN   9 NaN
Oregon  9 NaN  12 NaN


Unnamed: 0,b,d,e,f
Utah,0,,3,
Ohio,3,,6,
Texas,6,,9,
Oregon,9,,12,


In [65]:
# if we want to broadcast across the columns instead and use the rows for matching, have to use one of the arithmetic
# methods
print frame, '\n-------------------'
series3 = frame['d']

print series3, '\n-------------------'
print frame - series3 # this adds utah, ohio, texas, oregon to DF because indices in columns do not contain them
# default is to broadcast over columns
# if we want to match on the rows, we need to use arithmetic methods and identify the axis
# the axis number passed is the axis to MATCH ON, we mean to match on the dataframe's row index and broadcast across
frame.ix[:,:2].sub(series3, axis = 0)


        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11 
-------------------
Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int64 
-------------------
        Ohio  Oregon  Texas  Utah   b   d   e
Utah     NaN     NaN    NaN   NaN NaN NaN NaN
Ohio     NaN     NaN    NaN   NaN NaN NaN NaN
Texas    NaN     NaN    NaN   NaN NaN NaN NaN
Oregon   NaN     NaN    NaN   NaN NaN NaN NaN


Unnamed: 0,b,d
Utah,-1,0
Ohio,-1,0
Texas,-1,0
Oregon,-1,0
