In [3]:
import pandas as pd
import numpy as np

In [None]:
# In Pandas, for unary operations like negation and trigonometric functions, the ufuncs will preserve index and column labels 
# in the output, and for binary operations such as addition and multiplication, Pandas will automatically align indices when 
# passing the objects to the ufunc. This means that keeping the context of data and combining data from different sources—both 
# potentially error-prone tasks with raw NumPy arrays—become essentially foolproof ones with Pandas.

In [None]:
# Because Pandas is designed to work with NumPy, any NumPy ufunc will work on Pandas Series and DataFrame objects. 
# Let's start by defining a simple Series and DataFrame on which to demonstrate this:

In [6]:
# Seeded legacy RandomState so the integers drawn below are reproducible.
rng = np.random.RandomState(seed=42)
# Four random integers (low=0, high=10) wrapped in a Series with the default integer index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [8]:
# 3x4 frame of random integers (low=0, high=10) with columns labelled A through D.
df = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,1,7,5,1
1,4,0,9,5
2,8,0,9,2


In [9]:
# If we apply a NumPy ufunc on either of these objects, the result will be another Pandas object with the indices preserved.
# Here np.exp is applied element-wise to ser: the result is a float64 Series carrying the same 0-3 index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [10]:
# Or, for a slightly more complex calculation: scale every entry of df by pi/4, then take the sine.
# Both the arithmetic and the ufunc work element-wise, and the result keeps df's row index and column labels.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,0.7071068,-0.707107,-0.707107,0.707107
1,1.224647e-16,0.0,0.707107,-0.707107
2,-2.449294e-16,0.0,0.707107,1.0


In [None]:
# UFuncs: Index Alignment

In [None]:
# For binary operations on two Series or DataFrame objects, Pandas will align indices in the process of performing the 
# operation. This is very convenient when working with incomplete data, as we'll see in some of the examples that follow.

In [None]:
# Index alignment in Series

In [12]:
# Two deliberately incomplete data sources: one lists only the three largest US states by area,
# the other only the three most populous US states. Their labels overlap only partially.

area = pd.Series(
    [1723337, 695662, 423967],
    index=['Alaska', 'Texas', 'California'],
    name='area',
)
population = pd.Series(
    [38332521, 26448193, 19651127],
    index=['California', 'Texas', 'New York'],
    name='population',
)

In [13]:
# Let's see what happens when we divide these to compute the population density.
# The two Series are matched by label, not by position: only labels present in both
# (California and Texas) produce a value, while Alaska and New York come out as NaN.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [14]:
# The result of the division above is indexed by the union of the two input indices.
# Compute that union explicitly with Index.union: the `|` operator on Index objects was
# deprecated as a set operation and, in pandas 2.x, performs an element-wise logical OR
# instead, so it can no longer be used to combine label indices like these.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [18]:
# Labels that appear in only one of the two operands have no partner to combine with,
# so the result holds NaN ("Not a Number") there - this is how Pandas marks missing data.
# A is labelled 0-2 and B is labelled 1-3, so only labels 1 and 2 overlap.
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [19]:
# If NaN results are not the desired behavior, use the corresponding object method instead of the operator.
# For example, A.add(B) is equivalent to A + B, but additionally accepts a fill_value that stands in for
# any element missing from A or B. With fill_value=0, labels 0 and 3 keep the value from the one Series that has them:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [None]:
# Index alignment in DataFrame

In [20]:
# DataFrames align in the same way, on both the columns and the row index.
# First operand: a 2x2 frame of random integers (low=0, high=20) with columns A and B.
A = pd.DataFrame(rng.randint(low=0, high=20, size=(2, 2)), columns=['A', 'B'])
A

Unnamed: 0,A,B
0,11,19
1,2,4


In [22]:
# Second operand: a 3x3 frame of random integers (low=0, high=10).
# Its columns are listed in the order B, A, C.
B = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 3)), columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,9,4,1
1,3,6,7
2,2,0,3


In [23]:
# Adding the two frames aligns them on both row index and column labels;
# any position that exists in only one of the frames is missing in the result.
A + B

Unnamed: 0,A,B,C
0,15.0,28.0,
1,8.0,7.0,
2,,,


In [28]:
# Alignment does not depend on the order of the labels in either operand, and the labels of the result come out sorted.
# Just as with Series, the arithmetic method accepts a fill_value to stand in for missing entries.
# Here the fill is the mean of every value in A.

# Stack A into a single one-dimensional Series so that one mean covers all of its entries.
stacked_values = A.stack()
fill = stacked_values.mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,15.0,28.0,10.0
1,8.0,7.0,16.0
2,9.0,11.0,12.0


In [None]:
"""
Python Operator	Pandas Method(s)
+	add()
-	sub(), subtract()
*	mul(), multiply()
/	truediv(), div(), divide()
//	floordiv()
%	mod()
**	pow()
"""

In [None]:
# UFuncs: Operations Between DataFrame and Series

In [None]:
# When performing operations between a DataFrame and a Series, the index and column alignment is similarly maintained. 
# Operations between a DataFrame and a Series are similar to operations between a two-dimensional and one-dimensional 
# NumPy array.

In [30]:
# Plain NumPy array (not a Pandas object): 3 rows by 4 columns of random integers below 10.
A = rng.randint(low=0, high=10, size=(3, 4))
A

array([[1, 7, 3, 1],
       [5, 5, 9, 3],
       [5, 1, 9, 1]])

In [31]:
# Subtract the first row of the array from every one of its rows (NumPy broadcasting).
A - A[0]

array([[ 0,  0,  0,  0],
       [ 4, -2,  6,  2],
       [ 4, -6,  6,  0]])

In [47]:
# Under NumPy's broadcasting rules, subtracting one row of a two-dimensional array
# from the array itself is carried out row by row.

# Pandas follows the same row-wise convention by default. Wrap the array in a labelled
# frame (rows A-C, columns Q-T) and subtract its first row:
df = pd.DataFrame(A, index=['A', 'B', 'C'], columns=['Q', 'R', 'S', 'T'])
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
A,0,0,0,0
B,4,-2,6,2
C,4,-6,6,0


In [40]:
# If you would instead like to operate column-wise, you can use the object methods mentioned earlier, 
# while specifying the axis keyword. With axis=0 the Series df['R'] is matched against the row labels,
# so column R is subtracted from every column (leaving R itself all zeros):
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
A,-6,0,-4,-6
B,0,0,4,-2
C,4,0,8,0


In [44]:
# Note that these DataFrame/Series operations, like the operations discussed above, will automatically align indices 
# between the two elements.
# Take every second column (Q and S) of the first row, giving a Series that covers only half of df's columns:
halfrow = df.iloc[0, ::2]
halfrow

Q    1
S    3
Name: A, dtype: int32

In [45]:
# halfrow is aligned with df by column label: Q and S are subtracted from every row,
# while R and T have no counterpart in halfrow and are therefore missing in the result.
df - halfrow

Unnamed: 0,Q,R,S,T
A,0.0,,0.0,
B,4.0,,6.0,
C,4.0,,6.0,


In [None]:
# This preservation and alignment of indices and columns means that operations on data in Pandas will always maintain 
# the data context, which prevents the types of silly errors that might come up when working with heterogeneous and/or 
# misaligned data in raw NumPy arrays.