In [1]:
%load_ext notexbook
%texify

In [2]:
# Pandas contains data structures and data manipulation tools
# designed to make data cleaning and analysis fast and easy in Python. 
# While pandas adopts many coding idioms from NumPy, 
# the biggest difference is that pandas is designed
# for working with tabular or heterogeneous data.
# NumPy, by contrast, is best suited for working with homogeneous numerical array data.

In [3]:
# Introduction to pandas Data Structures.

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Series
# A Series is a one-dimensional array-like object containing a sequence of values
# (of similar types to NumPy types) and an associated array of data labels, called its index.

In [6]:
obj = pd.Series([4, 7, -5, 3])

In [7]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
obj.index # Get the index 

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Often it is desirable to create a Series whose index identifies each
# data point with a label. The labels are passed through the index argument:
obj2 = pd.Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])

In [10]:
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [11]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [12]:
# Compared with NumPy arrays, you can use labels in the index when selecting
# single values or a set of values:
obj2['a']

4

In [13]:
obj[1]

7

In [14]:
obj2[['c', 'a', 'd']] # Here ['c', 'a', 'd'] is interpreted as a list of indices,
                      # even though it contains strings instead of integers.

c   -5
a    4
d    3
dtype: int64

In [15]:
obj > 3

0     True
1     True
2    False
3    False
dtype: bool

In [16]:
obj[obj > 3]

0    4
1    7
dtype: int64

In [17]:
obj * 2

0     8
1    14
2   -10
3     6
dtype: int64

In [18]:
np.exp(obj)

0      54.598150
1    1096.633158
2       0.006738
3      20.085537
dtype: float64

In [19]:
# A Series can also be built straight from a Python dict: the keys become
# the index labels and the values become the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=sdata)

In [20]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [21]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [22]:
obj4 = pd.Series(sdata, index=states)

In [23]:
# Here, three values found in sdata were placed in the appropriate locations,
# but since no value for 'California' was found, it appears as NaN (not a number),
# which pandas uses to mark missing or NA values.
# Since 'Utah' was not included in states, it is excluded from the resulting object.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
# I will use the terms “missing” or “NA” interchangeably to refer to missing data.
# The isnull and notnull functions in pandas should be used to detect missing data:
pd.isnull(obj4) # or obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
pd.notnull(obj4) # or obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [26]:
# A useful Series feature for many applications is that it automatically aligns by index
# label in arithmetic operations:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [27]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [28]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [29]:
# Both the Series object itself and its index have a name attribute,
# which integrates with other key areas of pandas functionality:
obj4.name = 'population'

In [30]:
obj4.index.name = 'state'

In [31]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [32]:
# A Series’s index can be altered in-place by assignment:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [33]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [34]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [35]:
# DataFrame

In [36]:
# A DataFrame represents a rectangular table of data and contains an ordered collection
# of columns, each of which can be a different value type (numeric, string, boolean, etc.).

In [37]:
# There are many ways to construct a DataFrame, though one of the most common
# is from a dict of equal-length lists or NumPy arrays:

In [38]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
       }

In [39]:
frame = pd.DataFrame(data)

In [40]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
# For large DataFrames, the head method selects only the first five rows:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [42]:
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [43]:
pd.DataFrame(data, columns=['pop', 'year', 'state'])

Unnamed: 0,pop,year,state
0,1.5,2000,Ohio
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [44]:
# If you pass a column that isn’t contained in the dict,
# it will appear with missing values in the result:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four','five', 'six'])

In [45]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [46]:
# A column in a DataFrame can be retrieved as a Series 
# either by dict-like notation or by attribute:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [47]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [48]:
# The columns attribute returns the DataFrame's column labels as an Index:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [49]:
# Rows can be retrieved by label with the special loc attribute
# (position-based selection uses iloc instead):
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [50]:
# Columns can be modified by assignment.
# For example, the empty 'debt' column could be assigned a scalar value or an array of values:
frame2['debt'] = 16.5

In [51]:
frame2['debt'] = np.arange(6.)

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [53]:
# Assigning a list or array to a column requires its length to equal the
# DataFrame's length. A Series is instead realigned on the DataFrame's index
# labels, and any label the Series lacks is filled with a missing value:
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})

In [54]:
frame2['debt'] = val

In [55]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [56]:
# Assigning a column that doesn’t exist will create a new column.
# The del keyword will delete columns as with a dict.
# As an example of del, I first add a new column of boolean values
# where the state column equals 'Ohio':
frame2['eastern'] = frame2.state == 'Ohio'

In [57]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [58]:
# NOTE: New columns cannot be created with the frame2.eastern syntax.

In [59]:
# The del keyword can then be used to remove this column:
del frame2['eastern']

In [60]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [61]:
# A nested dict of dicts is another common input. When it is passed to
# DataFrame, pandas interprets the outer keys as the column labels and
# the inner keys as the row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(data=pop)

In [62]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [63]:
# You can transpose the DataFrame (swap rows and columns) with similar syntax to a NumPy array:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [64]:
pdata = {
    'Ohio': frame3['Ohio'][:-1],
    'Nevada': frame3['Nevada'][:2]
}

In [65]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [66]:
# Name both axes: the row index holds years, the columns hold states.
frame3.index.name, frame3.columns.name = 'year', 'state'

In [67]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [68]:
# As with Series, the values attribute returns the data
# contained in the DataFrame as a two-dimensional ndarray:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [69]:
# If the DataFrame’s columns are different dtypes,
# the dtype of the values array will be chosen to accommodate all of the columns:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [70]:
# Possible data inputs to DataFrame constructor

In [71]:
# 1: 2D ndarray
# The array supplies the values; row and column labels are passed explicitly.
arr = np.array([[1, 2, 3], [10, 11, 12]])
df1 = pd.DataFrame(arr, index=[0, 1], columns=['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
0,1,2,3
1,10,11,12


In [72]:
# 2: dict of arrays, lists, or tuples

In [73]:
# NOTE(review): the name `dict` rebinds Python's built-in dict type for the rest
# of the session. Renaming it also requires updating the following cell, which
# reads dict['months'] and dict['wieght'].
# NOTE(review): the key 'wieght' looks like a misspelling of 'weight'; it becomes
# the column label, so confirm before changing it.
dict = {
    'months': ['January', 'February', 'March', 'April', 'May',  'June', 'July', 'August', 'September', 'October', 'November', 'December'],
    'wieght': [65, 65.2, 64.8, 63., 65, 65.9, 66.3, 67.8, 69.6, 72.0, 73.8, 75]
}

# Build a DataFrame from the dict of equal-length lists, labelling the rows 1 through 12.
df2 = pd.DataFrame(dict, index=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
df2

Unnamed: 0,months,wieght
1,January,65.0
2,February,65.2
3,March,64.8
4,April,63.0
5,May,65.0
6,June,65.9
7,July,66.3
8,August,67.8
9,September,69.6
10,October,72.0


In [74]:
# When a list of lists is passed, each inner list becomes a ROW of the DataFrame
# (here 2 rows x 12 columns), with default integer row and column labels.
lst = [dict['months'], dict['wieght']]
df_from_lists = pd.DataFrame(lst)
df_from_lists

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,January,February,March,April,May,June,July,August,September,October,November,December
1,65,65.2,64.8,63,65,65.9,66.3,67.8,69.6,72,73.8,75


In [75]:
# Index Objects

In [76]:
# pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names).
# Any array or other sequence of labels you use when constructing a Series
# or DataFrame is internally converted to an Index:

In [77]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [78]:
index = obj.index

In [79]:
index

Index(['a', 'b', 'c'], dtype='object')

In [80]:
index[1:] # Index can be sliced

Index(['b', 'c'], dtype='object')

In [81]:
# Index objects are immutable and thus can’t be modified by the user.
# The failing assignment is caught and its message printed, so the
# demonstration no longer aborts a Restart & Run All before the cells
# that follow get a chance to execute.
try:
    index[1] = 'd'
except TypeError as exc:
    print("TypeError:", exc)

TypeError: Index does not support mutable operations

In [None]:
# Immutability makes it safer to share Index objects among data structures:
labels = pd.Index(np.arange(3.))

In [None]:
labels

In [None]:
obj2 = pd.Series([10,20,30], index=labels)

In [None]:
obj2

In [None]:
obj2.index is labels

In [None]:
frame3

In [None]:
# In addition to being array-like, an Index also behaves like a fixed-size set:
frame3.columns

In [82]:
'Ohio' in frame3.columns

True

In [83]:
'Boston' in frame3.columns

False

In [84]:
# Unlike Python sets, a pandas Index can contain duplicate labels:
dup_labels = pd.Index(['foo', 'bar', 'bar', 'foo'])

In [85]:
dup_labels

Index(['foo', 'bar', 'bar', 'foo'], dtype='object')

In [86]:
# NOTE: Selections with duplicate labels will select all occurrences of that label.

In [87]:
# Some Index methods and properties

In [88]:
# append
other_dups = pd.Index(['spam', 'foo'])
newIndex = dup_labels.append(other=other_dups) # Concatenate with additional Index objects, producing a new Index
newIndex

Index(['foo', 'bar', 'bar', 'foo', 'spam', 'foo'], dtype='object')

In [89]:
# difference
dup_labels.difference(other_dups) # Compute set difference as an Index

Index(['bar'], dtype='object')

In [90]:
# intersection
dup_labels.intersection(other_dups)

Index(['foo', 'foo'], dtype='object')

In [91]:
# This section will walk you through the fundamental mechanics
# of interacting with the data contained in a Series or DataFrame.

In [92]:
# Reindexing

In [93]:
# An important method on pandas objects is reindex,
# which means to create a new object with the data conformed to a new index.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [94]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [95]:
# Calling reindex on this Series rearranges the data according to the new index,
# introducing missing values if any index values were not already present.
obj2 = obj.reindex(['a','b','c','d','e'])

In [96]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [97]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [98]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

# Essential Functionality

## Reindexing

An important method on pandas objects is `reindex`,  
which creates a new object with the data conformed to a new index.

In [99]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [100]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [101]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [102]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [103]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [104]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [105]:
obj4 = obj3.reindex(range(6), method='ffill')

In [106]:
obj4

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [107]:
# A 3x3 frame of the integers 0..8 with row labels a, c, d (note: no 'b').
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [108]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [109]:
frame2 = frame.reindex(['a', 'c', 'd', 'e'])

In [110]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
c,3.0,4.0,5.0
d,6.0,7.0,8.0
e,,,


The columns can be reindexed with the columns keyword:

In [111]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


## Dropping Entries from an Axis

Dropping one or more entries from an axis is easy if you already have an index array or list without those entries.

In [113]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [114]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [116]:
obj_dropped = obj.drop('c')

In [117]:
obj_dropped

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [118]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

With DataFrame, index values can be deleted from either axis. To illustrate this, we first create an example DataFrame:

In [119]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [120]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Calling drop with a sequence of labels will drop values from the row labels (axis 0):

In [121]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


You can drop values from the columns by passing axis=1 or axis='columns':

In [124]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [123]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


Many functions, like drop, which modify the size or shape of a Series or DataFrame,  
can manipulate an object in-place without returning a new object:

In [125]:
# Drop the label 'c' from obj in place (no new object is returned).
# errors='ignore' keeps the cell re-runnable: after the first run 'c' is
# already gone, and a second run would otherwise raise KeyError.
obj.drop('c', inplace=True, errors='ignore')

In [126]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

# Indexing, Selection, and Filtering

Series indexing (obj[...]) works analogously to NumPy array indexing,  
except you can use the Series’s index values instead of only integers. 

In [127]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [128]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [129]:
obj['b']

1.0

In [130]:
# Select the second element by integer position. iloc states that intent
# explicitly; plain obj[1] on a string-labelled Series relies on a positional
# fallback that recent pandas releases deprecate.
obj.iloc[1]

1.0

In [131]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [132]:
obj[['a', 'd', 'c']]

a    0.0
d    3.0
c    2.0
dtype: float64

In [133]:
# Select several elements by integer position. iloc is used because passing
# integers to plain [] on a string-labelled Series relies on a positional
# fallback that recent pandas releases deprecate.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [134]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently from normal Python slicing in that the endpoint is inclusive:

In [135]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

Setting using these methods modifies the corresponding section of the Series:

In [136]:
obj['b':'c'] = -1

In [137]:
obj

a    0.0
b   -1.0
c   -1.0
d    3.0
dtype: float64

Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence:

In [138]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four']
                   )

In [139]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [140]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [141]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


Indexing like this has a few special cases. First, slicing or selecting data with a boolean array:

In [142]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [144]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [148]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [149]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [150]:
data[data < 5] = 0

In [151]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Selection with loc and iloc

For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc.  
They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation  
using either axis labels (loc) or integers (iloc).

As a preliminary example, let’s select a single row and multiple columns by label:

In [152]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

We’ll then perform some similar selections with integers using iloc:

In [153]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [157]:
data.iloc[1, [1,2]]

two      5
three    6
Name: Colorado, dtype: int64

In [158]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


Both indexing functions work with slices in addition to single labels or lists of labels:

In [159]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [161]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


### Integer Indexes

In [162]:
ser = pd.Series(np.arange(3.))

In [163]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [164]:
# With an integer index, [] is label-based, so -1 is looked up as a label
# and is not found. The expected KeyError is caught and printed so the
# demonstration does not stop a Restart & Run All.
try:
    ser[-1]
except KeyError as exc:
    print("KeyError:", exc)

KeyError: -1

In this case, pandas could “fall back” on integer indexing,  
but it’s difficult to do this in general without introducing subtle bugs.  
Here we have an index containing 0, 1, 2, but inferring what the user wants  
(label-based indexing or position-based) is difficult.

On the other hand, with a non-integer index, there is no potential for ambiguity:

In [165]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [166]:
# With string labels, the integer -1 cannot be a label, so [] falls back to
# positional selection and returns the last element.
# NOTE(review): newer pandas releases deprecate this positional fallback;
# ser2.iloc[-1] is the explicit form. Confirm against the pandas version in use.
ser2[-1]

2.0

To keep things consistent, if you have an axis index containing integers,  
data selection will always be label-oriented.  
For more precise handling, use loc (for labels) or iloc (for integers):  

In [167]:
ser[:1]

0    0.0
dtype: float64

In [168]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [169]:
ser.iloc[:1]

0    0.0
dtype: float64

### Arithmetic and Data Alignment

In [170]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [171]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [172]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [173]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [174]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [176]:
# Two float frames whose row and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [177]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [178]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [179]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


#### Arithmetic methods with fill values