In [1]:
from pandas import Series, DataFrame
import pandas as pd

In [2]:
import numpy as np

<h2>Series</h2>

In [2]:
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.dtype

dtype('int64')

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a']

-5

In [12]:
obj2['d'] = 6

In [13]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [15]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [16]:
obj2[obj2>0]

d    6
b    7
c    3
dtype: int64

In [17]:
obj2*2

d    12
b    14
a   -10
c     6
dtype: int64

In [19]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [20]:
'b' in obj2

True

In [21]:
'e' in obj2

False

In [22]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [23]:
obj3 = Series(sdata)

In [24]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [25]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [26]:
obj4 = Series(sdata, index=states)

In [27]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In this case, 3 values found in sdata were placed in the appropriate locations, but since
no value for 'California' was found, it appears as NaN (not a number) which is con-
sidered in pandas to mark missing or NA values. I will use the terms “missing” or “NA”
to refer to missing data. The isnull and notnull functions in pandas should be used to
detect missing data:

In [28]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [30]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [31]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [33]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [34]:
obj4.name = 'population'

In [35]:
obj4.index.name = 'state'

In [36]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [37]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

<h2>DataFrame</h2>

In [22]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

In [23]:
frame = DataFrame(data)

In [24]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [25]:
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [26]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'dept'],
                  index=['one', 'two', 'three', 'four', 'five'])

In [27]:
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [28]:
frame2.columns

Index(['year', 'state', 'pop', 'dept'], dtype='object')

In [29]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [30]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [31]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
dept      NaN
Name: three, dtype: object

In [32]:
frame2['dept'] = 16.5

In [33]:
frame2

Unnamed: 0,year,state,pop,dept
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [34]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [35]:
frame2['debt'] = val

In [36]:
frame2

Unnamed: 0,year,state,pop,dept,debt
one,2000,Ohio,1.5,16.5,
two,2001,Ohio,1.7,16.5,-1.2
three,2002,Ohio,3.6,16.5,
four,2001,Nevada,2.4,16.5,-1.5
five,2002,Nevada,2.9,16.5,-1.7


Assigning a column that doesn’t exist will create a new column. The del keyword will
delete columns as with a dict:

In [37]:
frame2['eastern'] = frame2.state == 'Ohio'

In [38]:
frame2

Unnamed: 0,year,state,pop,dept,debt,eastern
one,2000,Ohio,1.5,16.5,,True
two,2001,Ohio,1.7,16.5,-1.2,True
three,2002,Ohio,3.6,16.5,,True
four,2001,Nevada,2.4,16.5,-1.5,False
five,2002,Nevada,2.9,16.5,-1.7,False


In [39]:
del frame2['eastern']

In [40]:
frame2.columns

Index(['year', 'state', 'pop', 'dept', 'debt'], dtype='object')

The column returned when indexing a DataFrame is a view on the un-
derlying data, not a copy. Thus, any in-place modifications to the Series
will be reflected in the DataFrame

In [11]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

If passed to DataFrame, it will interpret the outer dict keys as the columns and the inner
keys as the row indices:

In [12]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


transpose the result:

In [13]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [14]:
pd.DataFrame(pop, index=pd.Series([2001, 2002, 2003]))

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [15]:
pdata = {'Ohio': frame3['Ohio'][:-1],
'Nevada': frame3['Nevada'][:2]}

In [16]:
DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


If a DataFrame’s index and columns have their name attributes set, these will also be
displayed:

In [17]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [18]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [41]:
frame2.values

array([[2000, 'Ohio', 1.5, 16.5, nan],
       [2001, 'Ohio', 1.7, 16.5, -1.2],
       [2002, 'Ohio', 3.6, 16.5, nan],
       [2001, 'Nevada', 2.4, 16.5, -1.5],
       [2002, 'Nevada', 2.9, 16.5, -1.7]], dtype=object)

<h2>Index Objects</h2>

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names).

In [2]:
obj = Series(range(3), index=['a','b','c'])

In [3]:
index = obj.index

In [4]:
index

Index(['a', 'b', 'c'], dtype='object')

In [5]:
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user:

In [6]:
index[1] = 'd'

TypeError: Index does not support mutable operations

Immutability is important so that Index objects can be safely shared among data
structures:

In [7]:
index = pd.Index(np.arange(3))

In [8]:
obj2 = Series([1.5, -2.5, 0], index = index)

In [10]:
obj2.index is index

True

In [42]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [43]:
'Ohio' in frame3.columns

True

In [44]:
2003 in frame3.index

False

<h2>Essential Functionality</h2>
<h3>Reindexing</h3>

create a new object
with the data conformed to a new index.

In [45]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [46]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [47]:
#reindex
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [48]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [49]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [50]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [51]:
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [52]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


The columns can be reindexed using the columns keyword:

In [53]:
states = ['Texas', 'Utah', 'California']

In [54]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Both can be reindexed in one shot, though interpolation will only apply row-wise (axis
0):

In [56]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=states.sort())

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


As you’ll see soon, reindexing can be done more succinctly by label-indexing with loc:

In [57]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,California,Texas,Utah
a,2.0,1.0,
b,,,
c,5.0,4.0,
d,8.0,7.0,


<h3>Dropping entries from an axis</h3>

In [3]:
obj = Series(np.arange(5.), index=['a','b','c','d','e'])

In [4]:
new_obj = obj.drop('c')

In [5]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [6]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [7]:
data = DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])

In [8]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [9]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [10]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [11]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


<h3>Indexing, selection, and filtering</h3>

In [12]:
obj=Series(np.arange(4.), index=['a','b','c','d'])

In [13]:
obj['d']

3.0

In [14]:
obj[1]

1.0

In [15]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [16]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [17]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [18]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint
is inclusive:

In [19]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

Setting using these methods works just as you would expect:

In [20]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

As you’ve seen above, indexing into a DataFrame is for retrieving one or more columns
either with a single value or sequence:

In [21]:
data = DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])

In [22]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [23]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [24]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [25]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [26]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Another use case is in indexing with a boolean DataFrame, such as
one produced by a scalar comparison:

In [27]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [28]:
data[data < 5] = 0

In [29]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [30]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [34]:
data.ix[['Colorado', 'Utah'], [3,0,1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [36]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [37]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [40]:
data.ix[data.three>5, :3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


<h3>Arithmetic and data alignment</h3>

In [3]:
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])

In [4]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [5]:
df1 = DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)), columns=list('bde'), index=['Utah','Ohio', 'Texas', 'Oregon'])

In [6]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


<h4>Arithmetic methods with fill values</h4>
<p>you might want to fill
with a special value, like 0, when an axis label is found in one object but not the other:</p>

In [7]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

In [8]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


Using the add method on df1 , I pass df2 and an argument to fill_value :

In [9]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [10]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


<h3>Operations between DataFrame and Series</h3>

In [11]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [12]:
arr[0]

array([0., 1., 2., 3.])

In [13]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [14]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [15]:
series = frame.iloc[0]

In [16]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [17]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series
on the DataFrame's columns, broadcasting down the rows:

In [18]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


If an index value is not found in either the DataFrame’s columns or the Series’s index,
the objects will be reindexed to form the union:

In [19]:
series2 = Series(range(3), index=['b', 'e', 'f'])

In [20]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [21]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [22]:
frame.sub(series3, axis=0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


<h3>Function application and mapping</h3>

In [23]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [24]:
frame

Unnamed: 0,b,d,e
Utah,0.628793,-1.651091,0.188901
Ohio,-0.834656,1.281423,1.687078
Texas,-0.903811,0.438688,0.47922
Oregon,0.646364,-0.803721,-1.12889


In [25]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.628793,1.651091,0.188901
Ohio,0.834656,1.281423,1.687078
Texas,0.903811,0.438688,0.47922
Oregon,0.646364,0.803721,1.12889


Another frequent operation is applying a function on 1D arrays to each column or row.
DataFrame’s apply method does exactly this

In [26]:
f = lambda x: x.max() -x.min()

In [27]:
frame.apply(f)

b    1.550176
d    2.932514
e    2.815968
dtype: float64

In [28]:
frame.apply(f, axis=1)

Utah      2.279884
Ohio      2.521734
Texas     1.383032
Oregon    1.775254
dtype: float64

Many of the most common array statistics (like sum and mean) are DataFrame methods,
so using apply is not necessary.

In [29]:
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

In [30]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.903811,-1.651091,-1.12889
max,0.646364,1.281423,1.687078


Suppose you wanted to compute a
formatted string from each floating point value in frame.

In [31]:
format = lambda x: '%.2f' % x

In [32]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.63,-1.65,0.19
Ohio,-0.83,1.28,1.69
Texas,-0.9,0.44,0.48
Oregon,0.65,-0.8,-1.13


The reason for the name applymap is that Series has a map method for applying an element-
wise function:

In [33]:
frame['e'].map(format)

Utah       0.19
Ohio       1.69
Texas      0.48
Oregon    -1.13
Name: e, dtype: object

<h3>Sorting and ranking</h3>

In [3]:
obj = Series(range(4), index=['d','a','b','c'])

In [4]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

sort by index on either axis:

In [6]:
frame = DataFrame(np.arange(8).reshape((2,4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])

In [7]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [8]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


The data is sorted in ascending order by default, but can be sorted in descending order,
too:

In [9]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


To sort a Series by its values, use its sort_values() method:

In [10]:
obj = Series([4,7,-3,2])

In [12]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [13]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])

In [14]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

On DataFrame, you may want to sort by the values in one or more columns. To do so,
pass one or more column names to the by option:

In [15]:
frame = DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})

In [16]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [18]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [19]:
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


<b>Ranking</b> is closely related to sorting, assigning ranks from one through the number of
valid data points in an array. It is similar to the indirect sort indices produced by
numpy.argsort, except that ties are broken according to a rule. The rank methods for
Series and DataFrame are the place to look; by default rank breaks ties by assigning
each group the mean rank:

In [28]:
obj = Series([7,-5,7,4,2,0,4])

In [21]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [29]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [30]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [31]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [32]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})

In [33]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [34]:
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


Method Description<br>
'average' Default: assign the average rank to each entry in the equal group.<br>
'min' Use the minimum rank for the whole group.<br>
'max' Use the maximum rank for the whole group.<br>
'first' Assign ranks in the order the values appear in the data

<h3>Axis indexes with duplicate values</h3>

In [35]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [36]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

The index’s is_unique property can tell you whether its values are unique or not:

In [37]:
obj.index.is_unique

False

Data selection is one of the main things that behaves differently with duplicates. Indexing
a value with multiple entries returns a Series while single entries return a scalar
value

In [38]:
obj['a']

a    0
a    1
dtype: int64

In [39]:
obj['c']

4

The same logic extends to indexing rows in a DataFrame

In [40]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,1.39038,-0.283311,0.523887
a,-0.675147,-1.383122,1.428347
b,0.243536,0.376573,0.703654
b,1.605597,1.603254,-1.024758


In [41]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.243536,0.376573,0.703654
b,1.605597,1.603254,-1.024758
