## Indexing and selecting data
- loc
- iloc

In [1]:
import pandas as pd
import numpy as np

In [2]:
dates = pd.date_range('1/1/2000', periods=8)

In [3]:
df = pd.DataFrame(np.random.randn(8, 4),index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.170823,1.84152,0.590857,1.155937
2000-01-02,0.217078,-0.934342,-0.332278,-1.443363
2000-01-03,0.13804,-0.553503,-0.348812,1.216798
2000-01-04,1.332447,1.615673,0.866259,0.050915
2000-01-05,0.105523,0.611315,0.168561,0.994959
2000-01-06,1.641572,-0.226638,0.204321,-2.91929
2000-01-07,-0.382184,-0.668507,-0.46133,0.351939
2000-01-08,-1.063967,-0.036191,-0.248969,0.893267


In [4]:
s = df.A

In [5]:
s

2000-01-01    1.170823
2000-01-02    0.217078
2000-01-03    0.138040
2000-01-04    1.332447
2000-01-05    0.105523
2000-01-06    1.641572
2000-01-07   -0.382184
2000-01-08   -1.063967
Freq: D, Name: A, dtype: float64

In [6]:
s[dates[5]]

1.6415724373908485

In [7]:
df[['B','A']] = df[['A','B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,1.84152,1.170823,0.590857,1.155937
2000-01-02,-0.934342,0.217078,-0.332278,-1.443363
2000-01-03,-0.553503,0.13804,-0.348812,1.216798
2000-01-04,1.615673,1.332447,0.866259,0.050915
2000-01-05,0.611315,0.105523,0.168561,0.994959
2000-01-06,-0.226638,1.641572,0.204321,-2.91929
2000-01-07,-0.668507,-0.382184,-0.46133,0.351939
2000-01-08,-0.036191,-1.063967,-0.248969,0.893267


In [8]:
# Small integer Series keyed by the labels 'a', 'b', 'c'.
sa = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
sa

a    1
b    2
c    3
dtype: int64

In [9]:
sa.c

3

In [10]:
dfa = df.copy()

In [11]:
dfa.A

2000-01-01    1.841520
2000-01-02   -0.934342
2000-01-03   -0.553503
2000-01-04    1.615673
2000-01-05    0.611315
2000-01-06   -0.226638
2000-01-07   -0.668507
2000-01-08   -0.036191
Freq: D, Name: A, dtype: float64

In [12]:
dfa.A = list(range(len(dfa.index)))

In [13]:
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,1.170823,0.590857,1.155937
2000-01-02,1,0.217078,-0.332278,-1.443363
2000-01-03,2,0.13804,-0.348812,1.216798
2000-01-04,3,1.332447,0.866259,0.050915
2000-01-05,4,0.105523,0.168561,0.994959
2000-01-06,5,1.641572,0.204321,-2.91929
2000-01-07,6,-0.382184,-0.46133,0.351939
2000-01-08,7,-1.063967,-0.248969,0.893267


In [14]:
# Two-column integer frame; no index is passed, so rows are labelled 0, 1, 2.
x = pd.DataFrame(
    {
        'x': [1, 2, 3],
        'y': [3, 4, 5],
    }
)
x

Unnamed: 0,x,y
0,1,3
1,2,4
2,3,5


In [15]:
x.iloc[1]

x    2
y    4
Name: 1, dtype: int64

In [16]:
x.iloc[1] = {'x':9, 'y':99}

In [17]:
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


### Slicing ranges

In [18]:
s

2000-01-01    1.841520
2000-01-02   -0.934342
2000-01-03   -0.553503
2000-01-04    1.615673
2000-01-05    0.611315
2000-01-06   -0.226638
2000-01-07   -0.668507
2000-01-08   -0.036191
Freq: D, Name: A, dtype: float64

In [19]:
s[:5]

2000-01-01    1.841520
2000-01-02   -0.934342
2000-01-03   -0.553503
2000-01-04    1.615673
2000-01-05    0.611315
Freq: D, Name: A, dtype: float64

In [20]:
s[::2]

2000-01-01    1.841520
2000-01-03   -0.553503
2000-01-05    0.611315
2000-01-07   -0.668507
Freq: 2D, Name: A, dtype: float64

In [21]:
s[::-1]

2000-01-08   -0.036191
2000-01-07   -0.668507
2000-01-06   -0.226638
2000-01-05    0.611315
2000-01-04    1.615673
2000-01-03   -0.553503
2000-01-02   -0.934342
2000-01-01    1.841520
Freq: -1D, Name: A, dtype: float64

In [22]:
s2 = s.copy()

In [23]:
s2[:5] = 0

In [24]:
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06   -0.226638
2000-01-07   -0.668507
2000-01-08   -0.036191
Freq: D, Name: A, dtype: float64

In [25]:
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,1.84152,1.170823,0.590857,1.155937
2000-01-02,-0.934342,0.217078,-0.332278,-1.443363
2000-01-03,-0.553503,0.13804,-0.348812,1.216798


In [26]:
df[::-1]

Unnamed: 0,A,B,C,D
2000-01-08,-0.036191,-1.063967,-0.248969,0.893267
2000-01-07,-0.668507,-0.382184,-0.46133,0.351939
2000-01-06,-0.226638,1.641572,0.204321,-2.91929
2000-01-05,0.611315,0.105523,0.168561,0.994959
2000-01-04,1.615673,1.332447,0.866259,0.050915
2000-01-03,-0.553503,0.13804,-0.348812,1.216798
2000-01-02,-0.934342,0.217078,-0.332278,-1.443363
2000-01-01,1.84152,1.170823,0.590857,1.155937


### Selection by label

In [27]:
# 5x4 frame of standard-normal draws, indexed by five daily dates starting 2013-01-01.
df1 = pd.DataFrame(
    np.random.randn(5, 4),
    index=pd.date_range('20130101', periods=5),
    columns=['A', 'B', 'C', 'D'],
)

In [28]:
df1

Unnamed: 0,A,B,C,D
2013-01-01,0.024179,-2.19796,0.954133,1.266137
2013-01-02,1.23852,0.933563,0.767986,-0.1242
2013-01-03,-0.815006,-0.936208,0.021556,-1.48341
2013-01-04,-0.52134,-0.870073,0.048969,-0.027289
2013-01-05,0.686297,-0.240987,-0.294918,-0.020589


In [29]:
df1.loc["2013-01-01":"2013-01-03"]

Unnamed: 0,A,B,C,D
2013-01-01,0.024179,-2.19796,0.954133,1.266137
2013-01-02,1.23852,0.933563,0.767986,-0.1242
2013-01-03,-0.815006,-0.936208,0.021556,-1.48341


In [30]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))

In [31]:
s1

a    1.697340
b    0.358599
c    1.837521
d   -0.526430
e   -0.462190
f   -0.499349
dtype: float64

In [32]:
s1.loc['c':]

c    1.837521
d   -0.526430
e   -0.462190
f   -0.499349
dtype: float64

In [33]:
s1.loc['b']

0.3585987332369606

In [34]:
s1.loc['c':] = 0

In [35]:
s1

a    1.697340
b    0.358599
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [36]:
# Rebind df1: 6x4 standard-normal draws with row labels 'a'..'f' and columns 'A'..'D'.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)
df1

Unnamed: 0,A,B,C,D
a,-0.122057,0.041953,-0.736483,-2.239804
b,-0.188846,-0.79657,-0.708409,-0.134359
c,1.023693,-0.454419,-0.392817,1.393083
d,-0.085008,0.501042,-0.791074,1.031637
e,-1.474784,0.570263,-0.547505,-1.623851
f,-1.284435,-0.280536,0.193664,-1.7897


In [37]:
df1.loc['d': , 'A':'C']

Unnamed: 0,A,B,C
d,-0.085008,0.501042,-0.791074
e,-1.474784,0.570263,-0.547505
f,-1.284435,-0.280536,0.193664


In [38]:
df1.loc[['a','b','d'], :]

Unnamed: 0,A,B,C,D
a,-0.122057,0.041953,-0.736483,-2.239804
b,-0.188846,-0.79657,-0.708409,-0.134359
d,-0.085008,0.501042,-0.791074,1.031637


In [39]:
df1.loc['a']

A   -0.122057
B    0.041953
C   -0.736483
D   -2.239804
Name: a, dtype: float64

In [40]:
df1.loc['a']>0

A    False
B     True
C    False
D    False
Name: a, dtype: bool

In [41]:
 df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,B
a,0.041953
b,-0.79657
c,-0.454419
d,0.501042
e,0.570263
f,-0.280536


In [42]:
df1.loc['a','A']

-0.12205723900809133

### Slicing with labels

In [43]:
# Letters 'a'..'e' on an unsorted integer index.
s = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

In [44]:
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [45]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [46]:
s.sort_index().loc[1:4]

2    c
3    b
4    e
dtype: object

### Selection by position

In [47]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

0   -0.206972
2    0.026356
4    0.197900
6    0.572509
8    0.363949
dtype: float64

In [48]:
s1.iloc[:3] = 0

In [49]:
s1

0    0.000000
2    0.000000
4    0.000000
6    0.572509
8    0.363949
dtype: float64

In [50]:
# Rebind df1: 6x4 standard-normal draws whose row and column labels are even
# integers, so that labels and positions are visibly different.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=[0, 2, 4, 6, 8, 10],
    columns=[0, 2, 4, 6],
)
df1

Unnamed: 0,0,2,4,6
0,-1.101296,1.061966,-0.90233,-2.47486
2,0.836628,0.718992,0.171477,0.275416
4,-3.056462,-1.931229,-1.433968,-0.258217
6,0.668614,0.495545,-1.194247,1.085691
8,0.045483,0.734181,3.535195,-0.407165
10,0.487433,-0.654006,0.472201,-0.527338


In [51]:
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,-1.101296,1.061966,-0.90233,-2.47486
2,0.836628,0.718992,0.171477,0.275416
4,-3.056462,-1.931229,-1.433968,-0.258217


In [52]:
df1.iloc[1:3,2:4]

Unnamed: 0,4,6
2,0.171477,0.275416
4,-1.433968,-0.258217


In [53]:
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,0.718992,0.275416
6,0.495545,1.085691
10,-0.654006,-0.527338


In [54]:
df1.iloc[1]

0    0.836628
2    0.718992
4    0.171477
6    0.275416
Name: 2, dtype: float64

In [55]:
dfl = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
dfl

Unnamed: 0,A,B
0,2.106859,-1.728982
1,0.805458,-0.881049
2,0.199465,-0.932044
3,-0.426552,0.56284
4,1.469923,0.85522


In [56]:
dfl.iloc[:, 7:9]

0
1
2
3
4


In [57]:
dfl.iloc[4:6]

Unnamed: 0,A,B
4,1.469923,0.85522


### Selection by callable

In [58]:
# Rebind df1 for the callable-selection examples: 6x4 standard-normal draws,
# rows 'a'..'f', columns 'A'..'D'.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)
df1

Unnamed: 0,A,B,C,D
a,-0.481456,0.408112,-0.023276,-0.172038
b,0.770814,1.633472,0.030682,1.410729
c,-0.195971,2.251864,0.53001,0.335443
d,0.11244,-0.789203,-1.700322,-0.177588
e,1.589996,-1.296861,-0.568888,1.209989
f,2.286135,1.085281,-0.125201,-0.379026


In [59]:
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
b,0.770814,1.633472,0.030682,1.410729
d,0.11244,-0.789203,-1.700322,-0.177588
e,1.589996,-1.296861,-0.568888,1.209989
f,2.286135,1.085281,-0.125201,-0.379026


In [60]:
df1.loc[df1['A'] > 0, :]

Unnamed: 0,A,B,C,D
b,0.770814,1.633472,0.030682,1.410729
d,0.11244,-0.789203,-1.700322,-0.177588
e,1.589996,-1.296861,-0.568888,1.209989
f,2.286135,1.085281,-0.125201,-0.379026


In [61]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.481456,0.408112
b,0.770814,1.633472
c,-0.195971,2.251864
d,0.11244,-0.789203
e,1.589996,-1.296861
f,2.286135,1.085281


In [62]:
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,-0.481456,0.408112
b,0.770814,1.633472
c,-0.195971,2.251864
d,0.11244,-0.789203
e,1.589996,-1.296861
f,2.286135,1.085281


In [63]:
df1[df1.columns[0]]

a   -0.481456
b    0.770814
c   -0.195971
d    0.112440
e    1.589996
f    2.286135
Name: A, dtype: float64

In [64]:
df1.columns[:3]

Index(['A', 'B', 'C'], dtype='object')

In [65]:
df1["A"].loc[df1.A > 0]

b    0.770814
d    0.112440
e    1.589996
f    2.286135
Name: A, dtype: float64

In [66]:
# Small integer frame with string row labels, used to mix label and positional access.
dfd = pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': [4, 5, 6],
    },
    index=['a', 'b', 'c'],
)
dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [67]:
dfd.loc[dfd.index[[0, 2]], 'A']

a    1
c    3
Name: A, dtype: int64

In [68]:
dfd.iloc[[0, 2],dfd.columns.get_loc('A')]

a    1
c    3
Name: A, dtype: int64

In [69]:
dfd.columns.get_loc('A')

0

In [70]:
dfd.iloc[[0, 2],dfd.columns.get_indexer(['A'])]

Unnamed: 0,A
a,1
c,3


In [71]:
dfd.columns.get_indexer(['A'])

array([0], dtype=int64)

### Indexing with a list that contains missing labels is deprecated

In [72]:
s = pd.Series([1, 2, 3])
s

0    1
1    2
2    3
dtype: int64

In [73]:
s.loc[[1, 2]]

1    2
2    3
dtype: int64

#### Reindexing

In [74]:
s.reindex([1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [75]:
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s

a    0
a    1
b    2
c    3
dtype: int32

In [76]:
labels = ['c', 'd']
s.loc[s.index.intersection(labels)].reindex(labels)

c    3.0
d    NaN
dtype: float64

In [77]:
s.index.intersection(labels)

Index(['c'], dtype='object')

### Selecting random samples

In [78]:
s = pd.Series([0, 1, 2, 3, 4, 5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [79]:
s.sample()

3    3
dtype: int64

In [80]:
s.sample(n=3,random_state=2)

4    4
1    1
3    3
dtype: int64

In [81]:
s.sample(frac=0.4)

0    0
2    2
dtype: int64

In [82]:
s = pd.Series([0, 1, 2, 3, 4, 5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [83]:
 example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]

In [84]:
s.sample(n=3, weights=example_weights)

5    5
3    3
4    4
dtype: int64

In [85]:
example_weights2 = [0.5, 0, 0, 0, 0, 0]
s.sample(n=1, weights=example_weights2)

0    0
dtype: int64

In [86]:
# Frame whose second column supplies per-row sampling weights.
df2 = pd.DataFrame(
    {
        'col1': [9, 8, 7, 6],
        'weight_column': [0.5, 0.4, 0.1, 0],
    }
)
df2

Unnamed: 0,col1,weight_column
0,9,0.5
1,8,0.4
2,7,0.1
3,6,0.0


In [87]:
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
0,9,0.5
2,7,0.1
1,8,0.4


In [88]:
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3

Unnamed: 0,col1,col2
0,1,2
1,2,3
2,3,4


In [89]:
df3.sample(n=1, axis=1)

Unnamed: 0,col1
0,1
1,2
2,3


In [90]:
df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df4

Unnamed: 0,col1,col2
0,1,2
1,2,3
2,3,4


In [91]:
df4.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


### Setting with enlargement

In [92]:
se = pd.Series([1, 2, 3])
se

0    1
1    2
2    3
dtype: int64

In [93]:
se[5] = 5
se

0    1
1    2
2    3
5    5
dtype: int64

In [94]:
# The integers 0..5 laid out row-wise as a 3x2 frame with columns 'A' and 'B'.
dfi = pd.DataFrame(
    data=np.arange(6).reshape(3, 2),
    columns=list('AB'),
)
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [95]:
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [96]:
dfi.loc[3] = [4,3,1]
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4
3,4,3,1


### Boolean indexing

In [97]:
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [98]:
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [99]:
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [100]:
s[~(s < 0)]

3    0
4    1
5    2
6    3
dtype: int64

In [101]:
 df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2000-01-01,1.84152,1.170823,0.590857,1.155937
2000-01-04,1.615673,1.332447,0.866259,0.050915
2000-01-05,0.611315,0.105523,0.168561,0.994959


In [102]:
# Rebind df2: two string columns plus a column of standard-normal draws.
df2 = pd.DataFrame(
    {
        'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
        'c': np.random.randn(7),
    }
)
df2

Unnamed: 0,a,b,c
0,one,x,1.751816
1,one,y,-0.291632
2,two,y,0.161481
3,three,x,-1.143449
4,two,y,-2.148367
5,one,x,-1.526929
6,six,x,-0.881729


In [103]:
criterion = df2['a'].map(lambda x: x.startswith('t'))
criterion

0    False
1    False
2     True
3     True
4     True
5    False
6    False
Name: a, dtype: bool

In [104]:
df2[criterion]

Unnamed: 0,a,b,c
2,two,y,0.161481
3,three,x,-1.143449
4,two,y,-2.148367


In [105]:
df2[[x.startswith('t') for x in df2['a']]]

Unnamed: 0,a,b,c
2,two,y,0.161481
3,three,x,-1.143449
4,two,y,-2.148367


In [106]:
df2[criterion & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-1.143449


### Indexing with isin

In [107]:
# Values 0..4 on the reversed index 4..0, so values and labels differ row by row.
s = pd.Series(
    data=np.arange(5),
    index=np.arange(5)[::-1],
    dtype='int64',
)
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [108]:
 s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [109]:
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

In [110]:
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [111]:
s.reindex([2, 4, 6])

2    2.0
4    0.0
6    NaN
dtype: float64

In [112]:
# Values 0..5 on a two-level index: every pairing of {0, 1} with {'a', 'b', 'c'}.
s_mi = pd.Series(
    np.arange(6),
    index=pd.MultiIndex.from_product(
        [
            [0, 1],
            ['a', 'b', 'c'],
        ]
    ),
)
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [113]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int32

In [114]:
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int32

In [115]:
# Rebind df: one integer column and two string columns for the DataFrame.isin examples.
df = pd.DataFrame(
    {
        'vals': [1, 2, 3, 4],
        'ids': ['a', 'b', 'f', 'n'],
        'ids2': ['a', 'n', 'c', 'n'],
    }
)
df

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [116]:
values = ['a', 'b', 1, 3]
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [117]:
df[df.isin(values)]

Unnamed: 0,vals,ids,ids2
0,1.0,a,a
1,,b,
2,3.0,,
3,,,


In [118]:
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


In [119]:
# Per-column membership lists: a cell matches when its value is in the list
# given for its own column.
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
# Keep the rows in which at least one column matches. The axis is passed by
# keyword because the positional form `any(1)` is not accepted by pandas 2.x.
row_mask = df.isin(values).any(axis=1)
df[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c


In [120]:
# Keep only the rows in which every column matches its membership list. The axis
# is passed by keyword because the positional form `all(1)` is not accepted by
# pandas 2.x.
row_mask = df.isin(values).all(axis=1)
df[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a


### The where() Method and Masking

In [121]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [122]:
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [123]:
# Rebind df: 8x4 standard-normal draws indexed by eight daily dates from 2000-01-01.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=pd.date_range('20000101', periods=8),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,-1.004022,-0.067544
2000-01-02,-1.203796,-0.835754,-2.152295,-0.255248
2000-01-03,-0.382727,0.232444,0.331113,-0.462641
2000-01-04,1.249957,-0.528781,-0.304522,-1.578659
2000-01-05,-1.691983,-0.934939,-0.224419,0.368505
2000-01-06,-0.118174,-0.191694,-0.098381,-1.611758
2000-01-07,-0.514084,-1.700765,-1.914289,-0.16563
2000-01-08,-0.961066,0.911104,-0.737668,1.508339


In [124]:
df.where(df < 0,-df)

Unnamed: 0,A,B,C,D
2000-01-01,-0.859548,-0.671978,-1.004022,-0.067544
2000-01-02,-1.203796,-0.835754,-2.152295,-0.255248
2000-01-03,-0.382727,-0.232444,-0.331113,-0.462641
2000-01-04,-1.249957,-0.528781,-0.304522,-1.578659
2000-01-05,-1.691983,-0.934939,-0.224419,-0.368505
2000-01-06,-0.118174,-0.191694,-0.098381,-1.611758
2000-01-07,-0.514084,-1.700765,-1.914289,-0.16563
2000-01-08,-0.961066,-0.911104,-0.737668,-1.508339


In [125]:
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [126]:
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,0.0,0.0
2000-01-02,0.0,0.0,0.0,0.0
2000-01-03,0.0,0.232444,0.331113,0.0
2000-01-04,1.249957,0.0,0.0,0.0
2000-01-05,0.0,0.0,0.0,0.368505
2000-01-06,0.0,0.0,0.0,0.0
2000-01-07,0.0,0.0,0.0,0.0
2000-01-08,0.0,0.911104,0.0,1.508339


In [127]:
df2 = df.copy()
df2[df2[1:4] > 0] = 3
df2

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,-1.004022,-0.067544
2000-01-02,-1.203796,-0.835754,-2.152295,-0.255248
2000-01-03,-0.382727,3.0,3.0,-0.462641
2000-01-04,3.0,-0.528781,-0.304522,-1.578659
2000-01-05,-1.691983,-0.934939,-0.224419,0.368505
2000-01-06,-0.118174,-0.191694,-0.098381,-1.611758
2000-01-07,-0.514084,-1.700765,-1.914289,-0.16563
2000-01-08,-0.961066,0.911104,-0.737668,1.508339


In [128]:
df

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,-1.004022,-0.067544
2000-01-02,-1.203796,-0.835754,-2.152295,-0.255248
2000-01-03,-0.382727,0.232444,0.331113,-0.462641
2000-01-04,1.249957,-0.528781,-0.304522,-1.578659
2000-01-05,-1.691983,-0.934939,-0.224419,0.368505
2000-01-06,-0.118174,-0.191694,-0.098381,-1.611758
2000-01-07,-0.514084,-1.700765,-1.914289,-0.16563
2000-01-08,-0.961066,0.911104,-0.737668,1.508339


In [129]:
df2 = df.copy()
df2.where(df2 > 0, df2['C'], axis=0)

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,-1.004022,-1.004022
2000-01-02,-2.152295,-2.152295,-2.152295,-2.152295
2000-01-03,0.331113,0.232444,0.331113,0.331113
2000-01-04,1.249957,-0.304522,-0.304522,-0.304522
2000-01-05,-0.224419,-0.224419,-0.224419,0.368505
2000-01-06,-0.098381,-0.098381,-0.098381,-0.098381
2000-01-07,-1.914289,-1.914289,-1.914289,-1.914289
2000-01-08,-0.737668,0.911104,-0.737668,1.508339


In [130]:
df2 = df.copy()
df2.apply(lambda x, y: x.where(x > 0, y), y=df['C'])

Unnamed: 0,A,B,C,D
2000-01-01,0.859548,0.671978,-1.004022,-1.004022
2000-01-02,-2.152295,-2.152295,-2.152295,-2.152295
2000-01-03,0.331113,0.232444,0.331113,0.331113
2000-01-04,1.249957,-0.304522,-0.304522,-0.304522
2000-01-05,-0.224419,-0.224419,-0.224419,0.368505
2000-01-06,-0.098381,-0.098381,-0.098381,-0.098381
2000-01-07,-1.914289,-1.914289,-1.914289,-1.914289
2000-01-08,-0.737668,0.911104,-0.737668,1.508339


#### The mask() method

In [131]:
s.mask(s >= 0)

4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [132]:
df.mask(df >= 0)

Unnamed: 0,A,B,C,D
2000-01-01,,,-1.004022,-0.067544
2000-01-02,-1.203796,-0.835754,-2.152295,-0.255248
2000-01-03,-0.382727,,,-0.462641
2000-01-04,,-0.528781,-0.304522,-1.578659
2000-01-05,-1.691983,-0.934939,-0.224419,
2000-01-06,-0.118174,-0.191694,-0.098381,-1.611758
2000-01-07,-0.514084,-1.700765,-1.914289,-0.16563
2000-01-08,-0.961066,,-0.737668,


### The query() Method

In [133]:
# Rebind df: n rows of standard-normal draws in columns 'a', 'b', 'c'.
n = 10
df = pd.DataFrame(np.random.randn(n, 3), columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,-0.742566,1.343673,0.569181
1,-0.727264,-0.520258,2.082135
2,-0.107565,-0.872154,-1.745796
3,-0.813806,0.407355,1.351234
4,-1.039393,1.481584,1.541697
5,0.368608,0.260044,0.627391
6,-0.637694,-0.981685,0.578819
7,0.267488,1.481877,0.30115
8,0.109993,0.303304,0.798306
9,-0.753214,-0.863889,-0.42279


In [134]:
df[(df['a']<df['b'])&(df['b']<df['c'])]

Unnamed: 0,a,b,c
1,-0.727264,-0.520258,2.082135
3,-0.813806,0.407355,1.351234
4,-1.039393,1.481584,1.541697
8,0.109993,0.303304,0.798306


In [137]:
df.query('(a<b)&(b<c)')

Unnamed: 0,a,b,c
1,-0.727264,-0.520258,2.082135
3,-0.813806,0.407355,1.351234
4,-1.039393,1.481584,1.541697
8,0.109993,0.303304,0.798306


In [142]:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df

Unnamed: 0,b,c
0,9,6
1,3,8
2,2,9
3,5,5
4,8,6
5,3,8
6,3,5
7,4,2
8,0,5
9,4,3


In [145]:
df.index.name = 'a'
df

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,6
1,3,8
2,2,9
3,5,5
4,8,6
5,3,8
6,3,5
7,4,2
8,0,5
9,4,3


In [146]:
df.query('a < b and b < c')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,8


In [147]:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df

Unnamed: 0,b,c
0,4,6
1,8,1
2,2,2
3,8,7
4,5,2
5,3,6
6,0,6
7,0,3
8,0,3
9,2,3


In [148]:
df.query('index<b<c')

Unnamed: 0,b,c
0,4,6


In [149]:
df = pd.DataFrame({'a': np.random.randint(5, size=5)})
df

Unnamed: 0,a
0,0
1,0
2,4
3,1
4,3


In [151]:
df.index.name = 'b'
df

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
0,0
1,0
2,4
3,1
4,3


In [153]:
df.query('b>2')

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
3,1
4,3


#### MultiIndex query() 

In [154]:
# Draw n random labels for each of two levels: a colour and a food per row.
n = 10
colors = np.random.choice(['red', 'green'], size=n)
foods = np.random.choice(['eggs', 'ham'], size=n)

In [156]:
colors, foods

(array(['green', 'red', 'red', 'green', 'red', 'green', 'red', 'green',
        'red', 'red'], dtype='<U5'),
 array(['eggs', 'eggs', 'ham', 'eggs', 'ham', 'eggs', 'ham', 'eggs',
        'eggs', 'eggs'], dtype='<U4'))

In [157]:
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index

MultiIndex([('green', 'eggs'),
            (  'red', 'eggs'),
            (  'red',  'ham'),
            ('green', 'eggs'),
            (  'red',  'ham'),
            ('green', 'eggs'),
            (  'red',  'ham'),
            ('green', 'eggs'),
            (  'red', 'eggs'),
            (  'red', 'eggs')],
           names=['color', 'food'])

In [158]:
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
green,eggs,-0.224896,-0.741689
red,eggs,-0.502092,-2.542527
red,ham,-0.034633,0.330883
green,eggs,-0.090897,-0.197988
red,ham,-0.231698,-0.089791
green,eggs,-0.459482,0.356236
red,ham,-0.315437,0.026225
green,eggs,0.779881,-1.101037
red,eggs,-1.039611,-0.558343
red,eggs,-0.666928,-0.081817


In [160]:
df.query('color =="red"')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,eggs,-0.502092,-2.542527
red,ham,-0.034633,0.330883
red,ham,-0.231698,-0.089791
red,ham,-0.315437,0.026225
red,eggs,-1.039611,-0.558343
red,eggs,-0.666928,-0.081817


#### The `in` and `not in` operators

In [172]:
# Rebind df: twelve rows; 'a' and 'b' hold repeated letters, 'c' and 'd' hold
# random integers drawn from [0, 5) and [0, 9) respectively.
df = pd.DataFrame(
    {
        'a': list('aabbccddeeff'),
        'b': list('aaaabbbbcccc'),
        'c': np.random.randint(5, size=12),
        'd': np.random.randint(9, size=12),
    }
)
df

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1
6,d,b,1,0
7,d,b,0,5
8,e,c,1,5
9,e,c,0,5


In [178]:
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1


In [179]:
# or 
df[df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1


In [165]:
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,1,5
7,d,b,4,8
8,e,c,2,1
9,e,c,1,6
10,f,c,2,5
11,f,c,0,3


In [167]:
df[~df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
6,d,b,1,5
7,d,b,4,8
8,e,c,2,1
9,e,c,1,6
10,f,c,2,5
11,f,c,0,3


In [181]:
df.query('a in b and c < d')

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1


In [180]:
df[df['a'].isin(df['b']) & (df['c'] < df['d'])]

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1


#### Special use of the == operator with list objects

In [182]:
df.query('b == ["a", "b", "c"]')

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1
6,d,b,1,0
7,d,b,0,5
8,e,c,1,5
9,e,c,0,5


In [183]:
df[df['b'].isin(["a", "b", "c"])]

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
5,c,b,0,1
6,d,b,1,0
7,d,b,0,5
8,e,c,1,5
9,e,c,0,5


In [184]:
df.query('c == [1, 2]')

Unnamed: 0,a,b,c,d
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
6,d,b,1,0
8,e,c,1,5


In [185]:
df.query('c != [1, 2]')

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
5,c,b,0,1
7,d,b,0,5
9,e,c,0,5
10,f,c,0,0
11,f,c,4,0


In [186]:
 df.query('[1, 2] in c')

Unnamed: 0,a,b,c,d
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
6,d,b,1,0
8,e,c,1,5


In [187]:
df.query('[1, 2] not in c')

Unnamed: 0,a,b,c,d
0,a,a,0,5
1,a,a,3,4
5,c,b,0,1
7,d,b,0,5
9,e,c,0,5
10,f,c,0,0
11,f,c,4,0


In [188]:
 df[df['c'].isin([1, 2])]

Unnamed: 0,a,b,c,d
2,b,a,2,8
3,b,a,2,5
4,c,b,1,7
6,d,b,1,0
8,e,c,1,5


In [189]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))

In [190]:
len(df)

10

In [192]:
np.random.rand(len(df))>0.5

array([ True,  True,  True, False, False, False, False, False, False,
        True])

In [193]:
df['bool']=np.random.rand(len(df))>0.5
df

Unnamed: 0,a,b,c,bool
0,0.753958,0.49896,0.39091,False
1,0.748365,0.44982,0.55094,False
2,0.236924,0.643903,0.018118,False
3,0.25396,0.635503,0.832037,False
4,0.518352,0.359664,0.765846,False
5,0.768351,0.337504,0.516265,True
6,0.502757,0.467533,0.354854,False
7,0.282472,0.021438,0.777803,False
8,0.997549,0.297885,0.783058,False
9,0.920225,0.059543,0.524091,False


In [195]:
df.query('bool')

Unnamed: 0,a,b,c,bool
5,0.768351,0.337504,0.516265,True


In [196]:
df.query('~bool')

Unnamed: 0,a,b,c,bool
0,0.753958,0.49896,0.39091,False
1,0.748365,0.44982,0.55094,False
2,0.236924,0.643903,0.018118,False
3,0.25396,0.635503,0.832037,False
4,0.518352,0.359664,0.765846,False
6,0.502757,0.467533,0.354854,False
7,0.282472,0.021438,0.777803,False
8,0.997549,0.297885,0.783058,False
9,0.920225,0.059543,0.524091,False


In [204]:
df.query('not bool') == df[~df['bool']]

Unnamed: 0,a,b,c,bool
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True
6,True,True,True,True
7,True,True,True,True
8,True,True,True,True
9,True,True,True,True


In [206]:
df.query('not bool') == df.query('~bool')

Unnamed: 0,a,b,c,bool
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True
6,True,True,True,True
7,True,True,True,True
8,True,True,True,True
9,True,True,True,True


#### Performance of query()



- You will only see the performance benefits of using the `numexpr` engine with `DataFrame.query()` **if your frame has more than approximately 200,000 rows**.

### Duplicate data

In [213]:
# Rebind df2: column 'a' contains repeated values, and the pair ('two', 'x')
# occurs twice across columns 'a' and 'b'.
df2 = pd.DataFrame(
    {
        'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
        'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
        'c': np.random.randn(7),
    }
)
df2

Unnamed: 0,a,b,c
0,one,x,-0.892284
1,one,y,1.138734
2,two,x,-1.168535
3,two,y,-1.697206
4,two,x,0.940258
5,three,x,-0.3683
6,four,x,-0.506936


In [218]:
df2.duplicated(['a'])

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [219]:
df2.duplicated(['a'],keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [220]:
df2.duplicated(['a'],keep=False)

0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [222]:
df2.drop_duplicates('a',keep='last')

Unnamed: 0,a,b,c
1,one,y,1.138734
4,two,x,0.940258
5,three,x,-0.3683
6,four,x,-0.506936


In [223]:
df2.drop_duplicates('a',keep='first')

Unnamed: 0,a,b,c
0,one,x,-0.892284
2,two,x,-1.168535
5,three,x,-0.3683
6,four,x,-0.506936


In [224]:
df2.drop_duplicates('a', keep=False)

Unnamed: 0,a,b,c
5,three,x,-0.3683
6,four,x,-0.506936


In [225]:
df2.duplicated(['a', 'b'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [226]:
df2.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,one,x,-0.892284
1,one,y,1.138734
2,two,x,-1.168535
3,two,y,-1.697206
5,three,x,-0.3683
6,four,x,-0.506936


In [228]:
# Rebind df3: six rows whose index repeats the labels 'a' and 'b'.
df3 = pd.DataFrame(
    {
        'a': np.arange(6),
        'b': np.random.randn(6),
    },
    index=['a', 'a', 'b', 'c', 'b', 'a'],
)
df3

Unnamed: 0,a,b
a,0,0.771656
a,1,1.818501
b,2,0.956649
c,3,-1.325237
b,4,0.852499
a,5,0.995913


In [229]:
df3.index.duplicated()

array([False,  True, False, False,  True,  True])

In [231]:
df3[~df3.index.duplicated()]

Unnamed: 0,a,b
a,0,0.771656
b,2,0.956649
c,3,-1.325237


In [233]:
# No `~` here: this selects the rows that ARE flagged as duplicates, i.e. every
# occurrence of an index label except its last one.
df3.loc[df3.index.duplicated(keep='last')]

Unnamed: 0,a,b
a,0,0.771656
a,1,1.818501
b,2,0.956649


In [235]:
df3[ ~df3.index.duplicated(keep=False)]

Unnamed: 0,a,b
c,3,-1.325237


### Dictionary-like get() method

In [237]:
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s

a    1
b    2
c    3
dtype: int64

In [240]:
# Like s['a']: returns the value stored under the label 'a'.
s.get('a') 

1

In [243]:
# If the label is not found, get() returns the supplied default value.
s.get('g',default= -1) 

-1

### The lookup() method (deprecated in pandas 1.2, removed in pandas 2.0)

In [251]:
# 20x4 frame of uniform random draws with columns 'A'..'D'.
dflookup = pd.DataFrame(
    np.random.rand(20, 4),
    columns=list('ABCD'),
)
dflookup

Unnamed: 0,A,B,C,D
0,0.576016,0.680801,0.194482,0.51654
1,0.570523,0.652139,0.231126,0.632979
2,0.198877,0.743108,0.960728,0.304529
3,0.086445,0.363323,0.945304,0.226363
4,0.626073,0.223408,0.66719,0.133121
5,0.722288,0.519872,0.818702,0.764082
6,0.092928,0.623468,0.951228,0.674362
7,0.416909,0.022027,0.26952,0.793054
8,0.399445,0.198614,0.957417,0.930874
9,0.972729,0.475977,0.65739,0.53695


In [259]:
# Values of column 'D' at the even row labels 0, 2, 4, 6, 8, as a plain Python list.
# Label-based `.loc` selection is used because `DataFrame.lookup` was deprecated
# in pandas 1.2 and removed in 2.0.
dflookup.loc[list(range(0, 10, 2)), 'D'].tolist()

[0.5165396136572387,
 0.3045285068212318,
 0.13312060433960693,
 0.6743620533376404,
 0.9308739590783949]

In [260]:
# Pair each row label with its own column label:
# (0, 'B'), (2, 'C'), (4, 'A'), (6, 'B'), (8, 'D').
# `DataFrame.lookup` was deprecated in pandas 1.2 and removed in 2.0, so the
# labels are translated to positions with get_indexer() and the values are picked
# out of the frame's NumPy array element by element.
dflookup.to_numpy()[
    dflookup.index.get_indexer(list(range(0, 10, 2))),
    dflookup.columns.get_indexer(['B', 'C', 'A', 'B', 'D'])
]

array([0.68080052, 0.96072846, 0.62607311, 0.62346819, 0.93087396])

### Set / reset index

In [261]:
dflookup

Unnamed: 0,A,B,C,D
0,0.576016,0.680801,0.194482,0.51654
1,0.570523,0.652139,0.231126,0.632979
2,0.198877,0.743108,0.960728,0.304529
3,0.086445,0.363323,0.945304,0.226363
4,0.626073,0.223408,0.66719,0.133121
5,0.722288,0.519872,0.818702,0.764082
6,0.092928,0.623468,0.951228,0.674362
7,0.416909,0.022027,0.26952,0.793054
8,0.399445,0.198614,0.957417,0.930874
9,0.972729,0.475977,0.65739,0.53695


In [262]:
dflookup.set_index('A')

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.576016,0.680801,0.194482,0.51654
0.570523,0.652139,0.231126,0.632979
0.198877,0.743108,0.960728,0.304529
0.086445,0.363323,0.945304,0.226363
0.626073,0.223408,0.66719,0.133121
0.722288,0.519872,0.818702,0.764082
0.092928,0.623468,0.951228,0.674362
0.416909,0.022027,0.26952,0.793054
0.399445,0.198614,0.957417,0.930874
0.972729,0.475977,0.65739,0.53695


In [264]:
dflookup.set_index(['A','B'])

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
0.576016,0.680801,0.194482,0.51654
0.570523,0.652139,0.231126,0.632979
0.198877,0.743108,0.960728,0.304529
0.086445,0.363323,0.945304,0.226363
0.626073,0.223408,0.66719,0.133121
0.722288,0.519872,0.818702,0.764082
0.092928,0.623468,0.951228,0.674362
0.416909,0.022027,0.26952,0.793054
0.399445,0.198614,0.957417,0.930874
0.972729,0.475977,0.65739,0.53695


In [339]:
frame = dflookup.set_index('C', drop=False)
frame

Unnamed: 0_level_0,A,B,C,D
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.194482,0.576016,0.680801,0.194482,0.51654
0.231126,0.570523,0.652139,0.231126,0.632979
0.960728,0.198877,0.743108,0.960728,0.304529
0.945304,0.086445,0.363323,0.945304,0.226363
0.66719,0.626073,0.223408,0.66719,0.133121
0.818702,0.722288,0.519872,0.818702,0.764082
0.951228,0.092928,0.623468,0.951228,0.674362
0.26952,0.416909,0.022027,0.26952,0.793054
0.957417,0.399445,0.198614,0.957417,0.930874
0.65739,0.972729,0.475977,0.65739,0.53695


In [340]:
frame = frame.set_index(['A', 'B'], append=True)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,C,D
C,A,B,Unnamed: 3_level_1,Unnamed: 4_level_1
0.194482,0.576016,0.680801,0.194482,0.51654
0.231126,0.570523,0.652139,0.231126,0.632979
0.960728,0.198877,0.743108,0.960728,0.304529
0.945304,0.086445,0.363323,0.945304,0.226363
0.66719,0.626073,0.223408,0.66719,0.133121
0.818702,0.722288,0.519872,0.818702,0.764082
0.951228,0.092928,0.623468,0.951228,0.674362
0.26952,0.416909,0.022027,0.26952,0.793054
0.957417,0.399445,0.198614,0.957417,0.930874
0.65739,0.972729,0.475977,0.65739,0.53695


In [341]:
# drop=True (the default) removes 'C' from the columns once it becomes the index.
dflookup.set_index('C', drop=True)

Unnamed: 0_level_0,A,B,D
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.194482,0.576016,0.680801,0.51654
0.231126,0.570523,0.652139,0.632979
0.960728,0.198877,0.743108,0.304529
0.945304,0.086445,0.363323,0.226363
0.66719,0.626073,0.223408,0.133121
0.818702,0.722288,0.519872,0.764082
0.951228,0.092928,0.623468,0.674362
0.26952,0.416909,0.022027,0.793054
0.957417,0.399445,0.198614,0.930874
0.65739,0.972729,0.475977,0.53695


#### Reset the index

In [342]:
# dflookup still has its default 0..9 integer index; the set_index calls above returned new frames.
dflookup

Unnamed: 0,A,B,C,D
0,0.576016,0.680801,0.194482,0.51654
1,0.570523,0.652139,0.231126,0.632979
2,0.198877,0.743108,0.960728,0.304529
3,0.086445,0.363323,0.945304,0.226363
4,0.626073,0.223408,0.66719,0.133121
5,0.722288,0.519872,0.818702,0.764082
6,0.092928,0.623468,0.951228,0.674362
7,0.416909,0.022027,0.26952,0.793054
8,0.399445,0.198614,0.957417,0.930874
9,0.972729,0.475977,0.65739,0.53695


In [343]:
# reset_index moves the current index into a new column named 'index' and installs a fresh 0..n-1 index.
dflookup.reset_index()

Unnamed: 0,index,A,B,C,D
0,0,0.576016,0.680801,0.194482,0.51654
1,1,0.570523,0.652139,0.231126,0.632979
2,2,0.198877,0.743108,0.960728,0.304529
3,3,0.086445,0.363323,0.945304,0.226363
4,4,0.626073,0.223408,0.66719,0.133121
5,5,0.722288,0.519872,0.818702,0.764082
6,6,0.092928,0.623468,0.951228,0.674362
7,7,0.416909,0.022027,0.26952,0.793054
8,8,0.399445,0.198614,0.957417,0.930874
9,9,0.972729,0.475977,0.65739,0.53695


In [344]:
# Current state of frame: three index levels (C, A, B) and columns C, D.
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,C,D
C,A,B,Unnamed: 3_level_1,Unnamed: 4_level_1
0.194482,0.576016,0.680801,0.194482,0.51654
0.231126,0.570523,0.652139,0.231126,0.632979
0.960728,0.198877,0.743108,0.960728,0.304529
0.945304,0.086445,0.363323,0.945304,0.226363
0.66719,0.626073,0.223408,0.66719,0.133121
0.818702,0.722288,0.519872,0.818702,0.764082
0.951228,0.092928,0.623468,0.951228,0.674362
0.26952,0.416909,0.022027,0.26952,0.793054
0.957417,0.399445,0.198614,0.957417,0.930874
0.65739,0.972729,0.475977,0.65739,0.53695


In [345]:
# Move index levels 'A' and 'B' back into regular columns (drop=False keeps
# their values); 'C' stays as the index.
# Reassign the result instead of using inplace=True: in-place mutation silently
# changes an object that earlier cells already displayed and prevents chaining.
frame = frame.reset_index(level=['A', 'B'], drop=False)
frame

Unnamed: 0_level_0,A,B,C,D
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.194482,0.576016,0.680801,0.194482,0.51654
0.231126,0.570523,0.652139,0.231126,0.632979
0.960728,0.198877,0.743108,0.960728,0.304529
0.945304,0.086445,0.363323,0.945304,0.226363
0.66719,0.626073,0.223408,0.66719,0.133121
0.818702,0.722288,0.519872,0.818702,0.764082
0.951228,0.092928,0.623468,0.951228,0.674362
0.26952,0.416909,0.022027,0.26952,0.793054
0.957417,0.399445,0.198614,0.957417,0.930874
0.65739,0.972729,0.475977,0.65739,0.53695


In [347]:
# 'C' is both the index and a column here: set_index replaces the index with the
# column (dropping the column), and reset_index turns it back into a column.
(
    frame
    .set_index('C', drop=True)
    .reset_index()
)

Unnamed: 0,C,A,B,D
0,0.194482,0.576016,0.680801,0.51654
1,0.231126,0.570523,0.652139,0.632979
2,0.960728,0.198877,0.743108,0.304529
3,0.945304,0.086445,0.363323,0.226363
4,0.66719,0.626073,0.223408,0.133121
5,0.818702,0.722288,0.519872,0.764082
6,0.951228,0.092928,0.623468,0.674362
7,0.26952,0.416909,0.022027,0.793054
8,0.957417,0.399445,0.198614,0.930874
9,0.65739,0.972729,0.475977,0.53695


## MultiIndex advanced indexing

### Creating a MultiIndex (hierarchical index) object

In [348]:
# Two parallel label lists: outer-level labels first, inner-level labels second.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]

In [353]:
# Transpose the two label lists into one (first, second) tuple per row.
tuples = list(zip(*arrays))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [355]:
# Build the MultiIndex from the label tuples and name its two levels.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [376]:
# Series of 8 random values whose row labels are the two-level MultiIndex `index`.
s = pd.Series(np.random.randn(8),index=index)
s

first  second
bar    one      -0.435659
       two      -0.447149
baz    one      -0.024953
       two       0.498038
foo    one       2.095601
       two      -0.593423
qux    one      -0.914377
       two       3.093078
dtype: float64

In [378]:
# from_product pairs every outer label with every inner label (Cartesian product).
iterables = [
    ['bar', 'baz', 'foo', 'qux'],
    ['one', 'two'],
]
pd.MultiIndex.from_product(iterables, names=['first', 'second'])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [380]:
# Each row is one (first, second) label pair.
df = pd.DataFrame(
    [
        ['bar', 'one'],
        ['bar', 'two'],
        ['foo', 'one'],
        ['foo', 'two'],
    ],
    columns=['first', 'second'],
)
df

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,foo,one
3,foo,two


In [381]:
# Each row of df becomes one index entry; the column names become the level names.
pd.MultiIndex.from_frame(df)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('foo', 'one'),
            ('foo', 'two')],
           names=['first', 'second'])

In [383]:
# Same labels as before, this time as a list of two NumPy string arrays.
arrays = [
    np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
    np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']),
]
arrays

[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
       dtype='<U3'),
 array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
       dtype='<U3')]

In [384]:
# Passing a list of arrays as the index builds a MultiIndex automatically (its levels are unnamed).
s = pd.Series(np.random.randn(8), index=arrays)
s

bar  one   -1.507829
     two   -0.008754
baz  one   -0.278393
     two   -1.283055
foo  one   -0.068310
     two   -0.512336
qux  one    0.156587
     two   -0.000454
dtype: float64

In [385]:
# The same list of arrays also works as a DataFrame row index; columns default to 0..3.
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-1.381205,-1.529208,-0.749974,0.67867
bar,two,0.300026,-0.676939,1.556637,-0.369393
baz,one,1.287208,-0.176588,1.741056,-0.376511
baz,two,0.205275,0.520364,0.071711,-0.15598
foo,one,-0.48241,1.06396,-1.78464,-2.835917
foo,two,0.129395,0.496826,-0.241224,-0.668092
qux,one,1.51686,-1.872735,0.898648,0.001625
qux,two,0.295061,0.113847,0.79691,0.746156


In [387]:
# Here the MultiIndex labels the columns; the rows use plain labels A, B, C.
df = pd.DataFrame(
    np.random.randn(3, 8),
    index=list('ABC'),
    columns=index,
)
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.653717,1.032425,3.000191,0.433147,-1.372312,0.64624,1.726759,-1.455291
B,-2.338609,0.334652,-1.511183,0.024313,-0.264666,0.772714,0.151716,-0.554727
C,1.750478,-0.51568,-0.145374,-0.704822,-0.634,1.688334,0.116658,-0.418727


In [388]:
# A MultiIndex can label both axes at once: the first six entries of `index` are used for rows and columns.
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,1.369156,1.837369,-0.07268,-1.133263,-1.025338,-1.5181
bar,two,-1.675023,0.891167,2.441338,-1.062225,1.151517,0.700196
baz,one,0.575061,0.056342,1.656075,2.497947,0.398918,-1.141981
baz,two,3.097363,-0.121935,0.218032,0.969896,1.232805,0.653773
foo,one,0.402258,-0.907295,-1.211977,1.269125,-0.305796,-0.976733
foo,two,1.435545,0.13048,0.705188,1.571886,-1.11381,-0.985585


In [389]:
 pd.Series(np.random.randn(8), index=tuples)

(bar, one)   -0.600445
(bar, two)   -1.222072
(baz, one)    0.993520
(baz, two)    1.702915
(foo, one)   -0.220912
(foo, two)    1.852933
(qux, one)   -0.180894
(qux, two)   -0.936582
dtype: float64

In [392]:
# Labels of the outer level (position 0, named 'first') for every entry of the index.
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [393]:
# Labels of the inner level (position 1, named 'second') for every entry of the index.
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

### Basic indexing on axis with MultiIndex

In [394]:
# Selecting on the outer column level returns the sub-frame of columns under 'bar'.
df['bar']

second,one,two
A,0.653717,1.032425
B,-2.338609,0.334652
C,1.750478,-0.51568


In [395]:
 df['bar', 'one']

A    0.653717
B   -2.338609
C    1.750478
Name: (bar, one), dtype: float64

In [396]:
# Selecting on the outer index level returns the Series of inner-level entries under 'qux'.
s['qux']

one    0.156587
two   -0.000454
dtype: float64

## Merge, join, and concatenate

### Concatenating objects

In [403]:
# First input frame for the concat examples: rows labelled 0..3.
df1 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=[0, 1, 2, 3],
)
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [405]:
# Second input frame: same columns as df1, rows labelled 4..7.
df2 = pd.DataFrame(
    {
        'A': ['A4', 'A5', 'A6', 'A7'],
        'B': ['B4', 'B5', 'B6', 'B7'],
        'C': ['C4', 'C5', 'C6', 'C7'],
        'D': ['D4', 'D5', 'D6', 'D7'],
    },
    index=[4, 5, 6, 7],
)
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [406]:
# Third input frame: same columns again, rows labelled 8..11.
df3 = pd.DataFrame(
    {
        'A': ['A8', 'A9', 'A10', 'A11'],
        'B': ['B8', 'B9', 'B10', 'B11'],
        'C': ['C8', 'C9', 'C10', 'C11'],
        'D': ['D8', 'D9', 'D10', 'D11'],
    },
    index=[8, 9, 10, 11],
)
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [412]:
# Stack the three frames on top of each other (row-wise).
result = pd.concat([df1, df2, df3], axis='index')
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [414]:
# keys adds an outer index level that records which input frame each row came from.
result = pd.concat(
    [df1, df2, df3],
    axis='index',
    keys=['df1', 'df2', 'df3'],
)
result

Unnamed: 0,Unnamed: 1,A,B,C,D
df1,0,A0,B0,C0,D0
df1,1,A1,B1,C1,D1
df1,2,A2,B2,C2,D2
df1,3,A3,B3,C3,D3
df2,4,A4,B4,C4,D4
df2,5,A5,B5,C5,D5
df2,6,A6,B6,C6,D6
df2,7,A7,B7,C7,D7
df3,8,A8,B8,C8,D8
df3,9,A9,B9,C9,D9


In [417]:
# Select every row whose outer index label is 'df1', i.e. the rows that came from df1.
result.loc['df1']

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [418]:
# Frame that only partly overlaps df1: columns B, D, F and row labels 2, 3, 6, 7.
df4 = pd.DataFrame(
    {
        'B': ['B2', 'B3', 'B6', 'B7'],
        'D': ['D2', 'D3', 'D6', 'D7'],
        'F': ['F2', 'F3', 'F6', 'F7'],
    },
    index=[2, 3, 6, 7],
)
df4

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6
7,B7,D7,F7


In [423]:
# set_index(df1.index) overwrites df4's own labels (2, 3, 6, 7) with df1's (0..3),
# so the rows are paired by position when the frames are placed side by side.
result = pd.concat([df1, df4.set_index(df1.index)], axis='columns')
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,B2,D2,F2
1,A1,B1,C1,D1,B3,D3,F3
2,A2,B2,C2,D2,B6,D6,F6
3,A3,B3,C3,D3,B7,D7,F7


In [424]:
# reindex aligns df4 to df1's labels instead: labels 0 and 1 do not exist in df4, so those rows are NaN.
pd.concat([df1, df4.reindex(df1.index)], axis=1)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [1]:
# Report the directory the notebook kernel is currently running in.
# NOTE(review): this cell has execution count 1 but sits at the end of the notebook, and its
# saved output exposes an absolute local path. Consider moving the import to the top import
# cell and clearing the output before sharing.
import os
os.getcwd()

'C:\\Users\\Administrator\\Pandas_tutorial'