In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global generator once, up front. The later cells draw
# from it via ``np.random.randn`` and via ``sample()`` calls that pass no
# explicit ``random_state``, so a fixed seed makes Restart & Run All
# reproduce the same outputs.
SEED = 0
np.random.seed(SEED)

# Label-based selection on a Series: pass a list of labels to .loc.
ser = pd.Series(range(5), index=list("abcde"))

ser.loc[["a", "c", "e"]]

a    0
c    2
e    4
dtype: int64

In [2]:
# 5x5 grid of the integers 0..24 carrying the same letters on both axes;
# rows a/c/e and columns b/d are then picked out by label.
df = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    columns=list("abcde"),
    index=list("abcde"),
)

df.loc[["a", "c", "e"], ["b", "d"]]

Unnamed: 0,b,d
a,1,3
c,11,13
e,21,23


In [3]:
# Demo frame for the cells that follow: eight daily timestamps as the index
# and four columns of standard-normal noise.
dates = pd.date_range(start='1/1/2000', periods=8)

df = pd.DataFrame(
    data=np.random.standard_normal((8, 4)),  # same draw as randn(8, 4)
    columns=['A', 'B', 'C', 'D'],
    index=dates,
)

df

Unnamed: 0,A,B,C,D
2000-01-01,0.314466,-0.587864,1.220949,0.396933
2000-01-02,2.002123,-0.349228,-1.777809,0.223947
2000-01-03,0.408786,0.176971,-0.302385,-0.276823
2000-01-04,-0.343897,0.142302,1.345083,-0.881175
2000-01-05,-0.284611,1.967106,0.98918,0.398049
2000-01-06,-1.065154,-0.262861,0.071358,-0.037758
2000-01-07,-1.346624,0.606611,-1.468591,-1.544969
2000-01-08,-0.361386,1.309436,-0.08867,0.600459


In [4]:
# Pull column 'A' out as a Series, then look up a single value by its
# timestamp label (the sixth entry of `dates`).
s = df['A']

s[dates[5]]

np.float64(-1.0651537291708963)

In [5]:
# Assigning to a list of columns with [] writes the right-hand columns in
# order, so this exchanges the contents of 'A' and 'B' (compare the output
# with the frame displayed by the previous cell).
df[['B', 'A']] = df[['A', 'B']]

df

Unnamed: 0,A,B,C,D
2000-01-01,-0.587864,0.314466,1.220949,0.396933
2000-01-02,-0.349228,2.002123,-1.777809,0.223947
2000-01-03,0.176971,0.408786,-0.302385,-0.276823
2000-01-04,0.142302,-0.343897,1.345083,-0.881175
2000-01-05,1.967106,-0.284611,0.98918,0.398049
2000-01-06,-0.262861,-1.065154,0.071358,-0.037758
2000-01-07,0.606611,-1.346624,-1.468591,-1.544969
2000-01-08,1.309436,-0.361386,-0.08867,0.600459


In [6]:
# Select a subset of columns by passing a list of labels to [].
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.587864,0.314466
2000-01-02,-0.349228,2.002123
2000-01-03,0.176971,0.408786
2000-01-04,0.142302,-0.343897
2000-01-05,1.967106,-0.284611
2000-01-06,-0.262861,-1.065154
2000-01-07,0.606611,-1.346624
2000-01-08,1.309436,-0.361386


In [7]:
# The same assignment through .loc does NOT swap anything: the displayed
# 'A'/'B' values are identical to the previous cell's output. pandas aligns
# the right-hand DataFrame on its column labels before assigning via .loc.
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.587864,0.314466
2000-01-02,-0.349228,2.002123
2000-01-03,0.176971,0.408786
2000-01-04,0.142302,-0.343897
2000-01-05,1.967106,-0.284611
2000-01-06,-0.262861,-1.065154
2000-01-07,0.606611,-1.346624
2000-01-08,1.309436,-0.361386


In [8]:
# Swapping column values using raw values: .to_numpy() hands .loc a plain
# array with no column labels, and the displayed 'A'/'B' are exchanged.
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.314466,-0.587864
2000-01-02,2.002123,-0.349228
2000-01-03,0.408786,0.176971
2000-01-04,-0.343897,0.142302
2000-01-05,-0.284611,1.967106
2000-01-06,-1.065154,-0.262861
2000-01-07,-1.346624,0.606611
2000-01-08,-0.361386,1.309436


In [9]:
# Assignment through .iloc targets column positions 1 and 0; the output
# shows 'A' and 'B' exchanged again relative to the previous cell.
df.iloc[:, [1, 0]] = df[['A', 'B']]
df[['A','B']]

Unnamed: 0,A,B
2000-01-01,-0.587864,0.314466
2000-01-02,-0.349228,2.002123
2000-01-03,0.176971,0.408786
2000-01-04,0.142302,-0.343897
2000-01-05,1.967106,-0.284611
2000-01-06,-0.262861,-1.065154
2000-01-07,0.606611,-1.346624
2000-01-08,1.309436,-0.361386


In [10]:
# Accessing an index label on a Series, or a column on a DataFrame, directly
# as an attribute.
sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()  # separate copy of df used by the attribute-access cells
sa.b

np.int64(2)

In [11]:
# Read column 'A' via attribute access.
dfa.A

2000-01-01   -0.587864
2000-01-02   -0.349228
2000-01-03    0.176971
2000-01-04    0.142302
2000-01-05    1.967106
2000-01-06   -0.262861
2000-01-07    0.606611
2000-01-08    1.309436
Freq: D, Name: A, dtype: float64

In [12]:
# Attribute assignment writes to the existing label 'a' (see the output).
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [13]:
# Overwrite the existing column 'A' with 0..len-1 via attribute assignment.
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,0.314466,1.220949,0.396933
2000-01-02,1,2.002123,-1.777809,0.223947
2000-01-03,2,0.408786,-0.302385,-0.276823
2000-01-04,3,-0.343897,1.345083,-0.881175
2000-01-05,4,-0.284611,0.98918,0.398049
2000-01-06,5,-1.065154,0.071358,-0.037758
2000-01-07,6,-1.346624,-1.468591,-1.544969
2000-01-08,7,-0.361386,-0.08867,0.600459


In [14]:
# The same write spelled with [] item assignment.
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,0.314466,1.220949,0.396933
2000-01-02,1,2.002123,-1.777809,0.223947
2000-01-03,2,0.408786,-0.302385,-0.276823
2000-01-04,3,-0.343897,1.345083,-0.881175
2000-01-05,4,-0.284611,0.98918,0.398049
2000-01-06,5,-1.065154,0.071358,-0.037758
2000-01-07,6,-1.346624,-1.468591,-1.544969
2000-01-08,7,-0.361386,-0.08867,0.600459


In [15]:
# Slicing ranges: [] with an integer slice on a Series; here the first five
# entries.
s[:5]

2000-01-01    0.314466
2000-01-02    2.002123
2000-01-03    0.408786
2000-01-04   -0.343897
2000-01-05   -0.284611
Freq: D, Name: A, dtype: float64

In [16]:
# Every second entry.
s[::2]

2000-01-01    0.314466
2000-01-03    0.408786
2000-01-05   -0.284611
2000-01-07   -1.346624
Freq: 2D, Name: A, dtype: float64

In [17]:
# All entries in reverse order.
s[::-1]

2000-01-08   -0.361386
2000-01-07   -1.346624
2000-01-06   -1.065154
2000-01-05   -0.284611
2000-01-04   -0.343897
2000-01-03    0.408786
2000-01-02    2.002123
2000-01-01    0.314466
Freq: -1D, Name: A, dtype: float64

In [18]:
# Slicing a DataFrame with [] selects rows: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,-0.587864,0.314466,1.220949,0.396933
2000-01-02,-0.349228,2.002123,-1.777809,0.223947
2000-01-03,0.176971,0.408786,-0.302385,-0.276823


In [19]:
# Start at row position 2 and step backwards: rows 2, 1, 0.
df[2::-1]

Unnamed: 0,A,B,C,D
2000-01-03,0.176971,0.408786,-0.302385,-0.276823
2000-01-02,-0.349228,2.002123,-1.777809,0.223947
2000-01-01,-0.587864,0.314466,1.220949,0.396933


In [21]:
# Selection by label: a date-indexed frame sliced with date strings.
# Both endpoints of the label slice show up in the result.
dfl = pd.DataFrame(
    data=np.random.standard_normal((5, 4)),  # same draw as randn(5, 4)
    index=pd.date_range(start='20130101', periods=5),
    columns=list('ABCD'),
)
dfl.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.128981,1.988784,-0.553198,0.171505
2013-01-03,-0.431542,-1.945262,-0.453373,0.407728
2013-01-04,-0.106323,-0.881239,-1.363811,0.760094


In [22]:
# Six random values labelled 'a' through 'f'.
s1 = pd.Series(
    data=np.random.standard_normal(6),  # same draw as randn(6)
    index=list('abcdef'),
)
s1

a    0.319279
b    2.201077
c   -1.234000
d   -0.728501
e   -1.783595
f    0.043010
dtype: float64

In [23]:
# Label slice from 'c' through the end of the Series.
s1.loc['c':]

c   -1.234000
d   -0.728501
e   -1.783595
f    0.043010
dtype: float64

In [24]:
# A single label returns the scalar stored under it.
s1.loc['b']

np.float64(2.201077418728823)

In [28]:
# Assigning through a label slice: every entry from 'b' onward becomes 0.
s1.loc['b':] = 0
s1

a    0.319279
b    0.000000
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [29]:
# 6x4 frame of random values: rows labelled 'a'..'f', columns 'A'..'D'.
df1 = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),  # same draw as randn(6, 4)
    columns=list('ABCD'),
    index=list('abcdef'),
)

df1

Unnamed: 0,A,B,C,D
a,1.330373,0.219507,-0.9398,0.350427
b,-0.576895,1.124688,-1.623233,0.65918
c,0.008091,-0.444321,-0.165576,0.740095
d,0.334303,0.595128,0.226517,-1.087642
e,0.44964,0.475313,1.870103,1.653537
f,-0.797806,-0.242434,-0.379059,0.190605


In [30]:
# A list of row labels plus ':' for all columns.
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,1.330373,0.219507,-0.9398,0.350427
b,-0.576895,1.124688,-1.623233,0.65918
d,0.334303,0.595128,0.226517,-1.087642


In [32]:
# Boolean Series indexed by column name: which entries of row 'b' are positive.
df1.loc['b'] > 0

A    False
B     True
C    False
D     True
Name: b, dtype: bool

In [33]:
# Use a boolean Series built from row 'a' to choose which columns to keep.
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,A,B,D
a,1.330373,0.219507,0.350427
b,-0.576895,1.124688,0.65918
c,0.008091,-0.444321,0.740095
d,0.334303,0.595128,-1.087642
e,0.44964,0.475313,1.653537
f,-0.797806,-0.242434,0.190605


In [34]:
# Slicing with labels: the index is not sorted, and the slice returns the
# entries from label 3 through label 5 in index order (3, 2, 5).
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [35]:
# Returns the Series ordered by index label.
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [36]:
# On the sorted index, slice bounds that are not labels themselves (1 and 6)
# are accepted; the output holds the labels that fall between them.
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

In [37]:
# Index containing a duplicated label (2); the slice bounds 3 and 5 are
# themselves unique in the index.
s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2])
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [40]:
# Selection by position: ten random values whose integer labels
# (0, 2, ..., 18) deliberately differ from their positions.
s1 = pd.Series(
    data=np.random.standard_normal(10),  # same draw as randn(10)
    index=list(range(0, 20, 2)),
)
s1

0    -0.497291
2    -0.246938
4     0.045686
6     0.943786
8     0.481978
10    0.598440
12   -0.391552
14   -0.286952
16    0.033097
18   -1.038108
dtype: float64

In [41]:
# First six entries by position (labels 0 through 10 in the output).
s1.iloc[:6]

0    -0.497291
2    -0.246938
4     0.045686
6     0.943786
8     0.481978
10    0.598440
dtype: float64

In [43]:
# Assigning through a positional slice: the first three entries become 0.
s1.iloc[:3] = 0
s1

0     0.000000
2     0.000000
4     0.000000
6     0.943786
8     0.481978
10    0.598440
12   -0.391552
14   -0.286952
16    0.033097
18   -1.038108
dtype: float64

In [44]:
# 6x4 frame of random values with even integers as both row labels
# (0..10) and column labels (0..6), so labels never equal positions past 0.
df1 = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),  # same draw as randn(6, 4)
    columns=list(range(0, 8, 2)),
    index=list(range(0, 12, 2)),
)
df1

Unnamed: 0,0,2,4,6
0,0.823161,-2.167599,0.633544,0.518136
2,1.084002,-0.076324,0.23357,-0.350496
4,0.148158,-0.193706,0.567041,0.017326
6,1.051435,-1.103988,0.364355,1.175216
8,-1.487319,-0.089886,-1.417048,-0.307075
10,-1.314145,-1.019453,0.089999,0.384444


In [45]:
# First three rows by position.
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,0.823161,-2.167599,0.633544,0.518136
2,1.084002,-0.076324,0.23357,-0.350496
4,0.148158,-0.193706,0.567041,0.017326


In [46]:
# Positional slices on both axes: row positions 1-4, column positions 2-3.
df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,0.23357,-0.350496
4,0.567041,0.017326
6,0.364355,1.175216
8,-1.417048,-0.307075


In [47]:
# Lists of integer positions on both axes.
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,-0.076324,-0.350496
6,-1.103988,1.175216
10,-1.019453,0.384444


In [48]:
# Fresh 6x4 random frame with letter labels, used by the callable-indexer
# cells that follow.
df1 = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),  # same draw as randn(6, 4)
    columns=list('ABCD'),
    index=list('abcdef'),
)
df1

Unnamed: 0,A,B,C,D
a,0.911466,0.315141,-0.725107,0.190673
b,-1.380188,-0.134754,-0.080373,1.383816
c,0.592468,0.59697,-1.644931,-0.80151
d,-0.624535,0.550468,0.568672,-0.477662
e,0.326708,1.793771,0.690663,-0.701349
f,-0.778005,0.343137,-1.069912,-0.134756


In [49]:
# A callable passed to .loc receives the frame and returns the indexer;
# here it builds a boolean mask keeping rows where column 'A' is positive.
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,0.911466,0.315141,-0.725107,0.190673
c,0.592468,0.59697,-1.644931,-0.80151
e,0.326708,1.793771,0.690663,-0.701349


In [50]:
# The callable may also return a list of column labels.
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,0.911466,0.315141
b,-1.380188,-0.134754
c,0.592468,0.59697
d,-0.624535,0.550468
e,0.326708,1.793771
f,-0.778005,0.343137


In [55]:
# With .iloc the callable returns integer positions instead of labels.
df1.iloc[:, lambda df: [0, 1, 2]]

Unnamed: 0,A,B,C
a,0.911466,0.315141,-0.725107
b,-1.380188,-0.134754,-0.080373
c,0.592468,0.59697,-1.644931
d,-0.624535,0.550468,0.568672
e,0.326708,1.793771,0.690663
f,-0.778005,0.343137,-1.069912


In [56]:
# Callable indexer on a Series: keep the positive entries of column 'A'.
df1['A'].loc[lambda s: s > 0]

a    0.911466
c    0.592468
e    0.326708
Name: A, dtype: float64

In [58]:
# Combining positional and label-based indexing: a small 3x2 integer frame
# with rows labelled 'a', 'b', 'c'.
dfd = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=list('abc'),
)
dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [59]:
# Label form: turn row positions 0 and 2 into labels via the index, then
# select with .loc together with the column label.
dfd.loc[dfd.index[[0, 2]], 'A']

a    1
c    3
Name: A, dtype: int64

In [60]:
# Positional form: turn the column label into its integer position with
# get_loc, then select with .iloc.
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]

a    1
c    3
Name: A, dtype: int64

In [61]:
# get_indexer translates several column labels to positions at once.
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]

Unnamed: 0,A,B
a,1,4
c,3,6


In [62]:
# Reindexing: request labels 1, 2, 3 from a Series labelled 0, 1, 2.
# Label 3 does not exist, so it appears as NaN and the result is float64.
s = pd.Series([1, 2, 3])
s.reindex([1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [63]:
# Keep only those requested labels that are actually present in the index.
labels = [1, 2, 3]
s.loc[s.index.intersection(labels)]

1    2
2    3
dtype: int64

In [64]:
# Reindexing the filtered result to the full label list brings the missing
# label back as NaN.
s.loc[s.index.intersection(labels)].reindex(labels)

1    2.0
2    3.0
3    NaN
dtype: float64

In [65]:
# Selecting random samples: with no arguments, sample() returns one row
# (see the single-entry output).
s = pd.Series([0, 1, 2, 3, 4, 5])
s.sample()

1    1
dtype: int64

In [66]:
# One may specify either a number of rows (n):
s.sample(n=3)

0    0
4    4
5    5
dtype: int64

In [67]:
# Or a fraction of the rows (frac); 0.5 of six rows gives three:
s.sample(frac=0.5)

3    3
4    4
1    1
dtype: int64

In [68]:
# Weighted sampling: one weight per row, in row order. Rows 0 and 1 carry
# weight 0 here.
s = pd.Series([0, 1, 2, 3, 4, 5])
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)

5    5
2    2
4    4
dtype: int64

In [69]:
# Weights will be re-normalized automatically: these sum to 0.5, not 1,
# and only row 0 has a non-zero weight.
example_weights2 = [0.5, 0, 0, 0, 0, 0]
s.sample(n=1, weights=example_weights2)

0    0
dtype: int64

In [70]:
# For a DataFrame, the sampling weights can be given as the name of one of
# its own columns.
df2 = pd.DataFrame(
    data={
        'col1': [9, 8, 7, 6],
        'weight_column': [0.5, 0.4, 0.1, 0],
    }
)
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
0,9,0.5
1,8,0.4
2,7,0.1


In [None]:
# sample() with axis=1 draws from the columns rather than the rows;
# n=1 asks for a single column.
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3.sample(n=1, axis=1)