In [1]:
import pandas as pd
import numpy as np
# Label-based selection on a Series: build five integers labelled 'a'..'e',
# then pass a list of labels to .loc to pick a subset.
ser = pd.Series(data=range(5), index=list("abcde"))

ser.loc[["a", "c", "e"]]

a    0
c    2
e    4
dtype: int64

In [2]:
# The same idea in two dimensions: .loc takes a list of row labels and a list
# of column labels.
df = pd.DataFrame(
    data=np.arange(25).reshape(5, 5),
    index=list("abcde"),
    columns=list("abcde"),
)

df.loc[["a", "c", "e"], ["b", "d"]]

Unnamed: 0,b,d
a,1,3
c,11,13
e,21,23


In [3]:
# Eight consecutive daily timestamps starting 2000-01-01 index an 8x4 frame
# of standard-normal draws (unseeded, so the values differ on every run).
dates = pd.date_range(start='1/1/2000', periods=8)

df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

df

Unnamed: 0,A,B,C,D
2000-01-01,-0.388975,0.394617,-0.518202,0.234145
2000-01-02,0.531612,0.651878,-0.03248,-0.744616
2000-01-03,-1.12732,0.044422,1.089375,-0.131771
2000-01-04,1.833056,-0.432709,0.20568,0.742858
2000-01-05,-0.63193,-1.544742,0.473365,-0.134331
2000-01-06,2.462119,0.947975,2.40894,-0.004904
2000-01-07,-1.714662,-0.52664,-0.013492,-1.224102
2000-01-08,0.124953,0.172782,-0.98947,-0.136843


In [4]:
# Take column 'A' as a Series, then look up a single value by its Timestamp
# label (dates[5] is the sixth date).
s = df['A']

s[dates[5]]

np.float64(2.462118559689352)

In [5]:
# Swap columns 'A' and 'B': plain [] assignment with a list of columns writes
# the right-hand values by position, so 'B' receives the old 'A' and vice versa.
df[['B', 'A']] = df[['A', 'B']]

df

Unnamed: 0,A,B,C,D
2000-01-01,0.394617,-0.388975,-0.518202,0.234145
2000-01-02,0.651878,0.531612,-0.03248,-0.744616
2000-01-03,0.044422,-1.12732,1.089375,-0.131771
2000-01-04,-0.432709,1.833056,0.20568,0.742858
2000-01-05,-1.544742,-0.63193,0.473365,-0.134331
2000-01-06,0.947975,2.462119,2.40894,-0.004904
2000-01-07,-0.52664,-1.714662,-0.013492,-1.224102
2000-01-08,0.172782,0.124953,-0.98947,-0.136843


In [6]:
# Selecting with a list of column labels returns a DataFrame with those columns.
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.394617,-0.388975
2000-01-02,0.651878,0.531612
2000-01-03,0.044422,-1.12732
2000-01-04,-0.432709,1.833056
2000-01-05,-1.544742,-0.63193
2000-01-06,0.947975,2.462119
2000-01-07,-0.52664,-1.714662
2000-01-08,0.172782,0.124953


In [7]:
# NOTE: this does NOT swap the columns. .loc aligns the right-hand DataFrame on
# its column labels before assigning, so 'A' is written back to 'A' and 'B' to
# 'B'; the displayed result is identical to the previous cell.
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.394617,-0.388975
2000-01-02,0.651878,0.531612
2000-01-03,0.044422,-1.12732
2000-01-04,-0.432709,1.833056
2000-01-05,-1.544742,-0.63193
2000-01-06,0.947975,2.462119
2000-01-07,-0.52664,-1.714662
2000-01-08,0.172782,0.124953


In [8]:
# Swapping column values using raw values: .to_numpy() drops the labels, so
# there is nothing to align on and the values are assigned by position.
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.388975,0.394617
2000-01-02,0.531612,0.651878
2000-01-03,-1.12732,0.044422
2000-01-04,1.833056,-0.432709
2000-01-05,-0.63193,-1.544742
2000-01-06,2.462119,0.947975
2000-01-07,-1.714662,-0.52664
2000-01-08,0.124953,0.172782


In [9]:
# .iloc is purely positional, so the right-hand frame is not aligned on labels:
# position 1 ('B') receives 'A' and position 0 ('A') receives 'B', which swaps
# the two columns again.
df.iloc[:, [1, 0]] = df[['A', 'B']]
df[['A','B']]

Unnamed: 0,A,B
2000-01-01,0.394617,-0.388975
2000-01-02,0.651878,0.531612
2000-01-03,0.044422,-1.12732
2000-01-04,-0.432709,1.833056
2000-01-05,-1.544742,-0.63193
2000-01-06,0.947975,2.462119
2000-01-07,-0.52664,-1.714662
2000-01-08,0.172782,0.124953


In [10]:
# Accessing an index label on a Series, or a column on a DataFrame, directly
# as an attribute.
sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()  # independent copy, so later assignments to dfa leave df alone
sa.b

np.int64(2)

In [11]:
# Column 'A' read through attribute access.
dfa.A

2000-01-01    0.394617
2000-01-02    0.651878
2000-01-03    0.044422
2000-01-04   -0.432709
2000-01-05   -1.544742
2000-01-06    0.947975
2000-01-07   -0.526640
2000-01-08    0.172782
Freq: D, Name: A, dtype: float64

In [12]:
# Attribute assignment modifies the element for an index label that already exists.
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [13]:
# Overwrite the existing column 'A' with 0..n-1 through attribute assignment.
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.388975,-0.518202,0.234145
2000-01-02,1,0.531612,-0.03248,-0.744616
2000-01-03,2,-1.12732,1.089375,-0.131771
2000-01-04,3,1.833056,0.20568,0.742858
2000-01-05,4,-0.63193,0.473365,-0.134331
2000-01-06,5,2.462119,2.40894,-0.004904
2000-01-07,6,-1.714662,-0.013492,-1.224102
2000-01-08,7,0.124953,-0.98947,-0.136843


In [14]:
# Item assignment is the form that also works for a column that does not exist
# yet (per the pandas indexing guide, attribute assignment would create a plain
# attribute instead). Here 'A' already exists, so the result is unchanged.
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.388975,-0.518202,0.234145
2000-01-02,1,0.531612,-0.03248,-0.744616
2000-01-03,2,-1.12732,1.089375,-0.131771
2000-01-04,3,1.833056,0.20568,0.742858
2000-01-05,4,-0.63193,0.473365,-0.134331
2000-01-06,5,2.462119,2.40894,-0.004904
2000-01-07,6,-1.714662,-0.013492,-1.224102
2000-01-08,7,0.124953,-0.98947,-0.136843


In [15]:
# Slicing ranges: [] with an integer slice works by position -- the first five
# elements of s.
s[:5]

2000-01-01   -0.388975
2000-01-02    0.531612
2000-01-03   -1.127320
2000-01-04    1.833056
2000-01-05   -0.631930
Freq: D, Name: A, dtype: float64

In [16]:
# Every second element (step of 2).
s[::2]

2000-01-01   -0.388975
2000-01-03   -1.127320
2000-01-05   -0.631930
2000-01-07   -1.714662
Freq: 2D, Name: A, dtype: float64

In [17]:
# Reversed order (step of -1).
s[::-1]

2000-01-08    0.124953
2000-01-07   -1.714662
2000-01-06    2.462119
2000-01-05   -0.631930
2000-01-04    1.833056
2000-01-03   -1.127320
2000-01-02    0.531612
2000-01-01   -0.388975
Freq: -1D, Name: A, dtype: float64

In [18]:
# Slicing a DataFrame with [] slices its rows: the first three rows.
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,0.394617,-0.388975,-0.518202,0.234145
2000-01-02,0.651878,0.531612,-0.03248,-0.744616
2000-01-03,0.044422,-1.12732,1.089375,-0.131771


In [19]:
# From row position 2 back to the start: the first three rows in reverse order.
df[2::-1]

Unnamed: 0,A,B,C,D
2000-01-03,0.044422,-1.12732,1.089375,-0.131771
2000-01-02,0.651878,0.531612,-0.03248,-0.744616
2000-01-01,0.394617,-0.388975,-0.518202,0.234145


In [20]:
# Selection by label: with a DatetimeIndex, date strings can serve as slice
# bounds in .loc, and both endpoints are included in the result.
dfl = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=pd.date_range('20130101', periods=5),
    columns=list('ABCD'),
)
dfl.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.471738,-0.197573,-0.279607,0.225218
2013-01-03,-0.494448,-1.218578,-0.245819,-0.077833
2013-01-04,1.777204,-0.166616,0.965451,0.030039


In [21]:
# Six standard-normal draws labelled 'a'..'f'.
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1

a    0.979678
b   -0.308386
c    1.524569
d   -1.363097
e    1.250031
f    0.154462
dtype: float64

In [22]:
# Open-ended label slice: from 'c' through the last label.
s1.loc['c':]

c    1.524569
d   -1.363097
e    1.250031
f    0.154462
dtype: float64

In [23]:
# A single label returns a scalar.
s1.loc['b']

np.float64(-0.3083855176531642)

In [24]:
# Setting through a label slice: every element from 'b' onwards becomes 0.
s1.loc['b':] = 0
s1

a    0.979678
b    0.000000
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [25]:
# 6x4 frame of random values with row labels 'a'..'f' and columns 'A'..'D'.
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))

df1

Unnamed: 0,A,B,C,D
a,2.143104,-0.585128,-0.093311,0.137785
b,-1.628376,0.283355,0.240283,-0.799633
c,-0.607211,0.709945,-0.500209,1.420637
d,0.33557,-0.652822,2.002482,0.61093
e,-0.580844,0.079901,0.64402,-0.47165
f,-1.106619,-2.485606,-0.316298,-0.150424


In [26]:
# Select rows by a list of labels, keeping every column.
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,2.143104,-0.585128,-0.093311,0.137785
b,-1.628376,0.283355,0.240283,-0.799633
d,0.33557,-0.652822,2.002482,0.61093


In [27]:
# Row 'b' comes back as a Series; comparing it with 0 gives a boolean Series
# indexed by the column labels.
df1.loc['b'] > 0

A    False
B     True
C     True
D    False
Name: b, dtype: bool

In [28]:
# Boolean column selection: keep the columns whose value in row 'a' is positive.
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,A,D
a,2.143104,0.137785
b,-1.628376,-0.799633
c,-0.607211,1.420637
d,0.33557,0.61093
e,-0.580844,-0.47165
f,-1.106619,-0.150424


In [29]:
# Slicing with labels: when both bounds are present in the index, .loc returns
# everything located between them in index order, endpoints included -- even
# though this index is not sorted.
s = pd.Series(data=['a', 'b', 'c', 'd', 'e'], index=[0, 3, 2, 5, 4])
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [30]:
# Sort by index label; the sorted Series is returned and s is not reassigned.
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [31]:
# On a sorted index the slice bounds need not exist: 1 and 6 are both absent,
# and the labels that fall between them are returned.
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

In [32]:
# Index with a duplicated label (2 appears twice). Slicing 3:5 still works
# because both bounds are unique; per the pandas indexing guide, using the
# duplicated label as a bound (e.g. s.loc[2:5]) raises a KeyError.
s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2])
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [33]:
# Selection by position: ten random values whose integer labels (0, 2, ..., 18)
# deliberately differ from their positions.
s1 = pd.Series(np.random.randn(10), index=list(range(0, 20, 2)))
s1

0     0.433968
2    -1.540859
4    -0.860384
6     0.250601
8    -0.705467
10   -0.411516
12   -0.016356
14    0.418809
16   -0.983095
18    0.154185
dtype: float64

In [34]:
# .iloc slices by position: the first six elements, whatever their labels.
s1.iloc[:6]

0     0.433968
2    -1.540859
4    -0.860384
6     0.250601
8    -0.705467
10   -0.411516
dtype: float64

In [35]:
# Setting through .iloc: the first three positions become 0.
s1.iloc[:3] = 0
s1

0     0.000000
2     0.000000
4     0.000000
6     0.250601
8    -0.705467
10   -0.411516
12   -0.016356
14    0.418809
16   -0.983095
18    0.154185
dtype: float64

In [36]:
# 6x4 random frame with even-integer row labels (0..10) and column labels (0..6),
# so that labels and positions do not coincide.
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list(range(0, 12, 2)),
                   columns=list(range(0, 8, 2)))
df1

Unnamed: 0,0,2,4,6
0,0.314015,-0.807377,0.491171,-0.2753
2,-0.211742,0.738047,0.402195,0.397203
4,-0.498642,0.158177,-1.81312,0.66506
6,-1.079534,0.80072,1.267918,0.681232
8,0.242831,1.49617,0.539307,0.242661
10,0.400031,-1.470804,-0.89854,0.045514


In [37]:
# First three rows by position.
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,0.314015,-0.807377,0.491171,-0.2753
2,-0.211742,0.738047,0.402195,0.397203
4,-0.498642,0.158177,-1.81312,0.66506


In [38]:
# Rows at positions 1-4 and columns at positions 2-3 (the end of a positional
# slice is exclusive).
df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,0.402195,0.397203
4,-1.81312,0.66506
6,1.267918,0.681232
8,0.539307,0.242661


In [39]:
# Lists of integer positions: rows 1, 3, 5 and columns 1, 3.
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,0.738047,0.397203
6,0.80072,0.681232
10,-1.470804,0.045514


In [40]:
# Rebuild df1 with string labels ('a'..'f' rows, 'A'..'D' columns) for the
# callable-indexer examples.
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-1.119926,0.044275,-2.114484,0.224631
b,0.368712,-0.334807,1.079218,-0.32823
c,-0.50833,0.07171,1.171704,-0.27096
d,-0.345873,1.111492,0.162519,-1.586199
e,-0.436028,0.581179,-0.089046,0.809241
f,-0.091932,0.659551,-1.001504,0.210321


In [41]:
# A callable indexer is called with the DataFrame and must return a valid
# indexer -- here a boolean mask of the rows where A > 0.
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
b,0.368712,-0.334807,1.079218,-0.32823


In [42]:
# The callable may also return a list of column labels.
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-1.119926,0.044275
b,0.368712,-0.334807
c,-0.50833,0.07171
d,-0.345873,1.111492
e,-0.436028,0.581179
f,-0.091932,0.659551


In [43]:
# With .iloc the callable returns integer positions.
df1.iloc[:, lambda df: [0, 1, 2]]

Unnamed: 0,A,B,C
a,-1.119926,0.044275,-2.114484
b,0.368712,-0.334807,1.079218
c,-0.50833,0.07171,1.171704
d,-0.345873,1.111492,0.162519
e,-0.436028,0.581179,-0.089046
f,-0.091932,0.659551,-1.001504


In [44]:
# Callable indexing on a Series: the lambda receives the Series itself.
df1['A'].loc[lambda s: s > 0]

b    0.368712
Name: A, dtype: float64

In [45]:
# Combining positional and label-based indexing: a small frame that the next
# cells index both ways.
dfd = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=['a', 'b', 'c'],
)
dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [46]:
# Label-based route: translate row positions 0 and 2 into labels through the
# index, then select column 'A' with .loc.
dfd.loc[dfd.index[[0, 2]], 'A']

a    1
c    3
Name: A, dtype: int64

In [47]:
# Position-based route: translate the column label 'A' into its integer
# position with get_loc, then use .iloc.
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]

a    1
c    3
Name: A, dtype: int64

In [48]:
# For several column labels, get_indexer returns their integer positions.
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]

Unnamed: 0,A,B
a,1,4
c,3,6


In [49]:
# Reindexing: a requested label that is missing from the original index
# (here 3) comes back as NaN, which upcasts the integer data to float.
s = pd.Series(data=[1, 2, 3])
s.reindex(index=[1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [50]:
# Keep only the requested labels that actually exist in the index (3 is absent,
# so it is dropped instead of producing NaN).
labels = [1, 2, 3]
s.loc[s.index.intersection(labels)]

1    2
2    3
dtype: int64

In [51]:
# Reindexing the intersection with the full label list brings label 3 back as NaN.
s.loc[s.index.intersection(labels)].reindex(labels)

1    2.0
2    3.0
3    NaN
dtype: float64

In [52]:
# Selecting random samples: called without arguments, sample() returns a single
# randomly chosen element (unseeded, so the result varies between runs).
s = pd.Series([0, 1, 2, 3, 4, 5])
s.sample()

3    3
dtype: int64

In [53]:
# One may specify either a number of rows (n=3 draws three elements):
s.sample(n=3)

0    0
5    5
2    2
dtype: int64

In [54]:
# Or a fraction of the rows (frac=0.5 of six elements draws three):
s.sample(frac=0.5)

4    4
2    2
1    1
dtype: int64

In [55]:
# Weighted sampling: each weight is the relative probability of drawing the
# element at the same position, so the two zero-weight elements are never picked.
s = pd.Series(data=[0, 1, 2, 3, 4, 5])
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)

5    5
2    2
3    3
dtype: int64

In [56]:
# Weights will be re-normalized automatically, so they need not sum to 1.
# Only position 0 has a non-zero weight here, making it the only possible draw.
example_weights2 = [0.5, 0, 0, 0, 0, 0]
s.sample(n=1, weights=example_weights2)

0    0
dtype: int64

In [57]:
# For a DataFrame, the sampling weights can be taken from one of its columns
# by passing the column name; the row with weight 0 can never be drawn.
df2 = pd.DataFrame(
    {
        'col1': [9, 8, 7, 6],
        'weight_column': [0.5, 0.4, 0.1, 0],
    }
)
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
1,8,0.4
0,9,0.5
2,7,0.1


In [58]:
# axis=1 samples columns instead of rows: one of the two columns is returned.
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3.sample(n=1, axis=1)

Unnamed: 0,col1
0,1
1,2
2,3


In [59]:
# Reproducible sampling: random_state seeds the draw.
df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
# With a given seed, the sample will always draw the same rows.
df4.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [60]:
# Same seed again: the same two rows are drawn as in the previous call.
df4.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [61]:
# Setting with enlargement: start from a three-element integer Series.
se = pd.Series([1, 2, 3])
se

0    1
1    2
2    3
dtype: int64

In [62]:
# Assigning to a label that does not exist yet appends it; the float value
# upcasts the whole Series to float64.
se[5] = 5.
se

0    1.0
1    2.0
2    3.0
5    5.0
dtype: float64

In [63]:
# 3x2 integer frame (values 0..5) for the DataFrame enlargement examples.
dfi = pd.DataFrame(np.arange(6).reshape(3, 2),
                   columns=['A', 'B'])
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [64]:
# .loc with a new column label enlarges the frame: 'C' is added with the values of 'A'.
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [65]:
# .loc with a new row label appends row 3, broadcasting the scalar 5 to every column.
dfi.loc[3] = 5
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4
3,5,5,5


In [68]:
# Fast scalar value getting and setting: .iat reads a single value by integer position.
# NOTE(review): relies on `s` still being the six-element Series defined in the
# sampling cells; the intervening cells 66-67 are not in the notebook -- confirm.
s.iat[5]

np.int64(5)

In [69]:
# On a DataFrame .iat takes (row position, column position).
df.iat[3, 0]

np.float64(-0.432708845884139)

In [71]:
# .at is the label-based counterpart: (row label, column label).
df.at[dates[5], 'B']

np.float64(2.462118559689352)

In [72]:
# .at can also set a single value in place; read it back to confirm.
df.at[dates[5], 'B'] = 4
df.at[dates[5], 'B']

np.float64(4.0)

In [77]:
# Boolean indexing: a small Series (1..5) to filter with boolean masks.
s = pd.Series(range(1, 6))
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [78]:
# Keep only the elements greater than 3.
s[s > 3]

3    4
4    5
dtype: int64

In [79]:
# Combine masks with | (element-wise or). Each comparison needs its own
# parentheses because | binds more tightly than < and >.
s[(s < 2) | (s > 4)]

0    1
4    5
dtype: int64

In [83]:
# ~ negates a boolean mask: the elements that are NOT greater than 3.
s[~(s > 3)]

0    1
1    2
2    3
dtype: int64

In [84]:
# A boolean Series inside [] filters DataFrame rows: those where column 'A' is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2000-01-01,0.394617,-0.388975,-0.518202,0.234145
2000-01-02,0.651878,0.531612,-0.03248,-0.744616
2000-01-03,0.044422,-1.12732,1.089375,-0.131771
2000-01-06,0.947975,4.0,2.40894,-0.004904
2000-01-08,0.172782,0.124953,-0.98947,-0.136843


In [85]:
# Indexing with isin: a Series whose index (4..0) runs opposite to its values
# (0..4), so value matches and label matches can be told apart.
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [86]:
# isin tests the VALUES for membership and returns a boolean Series.
s.isin([2,3])

4    False
3    False
2     True
1     True
0    False
dtype: bool

In [87]:
# Use that mask to select the elements whose value is 2 or 3.
s[s.isin([2,3])]

2    2
1    3
dtype: int64

In [88]:
# Index objects have isin too: this filters on the LABELS 2 and 3, not the values.
s[s.index.isin([2, 3])]

3    1
2    2
dtype: int64

In [89]:
# MultiIndex allows selecting a separate level to use in the membership check.
# The index is the product of [0, 1] and ['a', 'b', 'c'] (six entries).
s_mi = pd.Series(np.arange(6),
                 index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int64

In [90]:
# Without `level`, isin matches complete index tuples; (2, 'b') is not in the
# index and simply matches nothing.
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int64

In [91]:
# With level=1 the membership check uses the second level only; 'e' matches nothing.
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int64

In [92]:
# DataFrame.isin with a list checks every cell, in every column, against the
# same collection of values.
df = pd.DataFrame(
    {
        'vals': [1, 2, 3, 4],
        'ids': ['a', 'b', 'f', 'n'],
        'ids2': ['a', 'n', 'c', 'n'],
    }
)

values = ['a', 'b', 1, 3]
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [93]:
# With a dict, each listed column is checked against its own values; columns
# that are not keys of the dict (ids2) come back all False.
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


In [94]:
# ~ inverts the result: True where the cell is NOT among the listed values.
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
~df.isin(values)

Unnamed: 0,vals,ids,ids2
0,False,False,True
1,True,False,True
2,False,True,True
3,True,True,True


In [95]:
# The where() Method and Masking: plain boolean indexing returns only the
# elements that satisfy the condition.
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [96]:
# where() keeps the original shape instead: elements failing the condition
# become NaN, which is why the result is float64.
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [99]:
# Ten consecutive daily timestamps starting 2025-01-01 index a 10x6 frame of
# standard-normal draws (unseeded, so the values differ on every run).
# The fourth column label is 'D'; it previously read 'D,' because a stray
# comma had been typed inside the string literal.
dates = pd.date_range('1/1/2025', periods=10)
df = pd.DataFrame(np.random.randn(10, 6), index=dates,
                  columns=['A', 'B', 'C', 'D', 'E', 'F'])
df

Unnamed: 0,A,B,C,"D,",E,F
2025-01-01,-1.481209,-0.242529,0.16947,0.7806,-0.30501,2.178981
2025-01-02,-0.588504,-0.857547,0.014258,1.115088,-0.990576,-0.314185
2025-01-03,-0.119689,-0.839221,0.459907,1.389886,1.140433,1.549202
2025-01-04,-0.314162,-0.918138,1.627237,0.019334,-0.65218,-1.198086
2025-01-05,-0.244296,-0.736231,1.639778,0.260493,-1.900633,1.205825
2025-01-06,-0.028099,-0.504173,1.384606,0.150455,-0.996183,0.424643
2025-01-07,-0.697296,-0.165171,0.153317,0.522139,-1.522039,-1.69515
2025-01-08,-1.117074,-0.266371,-0.872456,1.772141,0.666792,-1.000435
2025-01-09,0.504276,0.731455,-0.37207,0.442116,1.051566,-1.105344
2025-01-10,-1.316536,0.743443,1.092071,-0.316993,0.249066,-0.75486


In [100]:
# Indexing a DataFrame with a boolean DataFrame keeps the shape: cells that
# fail the condition become NaN.
df[df < 0]

Unnamed: 0,A,B,C,"D,",E,F
2025-01-01,-1.481209,-0.242529,,,-0.30501,
2025-01-02,-0.588504,-0.857547,,,-0.990576,-0.314185
2025-01-03,-0.119689,-0.839221,,,,
2025-01-04,-0.314162,-0.918138,,,-0.65218,-1.198086
2025-01-05,-0.244296,-0.736231,,,-1.900633,
2025-01-06,-0.028099,-0.504173,,,-0.996183,
2025-01-07,-0.697296,-0.165171,,,-1.522039,-1.69515
2025-01-08,-1.117074,-0.266371,-0.872456,,,-1.000435
2025-01-09,,,-0.37207,,,-1.105344
2025-01-10,-1.316536,,,-0.316993,,-0.75486


In [104]:
# The second argument supplies the replacement where the condition is False:
# non-negative cells are swapped for their negation, so no value ends up positive.
df.where(df < 0, -df)

Unnamed: 0,A,B,C,"D,",E,F
2025-01-01,-1.481209,-0.242529,-0.16947,-0.7806,-0.30501,-2.178981
2025-01-02,-0.588504,-0.857547,-0.014258,-1.115088,-0.990576,-0.314185
2025-01-03,-0.119689,-0.839221,-0.459907,-1.389886,-1.140433,-1.549202
2025-01-04,-0.314162,-0.918138,-1.627237,-0.019334,-0.65218,-1.198086
2025-01-05,-0.244296,-0.736231,-1.639778,-0.260493,-1.900633,-1.205825
2025-01-06,-0.028099,-0.504173,-1.384606,-0.150455,-0.996183,-0.424643
2025-01-07,-0.697296,-0.165171,-0.153317,-0.522139,-1.522039,-1.69515
2025-01-08,-1.117074,-0.266371,-0.872456,-1.772141,-0.666792,-1.000435
2025-01-09,-0.504276,-0.731455,-0.37207,-0.442116,-1.051566,-1.105344
2025-01-10,-1.316536,-0.743443,-1.092071,-0.316993,-0.249066,-0.75486


In [105]:
# Setting through a boolean mask, done on a copy so s stays intact.
# s holds no negative values, so nothing actually changes here.
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [107]:
# Same pattern on a DataFrame copy: every negative cell is replaced by 0.
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,"D,",E,F
2025-01-01,0.0,0.0,0.16947,0.7806,0.0,2.178981
2025-01-02,0.0,0.0,0.014258,1.115088,0.0,0.0
2025-01-03,0.0,0.0,0.459907,1.389886,1.140433,1.549202
2025-01-04,0.0,0.0,1.627237,0.019334,0.0,0.0
2025-01-05,0.0,0.0,1.639778,0.260493,0.0,1.205825
2025-01-06,0.0,0.0,1.384606,0.150455,0.0,0.424643
2025-01-07,0.0,0.0,0.153317,0.522139,0.0,0.0
2025-01-08,0.0,0.0,0.0,1.772141,0.666792,0.0
2025-01-09,0.504276,0.731455,0.0,0.442116,1.051566,0.0
2025-01-10,0.0,0.743443,1.092071,0.0,0.249066,0.0


In [108]:
# mask() is the inverse boolean operation of where(): values are replaced where
# the condition is True. Every value of s is >= 0, so all of them become NaN.
s.mask(s >= 0)

4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [109]:
# Masking the non-negative cells leaves only the negative ones -- the same
# result as df[df < 0].
df.mask(df >= 0)

Unnamed: 0,A,B,C,"D,",E,F
2025-01-01,-1.481209,-0.242529,,,-0.30501,
2025-01-02,-0.588504,-0.857547,,,-0.990576,-0.314185
2025-01-03,-0.119689,-0.839221,,,,
2025-01-04,-0.314162,-0.918138,,,-0.65218,-1.198086
2025-01-05,-0.244296,-0.736231,,,-1.900633,
2025-01-06,-0.028099,-0.504173,,,-0.996183,
2025-01-07,-0.697296,-0.165171,,,-1.522039,-1.69515
2025-01-08,-1.117074,-0.266371,-0.872456,,,-1.000435
2025-01-09,,,-0.37207,,,-1.105344
2025-01-10,-1.316536,,,-0.316993,,-0.75486


In [110]:
# Setting with enlargement conditionally using numpy: np.where picks 'green'
# where col2 equals 'Z' and 'red' everywhere else, and the result becomes a
# new column.
df = pd.DataFrame(
    {
        'col1': ['A', 'B', 'B', 'C'],
        'col2': ['Z', 'Z', 'X', 'Y'],
    }
)
df['color'] = np.where(df['col2'].eq('Z'), 'green', 'red')
df

Unnamed: 0,col1,col2,color
0,A,Z,green
1,B,Z,green
2,B,X,red
3,C,Y,red


In [111]:
# More than two outcomes: np.select walks the conditions in order and, for each
# row, takes the choice belonging to the first condition that is True; rows
# matching none of them get the default.
conditions = [
    (df['col2'] == 'Z') & (df['col1'] == 'A'),
    (df['col2'] == 'Z') & (df['col1'] == 'B'),
    (df['col1'] == 'B')
]
choices = ['yellow', 'blue', 'purple']
df['color'] = np.select(conditions, choices, default='black')
df

Unnamed: 0,col1,col2,color
0,A,Z,yellow
1,B,Z,blue
2,B,X,purple
3,C,Y,black


In [112]:
# The query() Method: a 10x3 frame of uniform random numbers in [0, 1) with
# columns a, b, c to filter on.
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0.615265,0.326763,0.754507
1,0.399427,0.636354,0.02806
2,0.110952,0.054698,0.317605
3,0.959141,0.207383,0.337024
4,0.210829,0.083215,0.892328
5,0.533718,0.57042,0.516908
6,0.33695,0.686429,0.460518
7,0.231605,0.51299,0.943393
8,0.127877,0.459758,0.457472
9,0.795295,0.588979,0.781497


In [113]:
# Pure-Python boolean indexing: rows where a < b and b < c.
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

Unnamed: 0,a,b,c
7,0.231605,0.51299,0.943393


In [114]:
# The same filter as a query() expression: column names are used directly in the string.
df.query('(a < b) & (b < c)')

Unnamed: 0,a,b,c
7,0.231605,0.51299,0.943393


In [115]:
# MultiIndex query() Syntax: draw n random first and last names (with
# repetition) to serve as the two index levels.
n = 10
first_names = np.random.choice(['raph', 'grace', 'flo'], size=n)
last_names = np.random.choice(['kioko', 'wangu', 'muthoni'], size=n)
first_names

array(['flo', 'raph', 'grace', 'grace', 'grace', 'grace', 'flo', 'raph',
       'grace', 'flo'], dtype='<U5')

In [116]:
# The matching array of randomly drawn last names.
last_names

array(['muthoni', 'kioko', 'muthoni', 'wangu', 'kioko', 'wangu', 'kioko',
       'muthoni', 'muthoni', 'muthoni'], dtype='<U7')

In [117]:
# Build a two-level MultiIndex with named levels from the two arrays; no column
# labels are given, so the frame's columns default to 0 and 1.
index = pd.MultiIndex.from_arrays([first_names, last_names], names=['first_names', 'last_names'])
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
first_names,last_names,Unnamed: 2_level_1,Unnamed: 3_level_1
flo,muthoni,-0.212672,-1.754362
raph,kioko,1.363982,0.344145
grace,muthoni,-2.064342,0.314137
grace,wangu,1.46046,0.296155
grace,kioko,-0.643417,1.734751
grace,wangu,-0.805802,0.517306
flo,kioko,1.273198,-0.377603
raph,muthoni,-0.880473,0.296062
grace,muthoni,0.264598,-0.390125
flo,muthoni,-1.653784,1.053281


In [119]:
# Named index levels can be referenced inside query() just like columns.
df.query('first_names == "flo"')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
first_names,last_names,Unnamed: 2_level_1,Unnamed: 3_level_1
flo,muthoni,-0.212672,-1.754362
flo,kioko,1.273198,-0.377603
flo,muthoni,-1.653784,1.053281


In [120]:
# The in and not in operators: a 12-row frame with two string columns (a, b)
# and two random integer columns (c in 0..4, d in 0..8).
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})


df

Unnamed: 0,a,b,c,d
0,a,a,4,6
1,a,a,3,0
2,b,a,4,0
3,b,a,4,3
4,c,b,1,5
5,c,b,3,8
6,d,b,0,8
7,d,b,0,5
8,e,c,2,2
9,e,c,2,4
10,f,c,4,5
11,f,c,2,5


In [121]:
# 'a in b' keeps the rows whose value in column a occurs anywhere in column b.
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,4,6
1,a,a,3,0
2,b,a,4,0
3,b,a,4,3
4,c,b,1,5
5,c,b,3,8


In [122]:
# Pure-Python equivalent of the query above, using isin.
df[df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
0,a,a,4,6
1,a,a,3,0
2,b,a,4,0
3,b,a,4,3
4,c,b,1,5
5,c,b,3,8


In [123]:
# 'a not in b' keeps the rows whose value in column a never occurs in column b.
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,0,8
7,d,b,0,5
8,e,c,2,2
9,e,c,2,4
10,f,c,4,5
11,f,c,2,5


In [124]:
# Pure-Python equivalent: negate the isin mask with ~.
df[~df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
6,d,b,0,8
7,d,b,0,5
8,e,c,2,2
9,e,c,2,4
10,f,c,4,5
11,f,c,2,5


In [125]:
# Frame with repeated values in column 'a' ('one' twice, 'two' three times)
# for the duplicate-handling examples.
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2

Unnamed: 0,a,b,c
0,one,x,-0.597607
1,one,y,0.301659
2,two,x,-0.771104
3,two,y,0.235909
4,two,x,0.923086
5,three,x,-0.073862
6,four,x,-0.090085


In [126]:
# Flag rows whose 'a' value has already appeared; the first occurrence is not flagged.
df2.duplicated('a')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [127]:
# keep='last': flag every occurrence except the last one of each value.
df2.duplicated('a', keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [128]:
# Drop duplicates on 'a', keeping the first row of each value.
df2.drop_duplicates('a')

Unnamed: 0,a,b,c
0,one,x,-0.597607
2,two,x,-0.771104
5,three,x,-0.073862
6,four,x,-0.090085


In [129]:
# Keep the last row of each 'a' value instead.
df2.drop_duplicates('a', keep='last')

Unnamed: 0,a,b,c
1,one,y,0.301659
4,two,x,0.923086
5,three,x,-0.073862
6,four,x,-0.090085


In [130]:
# keep=False drops every row whose 'a' value occurs more than once.
df2.drop_duplicates('a', keep=False)

Unnamed: 0,a,b,c
5,three,x,-0.073862
6,four,x,-0.090085


In [131]:
# Index objects: an Index can be built directly from a list of labels.
index = pd.Index(['e', 'd', 'a', 'b'])
index

Index(['e', 'd', 'a', 'b'], dtype='object')

In [132]:
# Membership test against the index labels.
'd' in index

True

In [133]:
# An Index can carry a name, stored in its .name attribute.
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name

'something'

In [134]:
# Named row and column indexes: the names ('rows', 'cols') appear in the
# displayed frame.
index = pd.Index(list(range(5)), name='rows')
columns = pd.Index(['A', 'B', 'C'], name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)
df

cols,A,B,C
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.193395,0.083517,1.009713
1,0.135022,-0.040824,0.925085
2,-0.596476,-0.41143,1.398103
3,1.251978,-1.063133,1.242187
4,-0.013172,0.213558,-2.337255


In [136]:
# Selecting a column keeps the row index together with its name ('rows').
df['C']

rows
0    1.009713
1    0.925085
2    1.398103
3    1.242187
4   -2.337255
Name: C, dtype: float64