In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reference: https://docs.scipy.org/doc/numpy/user/basics.indexing.html?highlight=indexing

# NumPy caveat 1
# Chained indexing (arr[m][n]) and multidimensional indexing (arr[m, n])
# are evaluated differently and are not interchangeable.

arr = np.arange(1, 40, 2).reshape(4, 5)
print(arr)

# arr[1, 2] addresses the element at row 1, column 2 in one indexing step.
print(arr[1, 2])
print("-----")

# arr[1][2] takes two steps: arr[1] yields an intermediate 1-D array (row 1),
# and [2] then picks position 2 of that intermediate array.
# For a single element both forms give the same value.
print(arr[1][2])

print("*****")

# With 2-D slices the two forms diverge.
# arr has not been modified above, so the same array is reused here.
print(arr)

# arr[0:3, 0:2] slices rows and columns together: rows 0-2, columns 0-1.
print(arr[0:3, 0:2])
print("-----")

# arr[0:3][0:2] first builds the intermediate 2-D array arr[0:3] (rows 0-2),
# then applies [0:2] to the ROWS of that intermediate array again.
# The result is rows 0-1 with every column, not the intended 3x2 block.
print(arr[0:3][0:2])

# Rule of thumb: use the arr[m, n] form in general to stay out of trouble.

[[ 1  3  5  7  9]
 [11 13 15 17 19]
 [21 23 25 27 29]
 [31 33 35 37 39]]
15
-----
15
*****
[[ 1  3  5  7  9]
 [11 13 15 17 19]
 [21 23 25 27 29]
 [31 33 35 37 39]]
[[ 1  3]
 [11 13]
 [21 23]]
-----
[[ 1  3  5  7  9]
 [11 13 15 17 19]]


In [3]:
# Reference: https://docs.scipy.org/doc/numpy/user/basics.indexing.html?highlight=indexing

# NumPy caveat 2
# Slicing an array does not copy its data; the slice is a new view onto the
# original data. Call .copy() explicitly when the slice must be independent,
# so that later writes cannot leak between the two arrays.

# Scenario 1: b is a plain slice of a, i.e. a view.
a = np.arange(1, 40, 2).reshape(4, 5)
b = a[:3, :2]
print(a)
print(b)
print("-----")
# Writing to a is visible through b because both share the same data.
a[0, 0] = -1000
print(a)
print(b)

print("*****")

# Scenario 2: b is an explicit copy of the same slice.
a = np.arange(1, 40, 2).reshape(4, 5)
b = a[:3, :2].copy()
print(a)
print(b)
print("-----")
# Writing to a is NOT visible in b because b owns its own data.
a[0, 0] = -1000
print(a)
print(b)

[[ 1  3  5  7  9]
 [11 13 15 17 19]
 [21 23 25 27 29]
 [31 33 35 37 39]]
[[ 1  3]
 [11 13]
 [21 23]]
-----
[[-1000     3     5     7     9]
 [   11    13    15    17    19]
 [   21    23    25    27    29]
 [   31    33    35    37    39]]
[[-1000     3]
 [   11    13]
 [   21    23]]
*****
[[ 1  3  5  7  9]
 [11 13 15 17 19]
 [21 23 25 27 29]
 [31 33 35 37 39]]
[[ 1  3]
 [11 13]
 [21 23]]
-----
[[-1000     3     5     7     9]
 [   11    13    15    17    19]
 [   21    23    25    27    29]
 [   31    33    35    37    39]]
[[ 1  3]
 [11 13]
 [21 23]]


In [4]:
# SettingWithCopyWarning
# Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#why-does-assignment-fail-when-using-chained-indexing

# pandas caveat 1: assigning through chained indexing
# (you might need to clear out the cache / restart the kernel to see the two
#  scenarios below play out cleanly)

np.random.seed(0)
df = pd.DataFrame(
    np.random.normal(size=(4, 6)),
    index=list('wxyz'),
    columns=list('abcdef'),
)
print(df)

# Chained indexing (df[rows][col] = value) is two separate operations:
# df[rows] is evaluated first, and the assignment then targets that
# intermediate result. Outside of simple cases it is very hard to predict
# whether the intermediate is a view of df (the assignment reaches df) or a
# temporary copy (the assignment is silently lost).
# SettingWithCopyWarning exists to flag exactly this situation.
# df[df['f']>0]['b'] = -1000

# df.loc[rows, col] = value is a single indexing operation applied to df
# itself, with no intermediate object involved. The assignment therefore always
# updates df, which is why no SettingWithCopyWarning is raised here.
df.loc[df['f'].gt(0), 'b'] = -1000

print(df)

# Rule of thumb: use df.loc[rows, cols] in general to stay out of trouble,
# especially when making assignments.

          a         b         c         d         e         f
w  1.764052  0.400157  0.978738  2.240893  1.867558 -0.977278
x  0.950088 -0.151357 -0.103219  0.410599  0.144044  1.454274
y  0.761038  0.121675  0.443863  0.333674  1.494079 -0.205158
z  0.313068 -0.854096 -2.552990  0.653619  0.864436 -0.742165
          a            b         c         d         e         f
w  1.764052     0.400157  0.978738  2.240893  1.867558 -0.977278
x  0.950088 -1000.000000 -0.103219  0.410599  0.144044  1.454274
y  0.761038     0.121675  0.443863  0.333674  1.494079 -0.205158
z  0.313068    -0.854096 -2.552990  0.653619  0.864436 -0.742165


In [5]:
# pandas caveat 2: modifying a slice of a DataFrame held under a new name
# (you might need to clear out the cache / restart the kernel to see the two
#  scenarios below play out cleanly)

np.random.seed(0)
df1 = pd.DataFrame(
    data=np.random.normal(size=(4, 6)),
    index=['w', 'x', 'y', 'z'],
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)
print(df1)

# Scenario A (disabled): without .copy(), df2 may be a view or a copy of df1,
# so a later change to df2 may or may not change df1 as well.
# Writing to such a df2 is what triggers a SettingWithCopyWarning.
#df2 = df1.loc['w':'y']

# Scenario B: an explicit .copy() makes df2 independent of df1.
# Later changes to df2 will not change df1, and no SettingWithCopyWarning is raised.
df2 = df1.loc['w':'y'].copy()

print(df2)

# Assign with a single .loc[row, col] operation.
# The previous chained form df2['a']['w'] = 100 is the pattern this notebook
# warns against: under pandas Copy-on-Write it writes to a temporary Series and
# leaves df2 unchanged.
df2.loc['w', 'a'] = 100

# df1 keeps its original value at ('w', 'a'); only df2 shows 100.
print(df1)
print(df2)

          a         b         c         d         e         f
w  1.764052  0.400157  0.978738  2.240893  1.867558 -0.977278
x  0.950088 -0.151357 -0.103219  0.410599  0.144044  1.454274
y  0.761038  0.121675  0.443863  0.333674  1.494079 -0.205158
z  0.313068 -0.854096 -2.552990  0.653619  0.864436 -0.742165
          a         b         c         d         e         f
w  1.764052  0.400157  0.978738  2.240893  1.867558 -0.977278
x  0.950088 -0.151357 -0.103219  0.410599  0.144044  1.454274
y  0.761038  0.121675  0.443863  0.333674  1.494079 -0.205158
          a         b         c         d         e         f
w  1.764052  0.400157  0.978738  2.240893  1.867558 -0.977278
x  0.950088 -0.151357 -0.103219  0.410599  0.144044  1.454274
y  0.761038  0.121675  0.443863  0.333674  1.494079 -0.205158
z  0.313068 -0.854096 -2.552990  0.653619  0.864436 -0.742165
            a         b         c         d         e         f
w  100.000000  0.400157  0.978738  2.240893  1.867558 -0.977278
x   

In [6]:
# Summary: the cells below consistently use multidimensional indexing
# (df.loc[rows, cols]). This is the preferred form, especially when
# assigning values.
np.random.seed(0)
df = pd.DataFrame(
    np.random.normal(size=(4, 6)),
    index=['w', 'x', 'y', 'z'],
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)
df

Unnamed: 0,a,b,c,d,e,f
w,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278
x,0.950088,-0.151357,-0.103219,0.410599,0.144044,1.454274
y,0.761038,0.121675,0.443863,0.333674,1.494079,-0.205158
z,0.313068,-0.854096,-2.55299,0.653619,0.864436,-0.742165


In [7]:
# Select row 'w' with all columns.
# Passing the label inside a list keeps the result a one-row DataFrame.
df.loc[['w'], :] # df.loc['w', :] (bare label) would return a Series object instead

Unnamed: 0,a,b,c,d,e,f
w,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278


In [8]:
# Select rows 'w' and 'y' with all columns.
# A list of labels picks exactly the listed rows.
df.loc[['w','y'], :] 

Unnamed: 0,a,b,c,d,e,f
w,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278
y,0.761038,0.121675,0.443863,0.333674,1.494079,-0.205158


In [9]:
# Select rows 'w' through 'y' with all columns.
# This is a label slice; as the output shows, the end label 'y' is included.
df.loc['w':'y', :] 

Unnamed: 0,a,b,c,d,e,f
w,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278
x,0.950088,-0.151357,-0.103219,0.410599,0.144044,1.454274
y,0.761038,0.121675,0.443863,0.333674,1.494079,-0.205158


In [10]:
# Select column 'a' for every row.
# Passing the label inside a list keeps the result a one-column DataFrame.
df.loc[:, ['a']] # df.loc[:,'a'] (bare label) would return a Series object instead

Unnamed: 0,a
w,1.764052
x,0.950088
y,0.761038
z,0.313068


In [11]:
# Select columns 'a' and 'c' for every row.
# A list of labels picks exactly the listed columns.
df.loc[:, ['a','c']] 

Unnamed: 0,a,c
w,1.764052,0.978738
x,0.950088,-0.103219
y,0.761038,0.443863
z,0.313068,-2.55299


In [12]:
# Select columns 'a' through 'c' for every row.
# This is a label slice; as the output shows, the end label 'c' is included.
df.loc[:, 'a':'c'] 

Unnamed: 0,a,b,c
w,1.764052,0.400157,0.978738
x,0.950088,-0.151357,-0.103219
y,0.761038,0.121675,0.443863
z,0.313068,-0.854096,-2.55299


In [13]:
# Cross-section by label lists: rows 'w' and 'y', columns 'a' and 'c'.
# Rows and columns are selected together in one .loc call.
df.loc[['w','y'], ['a','c']]

Unnamed: 0,a,c
w,1.764052,0.978738
y,0.761038,0.443863


In [14]:
# Cross-section by label slices: rows 'w' through 'y', columns 'a' through 'c'.
# Both end labels are included, as the output shows.
df.loc['w':'y', 'a':'c']

Unnamed: 0,a,b,c
w,1.764052,0.400157,0.978738
x,0.950088,-0.151357,-0.103219
y,0.761038,0.121675,0.443863


In [15]:
# Conditional selection: keep the rows where column 'b' is negative, with all columns.
# The commented-out expression below is the boolean row mask on its own, kept for reference.
#df.loc[:,'b']<0
# The mask is passed as the row selector of .loc; ':' keeps every column.
df.loc[df.loc[:,'b']<0, :]

Unnamed: 0,a,b,c,d,e,f
x,0.950088,-0.151357,-0.103219,0.410599,0.144044,1.454274
z,0.313068,-0.854096,-2.55299,0.653619,0.864436,-0.742165
