In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='svg'

In [2]:
# Column data for the demo frame: powers of two, powers of three, and a
# fixed set of arbitrary integers.
data = dict(
    x=2 ** np.arange(5),
    y=3 ** np.arange(5),
    z=np.array([45, 98, 24, 11, 64]),
)

In [3]:
# Row labels 'a' through 'e', one per row of the demo frame.
index = list('abcde')

In [4]:
# Assemble the demo frame: one column per key of `data`, rows labelled by `index`.
df = pd.DataFrame(data, index=index)
df

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


In [8]:
# Boolean Series, indexed like df, that is True where column 'z' is below 50.
mask = df['z'].lt(50)

In [9]:
mask

a     True
b    False
c     True
d     True
e    False
Name: z, dtype: bool

In [10]:
df[mask]

Unnamed: 0,x,y,z
a,1,1,45
c,4,9,24
d,8,27,11


In [11]:
# Deliberate anti-pattern (chained assignment): df[mask] produces a temporary
# object, so the assignment lands on that temporary and triggers the
# SettingWithCopyWarning shown below; df itself is left unchanged.
df[mask]['z'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
df

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


Here, we received a SettingWithCopyWarning by trying to change the values in column 'z' using the mask, and `df` was left unchanged. As the warning says, using the `.loc` accessor is a better option.

In [13]:
df.loc[mask, 'z'] = 0
df

Unnamed: 0,x,y,z
a,1,1,0
b,2,3,98
c,4,9,0
d,8,27,0
e,16,81,64


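Alternatively, the column can be selected first and the mask applied to the resulting Series. Note that this is still chained assignment: it modifies `df` here only because `df['z']` happens to return a view, which is not guaranteed (and it has no effect when pandas Copy-on-Write is enabled), so `df.loc[mask, 'z'] = 0` remains the recommended form: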

In [14]:
df = pd.DataFrame(data=data, index=index)

In [15]:
# Still chained assignment: df['z'] is evaluated first, then the mask is
# applied to that Series. It modified df in this run (see the output), but
# NOTE(review): this relies on df['z'] being a view and is presumably a no-op
# under pandas Copy-on-Write -- prefer df.loc[mask, 'z'] = 0.
df['z'][mask] = 0
df

Unnamed: 0,x,y,z
a,1,1,0
b,2,3,98
c,4,9,0
d,8,27,0
e,16,81,64


# Views and Copies

In [16]:
arr = np.array([1, 2, 4, 8, 12, 16, 32])
arr

array([ 1,  2,  4,  8, 12, 16, 32])

In [17]:
# Extract the second and fourth elements (indices 1 and 3) with a strided slice
arr[1:4:2]

array([2, 8])

In [18]:
arr[[1,3]]

array([2, 8])

These two resulting arrays look the same, but they aren't.

In [19]:
arr[1:4:2].base

array([ 1,  2,  4,  8, 12, 16, 32])

In [20]:
arr[1:4:2].flags.owndata

False

In [21]:
arr[[1,3]].base

In [22]:
arr[[1,3]].flags.owndata

True

Modifying the base of a view will modify the data in the view, but will not affect the data in a copy.

In [23]:
view_of_arr = arr.view()

In [24]:
copy_of_arr = arr.copy()

In [25]:
arr.nbytes

56

In [26]:
copy_of_arr.nbytes

56

In [27]:
view_of_arr.nbytes

56

In [28]:
from sys import getsizeof

In [29]:
getsizeof(arr)

152

In [30]:
getsizeof(view_of_arr)

96

In [31]:
getsizeof(copy_of_arr)

152

In [32]:
arr[1] = 64

In [33]:
arr

array([ 1, 64,  4,  8, 12, 16, 32])

In [34]:
view_of_arr

array([ 1, 64,  4,  8, 12, 16, 32])

In [35]:
copy_of_arr

array([ 1,  2,  4,  8, 12, 16, 32])

In [36]:
df = pd.DataFrame(data=data, index=index)
df

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


In [37]:
view_of_df = df.copy(deep=False)
view_of_df

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


In [39]:
copy_of_df = df.copy()
copy_of_df

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


In [40]:
view_of_df.to_numpy().base is df.to_numpy().base

True

In [41]:
copy_of_df.to_numpy().base is df.to_numpy().base

False

# Indices and Slices in Pandas and NumPy

Using slicing operations returns a view of the original array (a shallow copy).

In [42]:
arr = np.arange(6)

In [43]:
a = arr[1:3]
a

array([1, 2])

In [44]:
a.base is arr

True

Using a list of integers to index an array returns a copy.

In [45]:
c = arr[[1,3]]
c

array([1, 3])

In [46]:
c.base is arr

False

Masking similarly returns a copy.

In [47]:
mask = arr < 4

In [48]:
d = arr[mask]

In [49]:
d

array([0, 1, 2, 3])

In [50]:
d.base is None

True

In [51]:
d.flags.owndata

True

In [52]:
arr[1] = 64

In [53]:
a

array([64,  2])

In [55]:
c

array([1, 3])

In [56]:
d

array([0, 1, 2, 3])

In [62]:
# Build a 3x4 float64 array of consecutive powers of two in row-major order,
# i.e. arr[j, i] == 2**(i + 4*j).  The power is applied *after* the reshape so
# that `arr` is a freshly allocated array owning its data (arr.base is None);
# slices taken from it in the following cells then report `arr` as their base.
arr = 2.0 ** np.arange(3 * 4).reshape(3, 4)

In [63]:
arr

array([[1.000e+00, 2.000e+00, 4.000e+00, 8.000e+00],
       [1.600e+01, 3.200e+01, 6.400e+01, 1.280e+02],
       [2.560e+02, 5.120e+02, 1.024e+03, 2.048e+03]])

In [64]:
a = arr[:,1:3] # columns 1 & 2

In [65]:
a

array([[   2.,    4.],
       [  32.,   64.],
       [ 512., 1024.]])

In [66]:
a.base

array([[1.000e+00, 2.000e+00, 4.000e+00, 8.000e+00],
       [1.600e+01, 3.200e+01, 6.400e+01, 1.280e+02],
       [2.560e+02, 5.120e+02, 1.024e+03, 2.048e+03]])

In [67]:
a.base is arr

True

In [68]:
b = arr[:,1:4:2] # strided slice: take columns 1 & 3

In [69]:
b

array([[2.000e+00, 8.000e+00],
       [3.200e+01, 1.280e+02],
       [5.120e+02, 2.048e+03]])

In [70]:
b.base

array([[1.000e+00, 2.000e+00, 4.000e+00, 8.000e+00],
       [1.600e+01, 3.200e+01, 6.400e+01, 1.280e+02],
       [2.560e+02, 5.120e+02, 1.024e+03, 2.048e+03]])

In [71]:
b.base is arr

True

In [72]:
c = arr[:,[1,3]]

In [73]:
c

array([[2.000e+00, 8.000e+00],
       [3.200e+01, 1.280e+02],
       [5.120e+02, 2.048e+03]])

In [74]:
c.base

array([[2.000e+00, 3.200e+01, 5.120e+02],
       [8.000e+00, 1.280e+02, 2.048e+03]])

In [75]:
c.base is arr

False

In [76]:
d = arr[:, [False, True, False, True]]
d

array([[2.000e+00, 8.000e+00],
       [3.200e+01, 1.280e+02],
       [5.120e+02, 2.048e+03]])

In [77]:
d.base is arr

False

In [78]:
arr[0,1] = 100

In [79]:
a

array([[ 100.,    4.],
       [  32.,   64.],
       [ 512., 1024.]])

In [80]:
b

array([[ 100.,    8.],
       [  32.,  128.],
       [ 512., 2048.]])

In [81]:
c

array([[2.000e+00, 8.000e+00],
       [3.200e+01, 1.280e+02],
       [5.120e+02, 2.048e+03]])

In [82]:
d

array([[2.000e+00, 8.000e+00],
       [3.200e+01, 1.280e+02],
       [5.120e+02, 2.048e+03]])

## Indexing in Pandas

In [83]:
df = pd.DataFrame(data=data, index=index)

In [84]:
df['a':'c']

Unnamed: 0,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24


In [85]:
df['a':'c'].to_numpy().base

array([[ 1,  2,  4,  8, 16],
       [ 1,  3,  9, 27, 81],
       [45, 98, 24, 11, 64]])

In [87]:
df['a':'c'].to_numpy().base is df.to_numpy().base

True

Like with NumPy, slicing has returned a shallow copy (a view). Indexing with a list of column labels will return a deep copy.

In [88]:
df[['x','y']]

Unnamed: 0,x,y
a,1,1
b,2,3
c,4,9
d,8,27
e,16,81


In [89]:
df[['x','y']].to_numpy().base

array([[ 1,  2,  4,  8, 16],
       [ 1,  3,  9, 27, 81]])

# Hierarchical Indexing and SettingWithCopyWarning

Multi-indexing is performed using tuples as row or column labels:

In [91]:
# Tuple keys become a two-level column MultiIndex: (top-level group, column).
df = pd.DataFrame(
    {
        ("powers", "x"): 2 ** np.arange(5),
        ("powers", "y"): 3 ** np.arange(5),
        ("random", "z"): np.array([45, 98, 24, 11, 64]),
    },
    index=list("abcde"),
)

In [92]:
df

Unnamed: 0_level_0,powers,powers,random
Unnamed: 0_level_1,x,y,z
a,1,1,45
b,2,3,98
c,4,9,24
d,8,27,11
e,16,81,64


In [93]:
df['powers']

Unnamed: 0,x,y
a,1,1
b,2,3
c,4,9
d,8,27
e,16,81


In [94]:
df['powers','x']

a     1
b     2
c     4
d     8
e    16
Name: (powers, x), dtype: int64

In [95]:
df['powers', 'x'] = 0
df

Unnamed: 0_level_0,powers,powers,random
Unnamed: 0_level_1,x,y,z
a,0,1,45
b,0,3,98
c,0,9,24
d,0,27,11
e,0,81,64


In [96]:
# Rebuild the two-level-column frame from scratch so the next cells start
# from the original values rather than the zeroed ('powers', 'x') column.
df = pd.DataFrame(
    {
        ("powers", "x"): 2 ** np.arange(5),
        ("powers", "y"): 3 ** np.arange(5),
        ("random", "z"): np.array([45, 98, 24, 11, 64]),
    },
    index=list("abcde"),
)

In [97]:
df.loc[['a', 'b'], 'powers']

Unnamed: 0,x,y
a,1,1
b,2,3


In [98]:
df.loc[['a', 'b'], ('powers','x')]

a    1
b    2
Name: (powers, x), dtype: int64

In [99]:
# Single .loc call with a row-label list and a full (level-0, level-1) column
# tuple: assigns directly into df, with no chained indexing. The value is a
# float (0.), and in this run the whole ('powers', 'x') column was upcast
# from int64 to float64, as the output and df.dtypes show.
df.loc[['a', 'b'], ('powers','x')] = 0.
df

Unnamed: 0_level_0,powers,powers,random
Unnamed: 0_level_1,x,y,z
a,0.0,1,45
b,0.0,3,98
c,4.0,9,24
d,8.0,27,11
e,16.0,81,64


In [100]:
df.dtypes

powers  x    float64
        y      int64
random  z      int64
dtype: object