In [30]:
import pandas as pd
import numpy as np

# A Note on Column (Re)Assignment

* https://afraenkel.github.io/practical-data-science/04/modifying-dataframes.html#warning-chained-assignment 
* It is not always clear when Pandas returns a copy vs view.
* Chained assignment is dangerous, don't do it!
* See Chapter 2.3 and 4.1 in the textbook.
* Caution: Tutorials lie to you!

### TL;DR: Make copies whenever possible, and use `assign` for adding columns

In [31]:
# 5 columns, 10 rows with random ints
def create_rands(seed=42):
    """Return a reproducible 10-row, 5-column DataFrame of random integers.

    Values are drawn from 0..4 (the upper bound 5 is exclusive) and the
    columns are named col0 .. col4.

    Parameters
    ----------
    seed : int, default 42
        Seed for the random generator. With the default, every call returns
        the same table.

    Returns
    -------
    pandas.DataFrame
        A new frame on every call, so callers can mutate it freely.
    """
    # A private RandomState yields the same stream that np.random.seed(seed)
    # followed by np.random.randint would, but without reseeding the global
    # NumPy generator as a side effect of calling this helper.
    rng = np.random.RandomState(seed)
    data = rng.randint(0, 5, size=(10, 5))
    df = pd.DataFrame(data, columns=['col%d' % n for n in range(5)])
    return df

# 1 -- slice rows, then change column

In [32]:
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [33]:
# What are we doing here?
# Boolean-mask selection: keep only the rows of df whose col0 value is even.

evens = df[df['col0'] % 2 == 0]
# evens = df.loc[9]
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
9,0,1,4,1,3


# Array Broadcasting

* Arrays with different sizes (usually) cannot be added, subtracted, or generally be used in arithmetic.
* A way to overcome this is to duplicate the smaller array so that it has the same dimensionality and size as the larger array. 
* This is called array **broadcasting** and is available in NumPy when performing array arithmetic.



https://machinelearningmastery.com/broadcasting-with-numpy-arrays/

In [34]:
# Scalar + array: the scalar b is broadcast, i.e. treated as if it were
# replicated to match the shape of a, so 2 is added to every element.

a = np.array([1, 2, 3])
print(a)
b = 2
print(b)
c = a + b
print(c)

[1 2 3]
2
[3 4 5]


In [35]:
# One-dimensional and two-dimensional:
# b has shape (3,) and A has shape (2, 3), so b is broadcast across
# each row of A.
A = np.array([[1, 2, 3], [1, 2, 3]])
print(A)
print()
b = np.array([10, 20, 30])
print(b)
print()
C = A + b
print(C)


[[1 2 3]
 [1 2 3]]

[10 20 30]

[[11 22 33]
 [11 22 33]]


In [36]:
# Limitations
# Broadcasting can only be performed when, for each dimension, the sizes are
# equal or one of them is 1.
# Here the shapes are (2,) and (3,), so the addition raises a ValueError.

a = np.array([1, 2])
b = np.array([1, 2, 3])
c = a + b
c

ValueError: operands could not be broadcast together with shapes (2,) (3,) 

In [37]:
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
9,0,1,4,1,3


In [38]:
# What am I trying to do? What is the output I expect?
# Modifying the existing dataframe
# evens was produced by selecting rows of df, so it is ambiguous whether this
# assignment should also reach df -- hence the warning pandas prints.

evens['col0'] = -1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Meaning of the warning:
# You just set a new value in `evens`, but `evens` is a COPY of rows taken
# from a DIFFERENT dataframe (`df`), so `df` itself is not modified.
# The result may not be what you had in mind.

In [39]:
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,-1000,2,1,3,3
6,-1000,3,3,0,2
7,-1000,2,4,0,1
9,-1000,1,4,1,3


In [40]:
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [None]:
# ^^ Copy (evens) was created

## 2 -- slice column, then change row

In [41]:
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [42]:
# What am I doing?
# Select a single column first, then assign into some of its rows.
# NOTE: whether this also changes df depends on the pandas version/settings
# (with Copy-on-Write enabled, df is left untouched) -- check df afterwards.

col = df.loc[:, 'col0']  # slice of a single column
col.loc[df['col0'] % 2 == 0] = -1000  # set evens in this column to be -1000
col

0       3
1       1
2       3
3       1
4       1
5   -1000
6   -1000
7   -1000
8       3
9   -1000
Name: col0, dtype: int64

In [43]:
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,-1000,2,1,3,3
6,-1000,3,3,0,2
7,-1000,2,4,0,1
8,3,0,3,1,1
9,-1000,1,4,1,3


In [None]:
# ^^ it was a reference

# 3 -- select row, then change entry

# Single row is selected

Unlike the first example, here we selected a **single** row. What did we get? A copy or a reference?

In [44]:
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [47]:
# Alternative to try instead: row = df.loc[0:4]  (a multi-row slice)
row = df.loc[0]  # select the single row labelled 0 (comes back as a Series)
#row
row['col0'] = -1000  # then overwrite one entry through that selection
row

col0   -1000
col1       4
col2       2
col3       4
col4       4
Name: 0, dtype: int64

In [48]:
df

Unnamed: 0,col0,col1,col2,col3,col4
0,-1000,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [None]:
# ^^ reference!
# Here pandas handed back a view: `row` refers to the same data as `df`, so the change shows up in `df` too.
# https://stackoverflow.com/questions/23296282/what-rules-does-pandas-use-to-generate-a-view-vs-a-copy

# 4 -- select row, then change entry (w/string column)

In [49]:
# Overwrite col4 with the string 'a' so the frame mixes integer and string columns.
df = create_rands().assign(col4='a')
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,a
1,1,2,2,2,a
2,3,2,4,1,a
3,1,3,4,0,a
4,1,4,3,0,a
5,2,2,1,3,a
6,2,3,3,0,a
7,4,2,4,0,a
8,3,0,3,1,a
9,0,1,4,1,a


In [50]:
# Same two steps as in the all-integer case: select row 0, then set one entry.
# This time the frame has mixed dtypes.
row = df.loc[0]
row['col0'] = -1000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [51]:
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,a
1,1,2,2,2,a
2,3,2,4,1,a
3,1,3,4,0,a
4,1,4,3,0,a
5,2,2,1,3,a
6,2,3,3,0,a
7,4,2,4,0,a
8,3,0,3,1,a
9,0,1,4,1,a


In [52]:
row

col0    -1000
col1        4
col2        2
col3        4
col4        a
Name: 0, dtype: object

## References vs Copies
* If the table is homogeneous (a single dtype) -- pandas outsources memory management to NumPy, so a selection can be a reference (view)
* If the table is heterogeneous (mixed dtypes) -- pandas makes copies

In [53]:
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [54]:
# .values exposes the frame's data as a NumPy array; every column here is an
# integer column. Overwrite each 0 in that array, then check whether df changed.
arr = df.values
arr[arr == 0] = -100000

In [55]:
arr

array([[      3,       4,       2,       4,       4],
       [      1,       2,       2,       2,       4],
       [      3,       2,       4,       1,       3],
       [      1,       3,       4, -100000,       3],
       [      1,       4,       3, -100000, -100000],
       [      2,       2,       1,       3,       3],
       [      2,       3,       3, -100000,       2],
       [      4,       2,       4, -100000,       1],
       [      3, -100000,       3,       1,       1],
       [-100000,       1,       4,       1,       3]])

In [56]:
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,-100000,3
4,1,4,3,-100000,-100000
5,2,2,1,3,3
6,2,3,3,-100000,2
7,4,2,4,-100000,1
8,3,-100000,3,1,1
9,-100000,1,4,1,3


In [57]:
# Add a string column (col5) so the frame is no longer all-integer.
df = create_rands().assign(col5='a')
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,3,4,2,4,4,a
1,1,2,2,2,4,a
2,3,2,4,1,3,a
3,1,3,4,0,3,a
4,1,4,3,0,0,a
5,2,2,1,3,3,a
6,2,3,3,0,2,a
7,4,2,4,0,1,a
8,3,0,3,1,1,a
9,0,1,4,1,3,a


In [58]:
# Same experiment on the mixed-dtype frame: .values now returns a single
# object-dtype array holding both the ints and the strings.
# Overwrite each 0 in that array, then check whether df changed.
arr = df.values
arr[arr == 0] = -100000

In [59]:
arr

array([[3, 4, 2, 4, 4, 'a'],
       [1, 2, 2, 2, 4, 'a'],
       [3, 2, 4, 1, 3, 'a'],
       [1, 3, 4, -100000, 3, 'a'],
       [1, 4, 3, -100000, -100000, 'a'],
       [2, 2, 1, 3, 3, 'a'],
       [2, 3, 3, -100000, 2, 'a'],
       [4, 2, 4, -100000, 1, 'a'],
       [3, -100000, 3, 1, 1, 'a'],
       [-100000, 1, 4, 1, 3, 'a']], dtype=object)

In [60]:
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,3,4,2,4,4,a
1,1,2,2,2,4,a
2,3,2,4,1,3,a
3,1,3,4,0,3,a
4,1,4,3,0,0,a
5,2,2,1,3,3,a
6,2,3,3,0,2,a
7,4,2,4,0,1,a
8,3,0,3,1,1,a
9,0,1,4,1,3,a


In [None]:
# Use .copy and .assign as much as you can