In [None]:
pip install -U fortran-magic

Collecting fortran-magic
  Downloading fortran_magic-0.7-py3-none-any.whl (9.6 kB)
Installing collected packages: fortran-magic
Successfully installed fortran-magic-0.7


In [None]:
%reload_ext fortranmagic

  self._lib_dir = os.path.join(get_ipython_cache_dir(), 'fortran')


In [None]:
%matplotlib inline
%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rc('figure', figsize=(12, 7))

ran_the_first_cell = True

jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
calendar = jan2017.values.astype('datetime64[D]')

event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
event_values = np.array([10, 15, 20])

The fortranmagic extension is already loaded. To reload it, use:
  %reload_ext fortranmagic


<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="images/me.jpg" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [None]:
assert ran_the_first_cell, "Oh noes!"

In [None]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

In [None]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("First:", first)
print("Second:", second)

First: 1
Second: two


In [None]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


In [None]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

In [None]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

In [None]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

In [None]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

In [None]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

In [None]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


##**Mi Ejemplo 1**

In [None]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

In [None]:
lista_1 = [1.89,'Caballo',3,4,"Oveja","Casa","Merienda",3.2, True, 4,7, 8.9]
lista_1

[1.89, 'Caballo', 3, 4, 'Oveja', 'Casa', 'Merienda', 3.2, True, 4, 7, 8.9]

##**Mi Ejemplo 2**

In [None]:
first_example = lista_1[0]
second_example = lista_1[4]
third_example = lista_1[5]
print("Primer elemento de la lista es: ", first_example)
print("El quinto elemento de la lista es : ", second_example)
print("El sexto elemento de la lista es: ", third_example)

Primer elemento de la lista es:  1.89
El quinto elemento de la lista es :  Oveja
El sexto elemento de la lista es:  Casa


##**Mi Ejemplo 3**

In [None]:
last_example = lista_1[-1]
penultimate_example = lista_1[-2]
print("El último elemento de la lista es: ", last_example)
print("El elemento inmediatamente anterior al último: ", penultimate_example)

El último elemento de la lista es:  8.9
El elemento inmediatamente anterior al último:  7


##**Mi Ejemplo 4** 

In [None]:
sublist_example = lista_1[0:5]
print('La sublista desde el elemento 1 al 5 es:', sublist_example)

La sublista desde el elemento 1 al 5 es: [1.89, 'Caballo', 3, 4, 'Oveja']


##**Mi Ejemplo 5**

In [None]:
first_three_ex = lista_1[:3]
print('Los primeros tres elementos de la lista son:', first_three_ex)

Los primeros tres elementos de la lista son: [1.89, 'Caballo', 3]


##**Mi Ejemplo 6**

In [None]:
after_four_ex = lista_1[4:]
print('Los último elementos despues del cuatro son:', after_four_ex)

Los último elementos despues del cuatro son: ['Oveja', 'Casa', 'Merienda', 3.2, True, 4, 7, 8.9]


##**Mi Ejemplo 7**

In [None]:
abcd= ["a",6.7, 'k','m', 34,  'a', 'u', 24, 'd', "e", "i","o", "u"]
print('Los elementos pares de la lista son:',abcd[0::2]  )

Los elementos pares de la lista son: ['a', 'k', 34, 'u', 'd', 'i', 'u']


##**Mi Ejemplo 8**

In [None]:
print('La lista al revés es:', abcd[::-1])

La lista al revés es: ['u', 'o', 'i', 'e', 'd', 24, 'u', 'a', 34, 'm', 'k', 6.7, 'a']


##**Mi Ejemplo 9**

In [None]:
lista_2= [2,4,6,8,10,12,14,16,18]
print("La lista antes:", lista_2)
lista_2.append(20)
print("La lista despues:",lista_2)

La lista antes: [2, 4, 6, 8, 10, 12, 14, 16, 18]
La lista despues: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


##**Mi Ejemplo 10**

In [None]:
a=[x/2 for x in lista_2]
print('La lista al dividir los números a mitad es:', a)

La lista al dividir los números a mitad es: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]


##**Mi Ejemplo 11**

In [None]:
lista_1.append(lista_2)
print("La lista 1 con la lista 2 como un elemento más es:",lista_1)

La lista 1 con la lista 2 como un elemento más es: [1.89, 'Caballo', 3, 4, 'Oveja', 'Casa', 'Merienda', 3.2, True, 4, 7, 8.9, [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]]


## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [None]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'Bertrand': 'Russell', 'David': 'Hume', 'Immanuel': 'Kant'}

In [None]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'Bertrand': 'Russell',
 'David': 'Hume',
 'Immanuel': 'Kant',
 'Ludwig': 'Wittgenstein'}

In [None]:
del philosophers['David']
philosophers

{'Bertrand': 'Russell', 'Immanuel': 'Kant', 'Ludwig': 'Wittgenstein'}

In [None]:
# No slicing.
philosophers['Bertrand':'Immanuel']

NameError: ignored

##**Mi Ejemplo 12**

In [None]:
frutas = {'Manzana':'Malus domestica', 'Fresa':'Fragaria',  'Pera':'Pyrus communis'}
print('Mi diccionario de frutas es', frutas)

Mi diccionario de frutas es {'Manzana': 'Malus domestica', 'Fresa': 'Fragaria', 'Pera': 'Pyrus communis'}


##**Mi Ejemplo 13**

In [None]:
frutas['Uva']= 'Vitis vinifera'
frutas

{'Fresa': 'Fragaria',
 'Manzana': 'Malus domestica',
 'Pera': 'Pyrus communis',
 'Uva': 'Vitis vinifera'}

##**Mi Ejemplo 14**

In [None]:
del frutas['Pera']
frutas

{'Fresa': 'Fragaria', 'Manzana': 'Malus domestica', 'Uva': 'Vitis vinifera'}

##**Mi Ejemplo 15**

In [None]:
frutas["Pera":"Malus domestica"]

TypeError: ignored

## Review: Python Dictionaries

- Unordered key-value mapping from (almost) arbitrary keys to arbitrary values.
- Efficient (`O(1)`) lookup, insertion, and deletion.
- No slicing (would require a notion of order).

In [None]:
# Suppose we have some matrices...
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B."""
    rows_out = len(A)
    cols_out = len(B[0])
    out = [[0 for col in range(cols_out)] for row in range(rows_out)]
    
    for i in range(rows_out):
        for j in range(cols_out):
            for k in range(len(B)):
                out[i][j] += A[i][k] * B[k][j]
    return out

In [None]:
%%time

matmul(a, b)

CPU times: user 39 µs, sys: 0 ns, total: 39 µs
Wall time: 43.2 µs


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

In [None]:
import random
def random_matrix(m, n):
    out = []
    for row in range(m):
        out.append([random.random() for _ in range(n)])
    return out

randm = random_matrix(2, 3)
randm

[[0.8546702415098711, 0.5209555545083568, 0.3759823914279984],
 [0.03953084831291387, 0.28538500524333177, 0.8572740708942216]]

In [None]:
%%time
randa = random_matrix(600, 100)
randb = random_matrix(100, 600)
x = matmul(randa, randb)

CPU times: user 10.3 s, sys: 43.8 ms, total: 10.3 s
Wall time: 12.1 s


In [None]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    return sum(x * y for x, y in zip(xs, ys))

In [None]:
%%fortr
subroutine fortran_dot_product(xs, ys, result)
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    result = sum(xs * ys)
end

UsageError: Cell magic `%%fortr` not found.


In [None]:
import numpy as np
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [None]:
%%time
python_dot_product(list_data, list_data)

CPU times: user 11 ms, sys: 0 ns, total: 11 ms
Wall time: 11 ms


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

NameError: ignored

##**Mi Ejemplo 16**

In [None]:
a_1 = [[1, 5, 9],
        [2, 6, 10],
        [3, 7, 11],
        [4, 8, 12]]

b_1 = [[1, 4, 6, 9],
        [7, 1, 2, 5]]

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B."""
    rows_out = len(A)
    cols_out = len(B[0])
    out = [[0 for col in range(cols_out)] for row in range(rows_out)]
    
    for i in range(rows_out):
        for j in range(cols_out):
            for k in range(len(B)):
                out[i][j] += A[i][k] * B[k][j]
    return out

In [None]:
%%time

matmul(a_1, b_1)

CPU times: user 64 µs, sys: 0 ns, total: 64 µs
Wall time: 70.1 µs


[[36, 9, 16, 34], [44, 14, 24, 48], [52, 19, 32, 62], [60, 24, 40, 76]]

##**Mi Ejemplo 17**

In [None]:
randm_ex = random_matrix(2,2)
randm_ex

[[0.10841352958205186, 0.2243897037267497],
 [0.15649267838727143, 0.7547311494842249]]

##**Mi Ejemplo 18**

In [None]:
%%time
randa_1 = random_matrix(8, 10)
randb_1 = random_matrix(10, 8)
x = matmul(randa_1, randb_1)

CPU times: user 373 µs, sys: 0 ns, total: 373 µs
Wall time: 381 µs


##**Mi Ejemplo 19**

In [None]:
import numpy as np

list_data_ex = [float(i) for i in range(300000)]
array_data_ex = np.array(list_data)

NameError: ignored

## Why is the Python Version so Much Slower?

In [None]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    return [x * y for x, y in zip(xs, ys)]

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

[1, (4+0j), 9.0, 'fourfourfourfour']

In [None]:
# Interpretation overhead.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)

  1           0 LOAD_NAME                0 (a)
              2 LOAD_NAME                1 (b)
              4 LOAD_NAME                2 (c)
              6 BINARY_MULTIPLY
              8 BINARY_ADD
             10 RETURN_VALUE


## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamicism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [None]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

array([1, 2, 3, 4])

In [None]:
data + data

array([2, 4, 6, 8])

In [None]:
%%time
# Naive dot product
(data * data).sum()

CPU times: user 98 µs, sys: 0 ns, total: 98 µs
Wall time: 118 µs


30

In [None]:
%%time
# Built-in dot product.
data.dot(data)

CPU times: user 100 µs, sys: 0 ns, total: 100 µs
Wall time: 107 µs


30

In [None]:
%%time
fortran_dot_product(data, data)

NameError: ignored

##**Mi Ejemplo 20**

In [None]:
import numpy as np

datos_2 = np.array([9,7,7,6,5,4,2,2,1])
datos_2

array([9, 7, 7, 6, 5, 4, 2, 2, 1])

##**Mi Ejemplo 21**



In [None]:
datos_2 + datos_2

array([18, 14, 14, 12, 10,  8,  4,  4,  2])

##**Mi Ejemplo 22**


In [None]:
(datos_2 * datos_2).sum()

265

##**Mi Ejemplo 23**



In [None]:
%%time
# Built-in dot product.
datos_2.dot(datos_2)

CPU times: user 34 µs, sys: 3 µs, total: 37 µs
Wall time: 42 µs


265

##**Mi Ejemplo 24**


In [None]:
%%time
fortran_dot_product(datos_2, datos_2)

NameError: ignored

In [None]:
# Numpy won't allow us to write a string into an int array.
datos[0] = "foo"

NameError: ignored

##**Mi Ejemplo 25**



In [None]:
data[2] = "hello world"

In [None]:
# We also can't grow an array once it's created.
data.append(3)

AttributeError: ignored

##**Mi Ejemplo 26**


In [None]:
data_2.append(8)

NameError: ignored

In [None]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

array([[1, 2],
       [3, 4]])

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

##**Mi Ejemplo 27**


In [None]:
two_by_two_ex = data_2.reshape(3, 3)
two_by_two_ex

NameError: ignored

# What's in an Array?

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

Array:
[[1 2 3]
 [4 5 6]]
DType: int16
Shape: (2, 3)
Strides: (6, 2)
Data: b'\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00'


# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [None]:
data = np.arange(15).reshape(3, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [None]:
# Binary operators.
data * data

array([[  0,   1,   4,   9,  16],
       [ 25,  36,  49,  64,  81],
       [100, 121, 144, 169, 196]])

In [None]:
# Unary functions.
np.sqrt(data)

array([[0.        , 1.        , 1.41421356, 1.73205081, 2.        ],
       [2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ],
       [3.16227766, 3.31662479, 3.46410162, 3.60555128, 3.74165739]])

In [None]:
# Comparison operations
(data % 3) == 0

array([[ True, False, False,  True, False],
       [False,  True, False, False,  True],
       [False, False,  True, False, False]])

In [None]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

array([[ True, False, False, False, False],
       [False,  True, False, False, False],
       [False, False,  True, False, False]])

In [None]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

array([[ 30,  80, 130],
       [ 80, 255, 430],
       [130, 430, 730]])

##**Mi Ejemplo 28**


In [None]:
data_3 = np.arange(4,29).reshape(5, 5)
data_3

array([[ 4,  5,  6,  7,  8],
       [ 9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18],
       [19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28]])

##**Mi Ejemplo 29**

In [None]:
data_3*data_3

array([[ 16,  25,  36,  49,  64],
       [ 81, 100, 121, 144, 169],
       [196, 225, 256, 289, 324],
       [361, 400, 441, 484, 529],
       [576, 625, 676, 729, 784]])

##**Mi Ejemplo 30**


In [None]:
np.sqrt(data_3)

array([[2.        , 2.23606798, 2.44948974, 2.64575131, 2.82842712],
       [3.        , 3.16227766, 3.31662479, 3.46410162, 3.60555128],
       [3.74165739, 3.87298335, 4.        , 4.12310563, 4.24264069],
       [4.35889894, 4.47213595, 4.58257569, 4.69041576, 4.79583152],
       [4.89897949, 5.        , 5.09901951, 5.19615242, 5.29150262]])

##**Mi Ejemplo 31**


In [None]:
data_3% 2==3

array([[False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False]])

##**Mi Ejemplo 32**


In [None]:
((data_3 % 2) == 1) & ((data_3 % 3) == 0)

array([[False, False, False, False, False],
       [ True, False, False, False, False],
       [False,  True, False, False, False],
       [False, False,  True, False, False],
       [False, False, False,  True, False]])

##**Mi Ejemplo 33**


In [None]:
data_3 @ data_3.T

array([[ 190,  340,  490,  640,  790],
       [ 340,  615,  890, 1165, 1440],
       [ 490,  890, 1290, 1690, 2090],
       [ 640, 1165, 1690, 2215, 2740],
       [ 790, 1440, 2090, 2740, 3390]])

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [None]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

array([0.        , 0.34185385, 0.64251645, 0.86575984, 0.98468459,
       0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [None]:
sines[:3]  # First three elements  

array([0.        , 0.34185385, 0.64251645])

In [None]:
# Slicing works with the same semantics as Python lists.
sines[0]

0.0

In [None]:
sines[::2]  # Every other element.

array([0.        , 0.64251645, 0.98468459, 0.8665558 , 0.34335012])

In [None]:
sines[5:]  # Elements from 5 on.

array([0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

sines:
 [0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]
sines > 0.5:
 [False False  True  True  True  True  True  True False False]
sines[sines > 0.5]:
 [0.64251645 0.86575984 0.98468459 0.98496101 0.8665558  0.64373604]


In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

[0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]


array([0.        , 0.98468459, 0.64373604])

In [None]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [None]:
sort_indices = np.argsort(unsorted_data)
sort_indices

array([4, 0, 2, 6, 1, 5, 3])

In [None]:
unsorted_data[sort_indices]

array([-1,  1,  2,  2,  3,  5, 12])

In [None]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [None]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

array(['D', 'B', 'E', 'C', 'A'], dtype='<U1')

In [None]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

NameError: ignored

In [None]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

NameError: ignored

##**Mi Ejemplo 34**


In [None]:
sines_ex = np.sin(np.linspace(2,54.57, 18))
cosines_ex = np.cos(np.linspace(0, 3.14, 17))
sines_ex

array([ 0.90929743, -0.928678  ,  0.94580741, -0.96064412,  0.97315217,
       -0.98330125,  0.99106674, -0.99642983,  0.99937751, -0.99990265,
        0.99800396, -0.99368605,  0.98695938, -0.97784027,  0.96635081,
       -0.95251887,  0.93637796, -0.91796721])

##**Mi Ejemplo 35**


In [None]:
sines_ex[0]

0.9092974268256817

##**Mi Ejemplo 36**


In [None]:
sines_ex[:2]

array([ 0.90929743, -0.928678  ])

##**Mi Ejemplo 37**


In [None]:
sines_ex[5:] 

array([-0.98330125,  0.99106674, -0.99642983,  0.99937751, -0.99990265,
        0.99800396, -0.99368605,  0.98695938, -0.97784027,  0.96635081,
       -0.95251887,  0.93637796, -0.91796721])

##**Mi Ejemplo 38**


In [None]:
sines_ex[::2]

array([0.90929743, 0.94580741, 0.97315217, 0.99106674, 0.99937751,
       0.99800396, 0.98695938, 0.96635081, 0.93637796])

##**Mi Ejemplo 39**


In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.2)
print("sines[sines > 0.5]:\n", sines_ex[sines_ex >-0.5])

sines:
 [0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]
sines > 0.5:
 [False  True  True  True  True  True  True  True  True False]
sines[sines > 0.5]:
 [0.90929743 0.94580741 0.97315217 0.99106674 0.99937751 0.99800396
 0.98695938 0.96635081 0.93637796]


##**Mi Ejemplo 40**


In [None]:
print(sines_ex)
sines_ex[[1, 2, 8]]

[ 0.90929743 -0.928678    0.94580741 -0.96064412  0.97315217 -0.98330125
  0.99106674 -0.99642983  0.99937751 -0.99990265  0.99800396 -0.99368605
  0.98695938 -0.97784027  0.96635081 -0.95251887  0.93637796 -0.91796721]


array([-0.928678  ,  0.94580741,  0.99937751])

##**Mi Ejemplo 41**


In [2]:
import numpy as np
unsorted_data_3 = np.array([1,4,5,7,8,4,4,2,2,34,7,5,])

##**Mi Ejemplo 42**


In [4]:
sort_indices_3 = np.argsort(unsorted_data_3)
sort_indices_3


array([ 0,  7,  8,  1,  5,  6,  2, 11,  3, 10,  4,  9])

##**Mi Ejemplo 43**


In [5]:
unsorted_data_3[sort_indices_3]

array([ 1,  2,  2,  4,  4,  4,  5,  5,  7,  7,  8, 34])

##**Mi Ejemplo 44**


In [6]:
market_caps_3 = np.array([3, 43, 3, 41, 55, 49])  # Presumably in dollars?
assets_3 = np.array(['A', 'B', 'C', 'D', 'E', "I"])

In [8]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap_3 = np.argsort(market_caps_3)
assets_3[sort_by_mcap_3]

array(['A', 'C', 'D', 'B', 'I', 'E'], dtype='<U1')

On multi-dimensional arrays, we can slice along each axis independently.

In [9]:
data = np.arange(25).reshape(5, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [10]:
data[:2, :2]  # First two rows and first two columns.

array([[0, 1],
       [5, 6]])

In [18]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24]])

In [19]:
data[:3, [0, -1]] 

array([[ 0,  4],
       [ 5,  9],
       [10, 14]])

##**Mi ejemplo 45**


In [11]:
data_3 = np.arange(64).reshape(8, 8)
data_3

array([[ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29, 30, 31],
       [32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47],
       [48, 49, 50, 51, 52, 53, 54, 55],
       [56, 57, 58, 59, 60, 61, 62, 63]])

##**Mi Ejemplo 46**


In [12]:
data_3[:3, :2] 

array([[ 0,  1],
       [ 8,  9],
       [16, 17]])

In [13]:
data[:2, [0, -1]]  # First two rows, first and last columns.

array([[0, 4],
       [5, 9]])

##**Mi Ejemplo 47**


In [17]:
data_3[:4, [0, -2]] 

array([[ 0,  6],
       [ 8, 14],
       [16, 22],
       [24, 30]])

##**Mi Ejemplo 48**


In [21]:
data_3[(data_3[:,0]% 5)==1]

array([[16, 17, 18, 19, 20, 21, 22, 23],
       [56, 57, 58, 59, 60, 61, 62, 63]])

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sqrt{\sum_{i=1}^N (x_i - \bar{x})^2}$

In [22]:
def variance(x):
    return ((x - x.mean()) ** 2).sum() / len(x)

In [23]:
variance(np.random.standard_normal(1000))

0.9193565154691172

In [24]:
variance(np.random.standard_normal(2000))

1.0058740718577177

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [25]:
data = np.arange(30)
data.mean()

14.5

- ...but we can do more interesting things with multi-dimensional arrays.

In [28]:
data = np.arange(30).reshape(3, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])

In [31]:
data.mean()

14.5

In [33]:
data.mean(axis=0)

array([10., 11., 12., 13., 14., 15., 16., 17., 18., 19.])

In [35]:
data.mean(axis=1)

array([ 4.5, 14.5, 24.5])

##**Mi Ejemplo 49**


In [27]:
data_3 = np.arange(20)
data_3.mean()

9.5

##**Mi Ejemplo 50**


In [30]:
data_3 = np.arange(20).reshape(5, 4)
data_3

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

##**Mi Ejemplo 51**


In [32]:
data_3.mean()

9.5

##**Mi Ejemplo 52**


In [34]:
data_3.mean(axis=0)

array([ 8.,  9., 10., 11.])

##**Mi Ejemplo 53**


In [36]:
data_3.mean(axis=1)

array([ 1.5,  5.5,  9.5, 13.5, 17.5])

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [37]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

Row:
[1 2 3 4]
Column:
[[1]
 [2]
 [3]]


In [40]:
row + column

array([[2, 3, 4, 5],
       [3, 4, 5, 6],
       [4, 5, 6, 7]])

In [46]:
column = np.array([[1], [2], [3],[4],[5]])


##**Mi Ejemplo 54**


In [39]:
row_3 = np.array([7,9,8,7,6,0])
column_3 = np.array([[1], [8], [2],[3],[7]])
print("Row:\n", row_3, sep='')
print("Column:\n", column_3, sep='')

Row:
[7 9 8 7 6 0]
Column:
[[1]
 [8]
 [2]
 [3]
 [7]]


##**Mi Ejemplo 55**


In [44]:
row_3 + column_3

array([[ 8, 10,  9,  8,  7,  1],
       [15, 17, 16, 15, 14,  8],
       [ 9, 11, 10,  9,  8,  2],
       [10, 12, 11, 10,  9,  3],
       [14, 16, 15, 14, 13,  7]])

<center><img src="images/broadcasting.png" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [47]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data, sep='')
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

Data:
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]]
Mean:
[10. 11. 12. 13. 14. 15. 16. 17. 18. 19.]
Data - Mean:
[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 10.  10.  10.  10.  10.  10.  10.  10.  10.  10.]]


##**Mi Ejemplo 56**





In [49]:
print("Data:\n", data_3, sep='')
print("Mean:\n", data_3.mean(axis=0), sep='')
print("Data - Mean:\n", data_3 - data_3.mean(axis=0), sep='')

Data:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]]
Mean:
[ 8.  9. 10. 11.]
Data - Mean:
[[-8. -8. -8. -8.]
 [-4. -4. -4. -4.]
 [ 0.  0.  0.  0.]
 [ 4.  4.  4.  4.]
 [ 8.  8.  8.  8.]]


# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithms to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="images/unicorn.jpg" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="images/panda-wrangling.gif" alt="Drawing" style="width: 75%;"/></center>

<center><img src="images/pandas_logo.png" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogenously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogenous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [52]:
import pandas as pd
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [57]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

The index is: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
The values are: [1 2 3 4 5]


In [59]:
# We can look up values out of a Series by position...
s.iloc[0]

1

In [61]:
# ... or by label.
s.loc['a']

1

In [63]:
# Slicing works as expected...
s.iloc[:2]

a    1
b    2
dtype: int64

In [66]:
# ...but it works with labels too!
s.loc[:'c']

a    1
b    2
c    3
dtype: int64

In [68]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

a    1
e    5
dtype: int64

In [69]:
# As does boolean masking.
s.loc[s > 2]

c    3
d    4
e    5
dtype: int64

In [70]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

a    10.0
c    20.0
d    30.0
z    40.0
dtype: float64

In [71]:
s + other_s

a    11.0
b     NaN
c    23.0
d    34.0
e     NaN
z     NaN
dtype: float64

In [72]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

a    11.0
b     0.0
c    23.0
d    34.0
e     0.0
z     0.0
dtype: float64

In [73]:
# Most real datasets are read in from an external file format.
aapl = pd.read_csv('AAPL.csv', parse_dates=['Date'], index_col='Date')
aapl.head()

FileNotFoundError: ignored

In [None]:
aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

NameError: ignored

##**Mi Ejemplo 57**


In [56]:
s_3 = pd.Series(index=['L', 'A', 'U', 'R', 'A'], data=[12,1,22,19,1])
s_3

L    12
A     1
U    22
R    19
A     1
dtype: int64

##**Mi Ejemplo 58**


In [58]:
print("The index is:", s_3.index)
print("The values are:", s_3.values)

The index is: Index(['L', 'A', 'U', 'R', 'A'], dtype='object')
The values are: [12  1 22 19  1]


##**Mi Ejemplo 59**


In [60]:
s_3.iloc[2]

22

##**Mi Ejemplo 60**


In [62]:
s_3.loc["U"]

22

##**Mi Ejemplo 61**


In [65]:
s_3.iloc[:5]

L    12
A     1
U    22
R    19
A     1
dtype: int64

##**Mi Ejemplo 62**


In [67]:
s_3.loc[:'R']

L    12
A     1
U    22
R    19
dtype: int64

##**Mi Ejemplo 63**


In [75]:
s_3.iloc[[0, -3]]

L    12
U    22
dtype: int64

##**Mi Ejemplo 64**


In [77]:
s_3.loc[s_3 < 15]

L    12
A     1
A     1
dtype: int64

##**Mi Ejemplo 65**


In [79]:
other_s_3 = pd.Series({'J': 10, 'U': 22, 'D':4, 'I': 9, 'T': 21, "H":8})
other_s_3

J    10
U    22
D     4
I     9
T    21
H     8
dtype: int64

##**Mi Ejemplo 66**


In [80]:
s_3 + other_s_3

A     NaN
A     NaN
D     NaN
H     NaN
I     NaN
J     NaN
L     NaN
R     NaN
T     NaN
U    44.0
dtype: float64

##**Mi Ejemplo 67**


In [83]:
(s_3 + other_s_3).fillna(5)

A     5.0
A     5.0
D     5.0
H     5.0
I     5.0
J     5.0
L     5.0
R     5.0
T     5.0
U    44.0
dtype: float64

##**Mi Ejemplo 68**


In [None]:
# Slicing generalizes to two dimensions as you'd expect:
aapl.iloc[:2, :2]

NameError: ignored

# Rolling Operations

<center><img src="images/rolling.gif" alt="Drawing" style="width: 75%;"/></center>

In [None]:
aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

NameError: ignored

In [None]:
# Drop `Volume`, since it's way bigger than everything else.
aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
# 30-day rolling exponentially-weighted stddev of returns.
aapl['Close'].pct_change().ewm(span=30).std().plot();

# "Real World" Data

In [None]:
from demos.avocados import read_avocadata

avocados = read_avocadata('2014', '2016')
avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
avocados.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
hass = avocados[avocados.Variety == 'HASS']
hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

In [None]:
def _organic_spread(group):

    if len(group.columns) != 2:
        return pd.Series(index=group.index, data=0.0)
    
    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    organics = group.loc[:, is_organic].squeeze()
    non_organics = group.loc[:, ~is_organic].squeeze()
    diff = organics - non_organics
    return diff

def organic_spread_by_region(df):
    """What's the difference between the price of an organic 
    and non-organic avocado within each region?
    """
    return (
        df
        .set_index(['Date', 'Region', 'Organic'])
         ['Weighted Avg Price']
        .unstack(level=['Region', 'Organic'])
        .ffill()
        .groupby(level='Region', axis=1)
        .apply(_organic_spread)
    )

In [None]:
organic_spread_by_region(hass).plot();
plt.gca().set_title("Daily Regional Organic Spread");
plt.legend(bbox_to_anchor=(1, 1));

In [None]:
spread_correlation = organic_spread_by_region(hass).corr()
spread_correlation

In [None]:
import seaborn as sns
grid = sns.clustermap(spread_correlation, annot=True)
fig = grid.fig
axes = fig.axes
ax = axes[2]
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

# Pandas Review

- Pandas extends numpy with more complex datastructures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!