In [None]:
!pip install githubdl

Collecting githubdl
  Downloading https://files.pythonhosted.org/packages/f9/18/d11b9ea8a29440c21dd2ba1e4b6e3d859d5871ab4db6ffdf65a7a7074f4a/githubdl-0.1.6-py3-none-any.whl
Installing collected packages: githubdl
Successfully installed githubdl-0.1.6


In [None]:
!githubdl

usage: githubdl [-h] (-f FILE | -d DIR | -a | -b) -u URL [-t TARGET]
                [-g GIT_TOKEN] [-l LOG_LEVEL] [-r REFERENCE] [-s]
githubdl: error: the following arguments are required: -u/--url


In [None]:
!githubdl -u https://github.com/ssanderson/pydata-toolbox -d demos -g c5deee5c551fd7c1598a2e1c5f5516b58d5f3fcd
!githubdl -u https://github.com/ssanderson/pydata-toolbox -d notebooks -g c5deee5c551fd7c1598a2e1c5f5516b58d5f3fcd
!githubdl -u https://github.com/juarodriguezc/AlgorithmsUN2021I -d Lab2/files -g c5deee5c551fd7c1598a2e1c5f5516b58d5f3fcd

In [None]:
pip install -U fortran-magic

Collecting fortran-magic
  Downloading https://files.pythonhosted.org/packages/53/b1/d664bd431354c450f40fa436b2c9306666cd417b23098dd5636751c557a4/fortran_magic-0.7-py3-none-any.whl
Installing collected packages: fortran-magic
Successfully installed fortran-magic-0.7


In [None]:
%reload_ext fortranmagic
instalacion = True

  self._lib_dir = os.path.join(get_ipython_cache_dir(), 'fortran')


In [None]:
%matplotlib inline
%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rc('figure', figsize=(12, 7))

ran_the_first_cell = True

jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
calendar = jan2017.values.astype('datetime64[D]')

event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
event_values = np.array([10, 15, 20])

The fortranmagic extension is already loaded. To reload it, use:
  %reload_ext fortranmagic


<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="images/me.jpg" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](https://www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](https://github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [None]:
assert ran_the_first_cell, "Oh noes!"

In [None]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

In [None]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("first:", first)
print("second:", second)

first: 1
second: two


In [None]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


In [None]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

In [None]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

In [None]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

In [None]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

In [None]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

In [None]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


In [None]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

**Mi ejemplo para esta sección:**

In [None]:
# Program that sorts a list with a brute-force (selection-style) algorithm.
a = [4, 3, 6, 7, 1, 9, 7, 8] # Enter the list to sort here

def BuscarMenor(*entrada):
  """Return the index of the first smallest element of a sequence.

  The sequence is taken from the first positional argument (entrada[0]).
  Raises IndexError if that sequence is empty.
  """
  args = entrada[0]
  menor = args[0]
  x = 0
  for i in range(1, len(args)):
    # Strict comparison: on ties the earliest index wins.
    if args[i] < menor:
      menor = args[i]
      x = i
  return x

def BuscarMayor(*entrada):
  """Return the index of the first largest element of a sequence.

  The sequence is taken from the first positional argument (entrada[0]).
  Raises IndexError if that sequence is empty.
  """
  args = entrada[0]
  mayor = args[0]
  x = 0
  for i in range(1, len(args)):
    # Strict comparison: on ties the earliest index wins.
    if args[i] > mayor:
      mayor = args[i]
      x = i
  return x

# Work on a copy so the list to sort (`a`) is not overwritten by the algorithm.
pendientes = list(a)
b = []
if pendientes:  # an empty list is already sorted; nothing to search
  aMayor = pendientes[BuscarMayor(pendientes)]
  for _ in range(len(pendientes)):
    menori = BuscarMenor(pendientes)
    b.append(pendientes[menori])
    # Replace the extracted minimum with the maximum so it is never picked
    # ahead of a value that is still pending.
    pendientes[menori] = aMayor
print(b)

[1, 3, 4, 6, 7, 7, 8, 9]


## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [None]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'Bertrand': 'Russell', 'David': 'Hume', 'Immanuel': 'Kant'}

In [None]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'Bertrand': 'Russell',
 'David': 'Hume',
 'Immanuel': 'Kant',
 'Ludwig': 'Wittgenstein'}

In [None]:
del philosophers['David']
philosophers

{'Bertrand': 'Russell', 'Immanuel': 'Kant', 'Ludwig': 'Wittgenstein'}

In [None]:
# No slicing.
# philosophers['Bertrand':'Immanuel']

**Mi ejemplo para esta sección:**

In [None]:
# Dictionary mapping each letter to its position in the alphabet.
# Position 15 is left free: it is the value given to 'ñ' further down.
Letras = {
    letra: (indice + 1 if indice < 14 else indice + 2)
    for indice, letra in enumerate('abcdefghijklmnopqrstuvwxyz')
}
print(Letras)
# Dictionaries are size-mutable: add a key...
Letras['ñ'] = 15
print(Letras)
# ...and remove it again.
del Letras['ñ']
print(Letras)
# No slicing on dictionaries:
#Letras['a' : 'f']

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, 'ñ': 15}
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}


## Review: Python Dictionaries

- Unordered key-value mapping from (almost) arbitrary keys to arbitrary values.
- Efficient (`O(1)`) lookup, insertion, and deletion.
- No slicing (would require a notion of order).

<center><img src="images/pacino.gif" alt="Drawing" style="width: 100%;"/></center>


In [None]:
# Suppose we have some matrices...
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [None]:
def matmul(A, B):
    """Multiply matrix A by matrix B, both given as lists of rows.

    The result has len(A) rows and len(B[0]) columns. The inner dimension is
    taken from len(B); the length of A's rows is not checked against it, so
    any extra columns of A are ignored.
    """
    n_rows, n_cols, inner = len(A), len(B[0]), len(B)
    result = []
    for i in range(n_rows):
        # Each entry is the dot product of row i of A with column j of B.
        result.append([
            sum(A[i][k] * B[k][j] for k in range(inner))
            for j in range(n_cols)
        ])
    return result

<center><img src="images/gross.gif" alt="Drawing" style="width: 50%;"/></center>


In [None]:
%%time

matmul(a, b)

CPU times: user 58 µs, sys: 0 ns, total: 58 µs
Wall time: 62 µs


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

In [None]:
import random
def random_matrix(m, n):
    """Return an m-by-n matrix (list of m rows of n floats) of random.random() values."""
    return [[random.random() for _ in range(n)] for _ in range(m)]

randm = random_matrix(2, 3)
randm

[[0.19198580721686442, 0.6678999174214523, 0.6665860377678822],
 [0.5193917001349029, 0.4848296705346903, 0.751932870601914]]

In [None]:
%%time
randa = random_matrix(600, 100)
randb = random_matrix(100, 600)
x = matmul(randa, randb)

CPU times: user 9.72 s, sys: 17.2 ms, total: 9.73 s
Wall time: 9.75 s


In [None]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    """Return the sum of the pairwise products of xs and ys.

    Pairs are formed with zip, so iteration stops at the shorter input.
    """
    total = 0
    for x, y in zip(xs, ys):
        total = total + x * y
    return total

In [None]:
%%fortran
! Dot product of two double-precision vectors.
!
!   xs, ys : input vectors (assumed-shape, read-only)
!   result : output scalar receiving sum(xs * ys)
!
! NOTE(review): the sizes of xs and ys are not compared here; callers are
! presumably expected to pass vectors of equal length -- confirm.
subroutine fortran_dot_product(xs, ys, result)
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    ! Elementwise product followed by a sum reduction.
    result = sum(xs * ys)
end

In [None]:
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [None]:
%%time
python_dot_product(list_data, list_data)

CPU times: user 8.42 ms, sys: 24 µs, total: 8.45 ms
Wall time: 8.4 ms


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 183 µs, sys: 0 ns, total: 183 µs
Wall time: 187 µs


333328333350000.0

<center><img src="images/sloth.gif" alt="Drawing" style="width: 1080px;"/></center>


**Mi ejemplo para esta sección:**

In [None]:
#usando el algoritmo de producto punto alojado en numPy vamos a comparar con respecto al de fortran y el ya escrito aquí
import numpy as np
def numPy_dot_product(xs, ys):
  """Return the dot product of xs and ys by delegating to np.dot."""
  producto = np.dot(xs, ys)
  return producto

In [None]:
%%time
numPy_dot_product(list_data, list_data)

CPU times: user 11.1 ms, sys: 4.01 ms, total: 15.1 ms
Wall time: 19 ms


333328333350000.0

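Nos damos cuenta de que, en esta medición, es incluso más lento que el algoritmo escrito en este colab. Esto ocurre porque le pasamos listas de Python (`list_data`): `np.dot` primero tiene que convertirlas a arreglos de NumPy, y esa conversión probablemente domina el tiempo medido. Más adelante, al usar `array_data` directamente, el producto punto de NumPy resulta tan rápido como el de Fortran.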

## Why is the Python Version so Much Slower?

In [None]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    """Return a list with the product of each pair of elements from xs and ys.

    Pairs are formed with zip; `*` is dispatched on the runtime type of each pair.
    """
    products = []
    for left, right in zip(xs, ys):
        products.append(left * right)
    return products

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

[1, (4+0j), 9.0, 'fourfourfourfour']

In [None]:
# Interpretation overhead.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)

  1           0 LOAD_NAME                0 (a)
              2 LOAD_NAME                1 (b)
              4 LOAD_NAME                2 (c)
              6 BINARY_MULTIPLY
              8 BINARY_ADD
             10 RETURN_VALUE


## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

<center><img src="images/runaway.gif" alt="Drawing" style="width: 50%;"/></center>

<center><img src="images/thisisfine.gif" alt="Drawing" style="width: 1080px;"/></center>

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [None]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

array([1, 2, 3, 4])

In [None]:
data + data

array([2, 4, 6, 8])

In [None]:
%%time
# Naive dot product
(array_data * array_data).sum()

CPU times: user 436 µs, sys: 0 ns, total: 436 µs
Wall time: 445 µs


333328333350000.0

In [None]:
%%time
# Built-in dot product.
array_data.dot(array_data)

CPU times: user 0 ns, sys: 304 µs, total: 304 µs
Wall time: 183 µs


333328333350000.0

In [None]:
%%time
fortran_dot_product(array_data, array_data)

CPU times: user 333 µs, sys: 24 µs, total: 357 µs
Wall time: 183 µs


333328333350000.0

In [None]:
# Numpy won't allow us to write a string into an int array.
#data[0] = "foo"

In [None]:
# We also can't grow an array once it's created.
#data.append(3)

In [None]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

array([[1, 2],
       [3, 4]])

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

**Mi ejemplo para esta sección:**

In [None]:
#estimar el tiempo usado para otras funciones de python como el producto matricial
two_by_three = np.array([[3, 5, 6], [4, 5, 9]])
%time
print(np.dot(two_by_two, two_by_three))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs
[[11 15 24]
 [25 35 54]]


# What's in an Array?

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

NameError: ignored

**Mi ejemplo:**

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='int64').reshape(4, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [None]:
data = np.arange(15).reshape(3, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [None]:
# Binary operators.
data * data

array([[  0,   1,   4,   9,  16],
       [ 25,  36,  49,  64,  81],
       [100, 121, 144, 169, 196]])

In [None]:
# Unary functions.
np.sqrt(data)

array([[ 0.        ,  1.        ,  1.41421356,  1.73205081,  2.        ],
       [ 2.23606798,  2.44948974,  2.64575131,  2.82842712,  3.        ],
       [ 3.16227766,  3.31662479,  3.46410162,  3.60555128,  3.74165739]])

In [None]:
# Comparison operations
(data % 3) == 0

array([[ True, False, False,  True, False],
       [False,  True, False, False,  True],
       [False, False,  True, False, False]], dtype=bool)

In [None]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

array([[ True, False, False, False, False],
       [False,  True, False, False, False],
       [False, False,  True, False, False]], dtype=bool)

In [None]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

array([[ 30,  80, 130],
       [ 80, 255, 430],
       [130, 430, 730]])

**Mi ejemplo para esta sección:**

In [None]:
data = np.arange(35).reshape(7, 5)
print("data:\n", data, "\ndata^2:\n", data * data, "\nsqrt(data):\n", np.sqrt(data))


data:
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]] 
data^2:
 [[   0    1    4    9   16]
 [  25   36   49   64   81]
 [ 100  121  144  169  196]
 [ 225  256  289  324  361]
 [ 400  441  484  529  576]
 [ 625  676  729  784  841]
 [ 900  961 1024 1089 1156]] 
sqrt(data):
 [[0.         1.         1.41421356 1.73205081 2.        ]
 [2.23606798 2.44948974 2.64575131 2.82842712 3.        ]
 [3.16227766 3.31662479 3.46410162 3.60555128 3.74165739]
 [3.87298335 4.         4.12310563 4.24264069 4.35889894]
 [4.47213595 4.58257569 4.69041576 4.79583152 4.89897949]
 [5.         5.09901951 5.19615242 5.29150262 5.38516481]
 [5.47722558 5.56776436 5.65685425 5.74456265 5.83095189]]


In [None]:
data == 13

array([[False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False]])

In [None]:
((data == 13) | (data >= 17))

array([[False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

In [None]:
data @ data.T

array([[  30,   80,  130,  180,  230,  280,  330],
       [  80,  255,  430,  605,  780,  955, 1130],
       [ 130,  430,  730, 1030, 1330, 1630, 1930],
       [ 180,  605, 1030, 1455, 1880, 2305, 2730],
       [ 230,  780, 1330, 1880, 2430, 2980, 3530],
       [ 280,  955, 1630, 2305, 2980, 3655, 4330],
       [ 330, 1130, 1930, 2730, 3530, 4330, 5130]])

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [None]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

array([0.        , 0.34185385, 0.64251645, 0.86575984, 0.98468459,
       0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [None]:
# Slicing works with the same semantics as Python lists.
sines[0]

0.0

In [None]:
sines[:3]  # First three elements  

array([0.        , 0.34185385, 0.64251645])

In [None]:
sines[5:]  # Elements from 5 on.

array([0.98496101, 0.8665558 , 0.64373604, 0.34335012, 0.00159265])

In [None]:
sines[::2]  # Every other element.

array([0.        , 0.64251645, 0.98468459, 0.8665558 , 0.34335012])

In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

sines:
 [0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]
sines > 0.5:
 [False False  True  True  True  True  True  True False False]
sines[sines > 0.5]:
 [0.64251645 0.86575984 0.98468459 0.98496101 0.8665558  0.64373604]


In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

[0.         0.34185385 0.64251645 0.86575984 0.98468459 0.98496101
 0.8665558  0.64373604 0.34335012 0.00159265]


array([0.        , 0.98468459, 0.64373604])

In [None]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [None]:
sort_indices = np.argsort(unsorted_data)
sort_indices

array([4, 0, 2, 6, 1, 5, 3])

In [None]:
unsorted_data[sort_indices]

array([-1,  1,  2,  2,  3,  5, 12])

In [None]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [None]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

array(['D', 'B', 'E', 'C', 'A'], dtype='<U1')

In [None]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

Dates:
 array(['2017-01-06', '2017-01-07', '2017-01-08'], dtype='datetime64[D]')
Values:
 array([10, 15, 20])
Calendar:
 array(['2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06',
       '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
       '2017-01-13', '2017-01-17', '2017-01-18', '2017-01-19',
       '2017-01-20', '2017-01-23', '2017-01-24', '2017-01-25',
       '2017-01-26', '2017-01-27', '2017-01-30', '2017-01-31',
       '2017-02-01'], dtype='datetime64[D]')


In [None]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

Raw Dates: ['2017-01-06' '2017-01-07' '2017-01-08']
Indices: [3 4 4]
Forward-Filled Dates: ['2017-01-06' '2017-01-09' '2017-01-09']


On multi-dimensional arrays, we can slice along each axis independently.

In [None]:
data = np.arange(25).reshape(5, 5)
data

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [None]:
data[:2, :2]  # First two rows and first two columns.

array([[0, 1],
       [5, 6]])

In [None]:
data[:2, [0, -1]]  # First two rows, first and last columns.

array([[0, 4],
       [5, 9]])

In [None]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24]])

**Mi ejemplo para esta sección:**

In [None]:
randomExp = np.random.exponential(1.0, 100)
sinesEXP = np.sin(randomExp)
cosinesEXP = np.cos(randomExp)
print(sinesEXP)
print(cosinesEXP)

[ 0.93749752  0.29488403  0.79857738  0.41261712  0.55220394 -0.36664795
  0.34933392  0.26571172  0.4749245   0.17982699  0.99471959  0.84116383
  0.49860042  0.19351211 -0.02488953  0.85192143  0.97435584  0.25381946
  0.51646246  0.78968299  0.65213117  0.96584414  0.11940884  0.91679149
  0.13686539  0.10361452  0.7719036   0.933681    0.8707708   0.10227904
  0.16514078  0.05604701 -0.97304275  0.31293512  0.99456032  0.26714593
  0.33937646  0.98584286  0.06131392  0.89378316  0.35036273  0.00259698
  0.99996289  0.82121919  0.41503434  0.33708183  0.50617502  0.32664484
  0.60268925  0.87306622  0.81165593  0.26422782  0.38466331  0.07064077
  0.68917002  0.43369409  0.99197067  0.24005151  0.98928872  0.45398122
  0.99997885  0.30461167  0.9006281   0.4890173   0.51235885  0.10037764
  0.30248966  0.91164147  0.23859847  0.95741754  0.07268826  0.00300012
  0.29318075  0.74273443  0.7484638   0.14934786  0.99576615  0.23214768
  0.55167263  0.57829792  0.50257425  0.80813953  0

In [None]:
sort_sines = np.argsort(sinesEXP)
sort_cosines = np.argsort(cosinesEXP)
print(sort_sines)
print(sort_cosines)

[32  5 95 14 41 71 83 31 38 53 70 88 65 29 89 25 22 24 75 30  9 13 98 87
 77 68 57 86 17 51  7 35 85 72  1 66 61 33 47 45 36  6 40 52  3 44 55 59
  8 63 12 80 46 64 18 97 78  4 84 79 48 20 54 73 74 26 94 19  2 81 50 43
 11 93 15 28 49 96 91 39 62 67 23 27  0 82 92 99 69 21 16 37 58 56 34 10
 76 90 42 60]
[14 29 95 86 72  5 97 78 43 96 62 27  0 82 56 10 90 60 42 76 34 58 37 16
 32 21 69 99 92 23 67 39 91 49 28 15 93 11 50 81  2 19 94 26 74 73 54 20
 48 79 84  4 18 64 46 80 12 63  8 59 55 44  3 52 40  6 36 45 47 33 61 66
  1 85 35  7 51 17 57 68 77 87 98 13  9 30 75 24 22 25 89 65 88 70 53 38
 31 83 71 41]


In [None]:
print(sinesEXP[sort_sines])
print(cosinesEXP[sort_cosines])

[-0.97304275 -0.36664795 -0.14066242 -0.02488953  0.00259698  0.00300012
  0.03729009  0.05604701  0.06131392  0.07064077  0.07268826  0.08908111
  0.10037764  0.10227904  0.10338344  0.10361452  0.11940884  0.13686539
  0.14934786  0.16514078  0.17982699  0.19351211  0.1992103   0.20220571
  0.23214768  0.23859847  0.24005151  0.25089441  0.25381946  0.26422782
  0.26571172  0.26714593  0.27099925  0.29318075  0.29488403  0.30248966
  0.30461167  0.31293512  0.32664484  0.33708183  0.33937646  0.34933392
  0.35036273  0.38466331  0.41261712  0.41503434  0.43369409  0.45398122
  0.4749245   0.4890173   0.49860042  0.50257425  0.50617502  0.51235885
  0.51646246  0.52655434  0.55167263  0.55220394  0.57119819  0.57829792
  0.60268925  0.65213117  0.68917002  0.74273443  0.7484638   0.7719036
  0.78825224  0.78968299  0.79857738  0.80813953  0.81165593  0.82121919
  0.84116383  0.84623847  0.85192143  0.8707708   0.87306622  0.88353194
  0.88513544  0.89378316  0.9006281   0.91164147  0.

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sum_{i=1}^N (x_i - \bar{x})^2$

In [None]:
def variance(x):
    """Return the population variance of x: mean squared deviation from x.mean().

    The sum of squared deviations is divided by len(x).
    """
    deviations = x - x.mean()
    squared = deviations ** 2
    return squared.sum() / len(x)

In [None]:
variance(np.random.standard_normal(1000))

1.0149374473477994

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [None]:
data = np.arange(30)
data.mean()

14.5

- ...but we can do more interesting things with multi-dimensional arrays.

In [None]:
data = np.arange(30).reshape(3, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])

In [None]:
data.mean()

14.5

In [None]:
data.mean(axis=0)

array([10., 11., 12., 13., 14., 15., 16., 17., 18., 19.])

In [None]:
data.mean(axis=1)

array([ 4.5, 14.5, 24.5])

**Mi ejemplo para esta sección:**

In [None]:
def mean(x):
    """Return the arithmetic mean of x, computed as x.sum() divided by len(x)."""
    total = x.sum()
    count = len(x)
    return total / count

In [None]:
mean(np.random.standard_normal(1000))

0.027648236025910704

In [None]:
data = np.arange(3, 50).reshape(1, 47)
data.mean()

26.0

In [None]:
data.mean(axis=0)

array([ 3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15.,
       16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
       29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
       42., 43., 44., 45., 46., 47., 48., 49.])

In [None]:
data.mean(axis=1)

array([26.])

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [None]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

Row:
[1 2 3 4]
Column:
[[1]
 [2]
 [3]]


In [None]:
row + column

array([[2, 3, 4, 5],
       [3, 4, 5, 6],
       [4, 5, 6, 7]])

<center><img src="images/broadcasting.png" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [None]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data, sep='')
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

Data:
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]]
Mean:
[ 10.  11.  12.  13.  14.  15.  16.  17.  18.  19.]
Data - Mean:
[[-10. -10. -10. -10. -10. -10. -10. -10. -10. -10.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [ 10.  10.  10.  10.  10.  10.  10.  10.  10.  10.]]


**Mi ejemplo para esta sección:**

In [None]:
row = np.array([4, 7, 9, 10, 12, 11, 14])
column = np.array([[4], [7], [9], [10], [12], [11], [14]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

Row:
[ 4  7  9 10 12 11 14]
Column:
[[ 4]
 [ 7]
 [ 9]
 [10]
 [12]
 [11]
 [14]]


In [None]:
row + column

array([[ 8, 11, 13, 14, 16, 15, 18],
       [11, 14, 16, 17, 19, 18, 21],
       [13, 16, 18, 19, 21, 20, 23],
       [14, 17, 19, 20, 22, 21, 24],
       [16, 19, 21, 22, 24, 23, 26],
       [15, 18, 20, 21, 23, 22, 25],
       [18, 21, 23, 24, 26, 25, 28]])

In [None]:
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

Mean:
[ 3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20.
 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38.
 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49.]
Data - Mean:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead of dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithm to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="images/unicorn.jpg" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="images/panda-wrangling.gif" alt="Drawing" style="width: 75%;"/></center>

<center><img src="images/pandas_logo.png" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogeneously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogeneous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [None]:
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [None]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

The index is: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
The values are: [1 2 3 4 5]


In [None]:
# We can look up values out of a Series by position...
s.iloc[0]

1

In [None]:
# ... or by label.
s.loc['a']

1

In [None]:
# Slicing works as expected...
s.iloc[:2]

a    1
b    2
dtype: int64

In [None]:
# ...but it works with labels too!
s.loc[:'c']

a    1
b    2
c    3
dtype: int64

In [None]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

a    1
e    5
dtype: int64

In [None]:
# As does boolean masking.
s.loc[s > 2]

c    3
d    4
e    5
dtype: int64

In [None]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

a    10.0
c    20.0
d    30.0
z    40.0
dtype: float64

In [None]:
s + other_s

a    11.0
b     NaN
c    23.0
d    34.0
e     NaN
z     NaN
dtype: float64

In [None]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

a    11.0
b     0.0
c    23.0
d    34.0
e     0.0
z     0.0
dtype: float64

In [None]:
# Most real datasets are read in from an external file format.
# aapl = pd.read_csv('AAPL.csv', parse_dates=['Date'], index_col='Date')
# aapl.head()

[[1, 2, 3], [2, 3, 4], [5, 6, 7], [1, 1, 1]]

In [None]:
# Slicing generalizes to two dimensions as you'd expect:
# aapl.iloc[:2, :2]

In [None]:
# aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

# Rolling Operations

<center><img src="images/rolling.gif" alt="Drawing" style="width: 75%;"/></center>

In [None]:
# aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

In [None]:
# Drop `Volume`, since it's way bigger than everything else.
# aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
# 30-day rolling exponentially-weighted stddev of returns.
# aapl['Close'].pct_change().ewm(span=30).std().plot();

# "Real World" Data

In [None]:
# from demos.avocados import read_avocadata

#avocados = read_avocadata('2014', '2016')
#avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
# avocados.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
# hass = avocados[avocados.Variety == 'HASS']
# hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

In [None]:
def _organic_spread(group):

    if len(group.columns) != 2:
        return pd.Series(index=group.index, data=0.0)
    
    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    organics = group.loc[:, is_organic].squeeze()
    non_organics = group.loc[:, ~is_organic].squeeze()
    diff = organics - non_organics
    return diff

def organic_spread_by_region(df):
    """What's the difference between the price of an organic 
    and non-organic avocado within each region?

    Reads the 'Date', 'Region', 'Organic' and 'Weighted Avg Price' columns of
    `df` and applies `_organic_spread` to each region's group of columns.
    """
    prices = df.set_index(['Date', 'Region', 'Organic'])['Weighted Avg Price']
    # One row per date, one column per (Region, Organic) pair; gaps forward-filled.
    wide = prices.unstack(level=['Region', 'Organic']).ffill()
    # NOTE(review): `groupby(..., axis=1)` is reported as deprecated in recent
    # pandas releases -- confirm against the pandas version this notebook targets.
    by_region = wide.groupby(level='Region', axis=1)
    return by_region.apply(_organic_spread)

In [None]:
#organic_spread_by_region(hass).plot();
#plt.gca().set_title("Daily Regional Organic Spread");
#plt.legend(bbox_to_anchor=(1, 1));

In [None]:
#spread_correlation = organic_spread_by_region(hass).corr()
#spread_correlation

In [None]:
import seaborn as sns
#grid = sns.clustermap(spread_correlation, annot=True)
#fig = grid.fig
#axes = fig.axes
#ax = axes[2]
#ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

# Pandas Review

- Pandas extends numpy with more complex datastructures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!