# Performance 3

In [None]:
# known import statements
import pandas as pd
import csv
from subprocess import check_output

# new import statements
import zipfile
from io import TextIOWrapper

### Let's take a look at the files inside the current working directory.

In [None]:
str(check_output(["ls", "-lh"]), encoding="utf-8").split("\n")

### Let's `unzip` "wi.zip".

In [None]:
check_output(["unzip", "wi.zip"])

### Let's take a look at the files inside the current working directory.

In [None]:
str(check_output(["ls", "-lh"]), encoding="utf-8").split("\n")

### Traditional way of reading data using pandas

In [None]:
df = pd.read_csv("wi.csv")

In [None]:
df.head(5) # Top 5 rows within the DataFrame

### How can we see all the column names?

In [None]:
df.columns

### How to extract `interest_rate`?

In [None]:
df["interest_rate"] # observe that there are missing values

### How to count unique values in a column `Series`?

In [None]:
df["interest_rate"].value_counts()

### Let's eliminate the strings (Exempt) and missing values (NaN).
Let's try `pd.to_numeric(...)`. We need a way to specify that strings need to be converted into NaN values.

In [None]:
pd.to_numeric(df["interest_rate"]) 
# TODO: open the documentation and figure out what parameter will help us
# Recall that we can press shift + tab after a function name to open the documentation

In [None]:
pd.to_numeric(df["interest_rate"], errors="coerce")

### Let's drop the NaN values and compute average interest rate.

In [None]:
pd.to_numeric(df["interest_rate"], errors="coerce").dropna()

In [None]:
pd.to_numeric(df["interest_rate"], errors="coerce").dropna().mean()

### How can we read the data without creating an uncompressed version called "wi.csv"?

- Why would we want to do something like that?
    1. lower memory usage (we can try to load information on one loan at a time, instead of all the loans): that will still work for average interest rate computation
    2. lower storage usage (you can directly work with compressed data)

In [None]:
# pd.read_csv accepts an already-open file object as well as a path.
# The `with` block guarantees the file is closed even if parsing raises.
with open("wi.csv") as f:
    df = pd.read_csv(f) # instead of passing relative path of file name, we can pass a file object instance reference

### Let's clear the memory and delete "wi.csv".
In Python, you can free the memory used by an object simply by getting rid of all the active references to it.

In [None]:
df = "some_string" # rebinding df drops this reference to the DataFrame, so its memory can be reclaimed once no other references remain

In [None]:
check_output(["rm", "wi.csv"])
str(check_output(["ls", "-lh"]), encoding="utf-8").split("\n")

### How can we read data directly from a zip file?
`zipfile.ZipFile(...)`

### Goals:
1. directly access the data without decompressing: `zipfile.ZipFile(...)` - saves storage space by directly opening a zip file
2. only look at one row at a time: `csv.DictReader(...)` - saves memory space by enabling us to read one row at a time (as `dict`)

In [None]:
# code for goal 1
# zf.open(...) hands back a file object for the "wi.csv" member of the
# archive, which pd.read_csv can consume directly.
# The `with` statement closes the member file and then the archive, even if
# read_csv raises part-way through.
with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
    df = pd.read_csv(f)

In [None]:
# code for goal 2 & goal 1
# NOTE: zf.open(...) always yields bytes, while csv.DictReader needs lines of
# str, so iterating the reader below is expected to raise an error.
# Using `with` means the member file and the archive are still closed when
# that error propagates (explicit close() calls after the loop would be skipped).
with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
    reader = csv.DictReader(f)

    for row in reader:
        print(row)
        break

### Let's learn more modes for the built-in `open` function
- `open(..., mode="r")`   => text (default)
- `open(..., mode="rb")`  => bytes
- `zf.open(...)`          => always bytes

With the `zipfile` module, there isn't a way for us to specify that we need text.

### `TextIOWrapper` inside `io` module enables us to convert `bytes` into `str`

In [None]:
# code for goal 2 & goal 1
# TextIOWrapper decodes the bytes coming out of the zip member into str,
# which is what csv.DictReader expects.
# The `with` statement closes the member file and the archive even if
# reading fails.
with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
    reader = csv.DictReader(TextIOWrapper(f))

    # Only the first row is needed to inspect the dict that DictReader builds.
    for row in reader:
        print(row)
        break

### Let's go back to calculating average interest rate.
- Algorithm / Pseudocode steps:
    1. print "interest rate" and type of "interest rate"
    2. convert "interest rate" into `float` - how can we handle errors? `try` ... `except` ... (*IMPORTANT*: always have your `except` block catch specific exceptions)
    3. calculate running total, count for each row of data
    4. calculate average

In [None]:
# Running total and count, so only one row is held in memory at a time.
total = 0
count = 0

# `with` closes the member file and the archive even if a row fails to parse
# in an unexpected way.
with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
    reader = csv.DictReader(TextIOWrapper(f))

    for row in reader:
        # Keep the try body to the one call that can raise ValueError.
        try:
            rate = float(row["interest_rate"])
        except ValueError:
            continue # skip non-numeric entries such as "Exempt" or ""
        total += rate
        count += 1

# Raises ZeroDivisionError if no row had a numeric interest rate.
total / count

### Let's generalize the code to read "interest rate" into a function.

- This does make things worse because we are going back to reading all the data before doing the computation.
- But this sets us up to learn about generators.

In [None]:
def get_rates_v1():
    """Return all numeric interest rates from "wi.csv" inside "wi.zip".

    The CSV is read straight out of the archive, one row at a time, and
    every ``interest_rate`` value that parses as a float is collected.

    Returns:
        list[float]: the parsed rates, in file order. Rows whose
        ``interest_rate`` is non-numeric (e.g. "Exempt" or empty) or
        missing because the row is too short are skipped.
    """
    rates = []

    # `with` closes the member file and the archive even if reading raises.
    with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
        # zf.open yields bytes; TextIOWrapper decodes them for csv.
        for row in csv.DictReader(TextIOWrapper(f)):
            try:
                rates.append(float(row["interest_rate"]))
            except (TypeError, ValueError):
                # ValueError: text that is not a number.
                # TypeError: DictReader filled a short row with None.
                continue

    return rates

rates = get_rates_v1()
sum(rates) / len(rates)

### Using a generator
- `yield` each value
- use `next` to get the next value => internally `for` loop invokes `next` for each iteration

In [None]:
def get_rates_v2():
    """Yield numeric interest rates from "wi.csv" inside "wi.zip", one at a time.

    Generator version of the rate reader: nothing is read until the first
    ``next`` call, and only one row is held in memory at a time.

    Yields:
        float: each ``interest_rate`` value that parses as a number, in file
        order. Non-numeric values (e.g. "Exempt" or empty) and rows too
        short to contain the column are skipped.
    """
    print("Starting generator")

    # The `with` block releases the member file and the archive when the
    # generator is exhausted, and also when it is closed or discarded early.
    with zipfile.ZipFile("wi.zip") as zf, zf.open("wi.csv") as f:
        # zf.open yields bytes; TextIOWrapper decodes them for csv.
        for row in csv.DictReader(TextIOWrapper(f)):
            try:
                rate = float(row["interest_rate"])
            except (TypeError, ValueError):
                # ValueError: text that is not a number.
                # TypeError: DictReader filled a short row with None.
                continue
            # Yield outside the try so an exception thrown in by the consumer
            # is never mistaken for a parse failure.
            yield rate

rates = get_rates_v2()

In [None]:
next(rates) # gives us the next value

In [None]:
next(rates) # gives us the next value

In [None]:
next(rates) # gives us the next value

### Let's use `for` loop to keep getting all the rates.

- `len` function doesn't work with generators
- indexing doesn't work with generators

In [None]:
len(rates)

In [None]:
rates[4]

In [None]:
rates = get_rates_v2()

total = 0
count = 0

for rate in rates: # keeps calling next(rates) to get values from yield
    total += rate
    count += 1
    
total / count

This approach doesn't work for median calculation. Why? Remember we have to sort, so we need all values in memory.

In [None]:
rates = list(get_rates_v2())
rates.sort()

# OOP 1: Classes

Classes enable us to create custom types. Attributes within the class will be used to store information about each object instance.

### Let's create a `Dog` class.

In [None]:
class Dog:
    """Bare custom type; attributes are attached to each instance after it is created."""
    # eventually we will learn how to write code inside a class

### Let's create `Dog` object instances and add attributes.

In [None]:
dog1 = Dog()
dog1.name = "Jimmy"
dog1.age = 2

In [None]:
dog2 = Dog()
dog2.name = "Buster"

### Let's define a `speak` function that will make the `Dog` bark.
- Algorithm / pseudocode steps:
    1. puppies bark thrice (age < 2)
    2. dogs bark once

In [None]:
def speak(dog):
    """Print the dog's bark, prefixed by its name.

    Puppies (``dog.age`` below 2) bark three times; older dogs bark once.
    The dog must already have ``age`` and ``name`` attributes.
    """
    is_puppy = dog.age < 2
    if not is_puppy:
        print(f"{dog.name}: bark!")
        return
    print(f"{dog.name}: bark bark bark!")

### Let's invoke `speak` for dog1 and dog2.

In [None]:
speak(dog1)

In [None]:
speak(dog2)

### How can we standardize the attribute initialization to avoid bugs?

- Eventually we will learn about how to define methods inside the class, which will include `__init__` method.
- For now, let's define an `init` function.

In [None]:
def init(dog, name, how_old):
    """Give ``dog`` its standard attributes in one place.

    Stores ``name`` as ``dog.name`` and ``how_old`` as ``dog.age`` so every
    dog set up through this function has both attributes.
    """
    dog.name, dog.age = name, how_old

In [None]:
dog2 = Dog()
init(dog2, "Spark", 10)
speak(dog2)