<a href="https://colab.research.google.com/github/mkotemp/ghc2024-vectorization-workshop/blob/main/GHC_vec_speed_session.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### Utility helper functions for the vectorization workshop.###

import time

import numpy as np


def time_funcs(funcs, args, names, reps):
    """Returns dict of timing data for a list of functinos, arguments, and names.

    Args:
      funcs: list of functions to test
      args: list of arguments to the functions
      names: list of names of the functions. Will compare function with `slow` in the name vs those with `vec`
      reps: number of times to repeat the functions in computation of mean and std of timings

    Returns:
      timings: dict {name1:[time_elapsed, ...], name2:[time_elapsed, ...]..., }
      data
    """
    timings = {names[i]: [] for i in range(len(names))}
    data = {names[i]: None for i in range(len(names))}
    for j in range(reps):
        for i in range(len(funcs)):
            name = names[i]
            start = time.time()
            data[name] = funcs[i](*args[i])
            end = time.time()
            timings[name].append(end - start)
    timings["speedup"] = []
    for j in range(reps):
        slow_names = [name for name in names if "slow" in name]
        vec_names = [name for name in names if "vec" in name]
        timings["speedup"].append(
            max([timings[sn][j] for sn in slow_names])
            / min([timings[vn][j] for vn in vec_names])
        )
    return timings, data


def print_time_results(timings, data_size):
    """Given a timings object (output of time_funcs), print the results."""
    fstring = f'n_reps={len(timings["speedup"])}; data_size= {data_size} records\n'
    l = max([len(key) for key in timings.keys()])
    for key, value in timings.items():
        buff = l - len(key)
        if key == "speedup":
            fstring += f'{key}: {" "}{np.round(np.mean(value),1)}X  +/- {np.round(np.std(value),1)}X\n'
        else:
            fstring += f'{key}: {" "}{np.round(np.mean(value),4)}s +/- {np.round(np.std(value),4)}s\n'
    print(fstring)



In [2]:
"""Unoptimized functions to be vectorized."""

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q1: Convert a list to np.array


def convert_list_to_array(input_list: list):
    #pass  # insert your code here
    return np.array(input_list)


def make_test_list(size: int = 100):
    return [i for i in range(size)]


def test_convert_list(size: int = 1000):
    print("\n\nQ1: Running test_convert_list...\n")
    input = make_test_list(size)
    print('input',type(input),input)
    output = convert_list_to_array(input)
    print('mko',output)
    if output is not None:
        assert (
            type(output) == np.ndarray
            and len(input) == len(output)
            and np.all([input[i] == output[i] for i in range(len(input))])
        ), "Whoops! input and output do not match"
        print("  Success!")
    else:
        print("  convert_list_to_array is not implemented")



In [3]:
"""Testing script to run all functions."""

test_convert_list()



Q1: Running test_convert_list...

input <class 'list'> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 21

In [4]:
"""Unoptimized functions to be vectorized."""

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q2: Convert a dict to pd.DataFrame. Each key in the dict should become a column in the DataFrame.


def convert_dict_to_df(input_dict: dict):
    #pass  # insert your code here
    return pd.DataFrame(input_dict)

def make_test_dict(size: int = 100):
    return {
        chr(97 + i % 26) * (i // 26 + 1): np.random.rand(5) * 100 for i in range(size)
    }


def test_convert_dict(size: int = 1000):
    print("\n\nQ2: Running test_convert_dict...\n")
    input = make_test_dict(size)
    output = convert_dict_to_df(input)
    if output is not None:
        assert type(output) == pd.DataFrame and np.all(
            [key in output.columns for key in input.keys()]
        ), "Whoops! input and output do not match"
        print("  Success!")
    else:
        print("  convert_dict_to_df is not implemented")


In [5]:
test_convert_dict()



Q2: Running test_convert_dict...

  Success!


In [6]:

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q3: Write a vectorized function vec_power which has the same arguments are returns as slow_power.
#     Run the function on data1 and report your speedup on 20 reps


def slow_power(x, m=4):
    out = []
    for x_i in x:
        out.append(x_i**m)
    return np.array(out)


def vec_power(x, m=4):
    #pass  # insert your new code here
    return x**m


def make_data1(size=1000):
    np.random.seed(4)
    array1 = np.random.rand(size, 1)  # np.array
    df1 = pd.DataFrame(array1)  # pd.DataFrame
    return df1


def test_power(size=1000):
    print("\n\nQ3: Running test_power...\n")
    input = make_data1(size=size)
    output_slow = pd.DataFrame(slow_power(input[0]))
    output_vec_raw = vec_power(input[0])
    if output_vec_raw is not None:
        output_vec = pd.DataFrame(output_vec_raw)
        pd.testing.assert_frame_equal(output_slow, output_vec, check_dtype=False)
        print("  Success!")
        timings, _ = time_funcs(
            [slow_power, vec_power],
            [[input[0]], [input[0]]],
            ["slow_power", "vec_power"],
            reps=20,
        )
        print_time_results(timings, size)
    else:
        print("  vec_power is not implemented")
    print(output_slow)
    print(output_vec)



In [7]:

  test_power()



Q3: Running test_power...

  Success!
n_reps=20; data_size= 1000 records
slow_power:  0.0005s +/- 0.0002s
vec_power:  0.0003s +/- 0.0003s
speedup:  1.8X  +/- 0.4X

            0
0    0.874499
1    0.089678
2    0.895133
3    0.261082
4    0.236999
..        ...
995  0.057511
996  0.073116
997  0.570789
998  0.002143
999  0.030919

[1000 rows x 1 columns]
            0
0    0.874499
1    0.089678
2    0.895133
3    0.261082
4    0.236999
..        ...
995  0.057511
996  0.073116
997  0.570789
998  0.002143
999  0.030919

[1000 rows x 1 columns]


In [11]:

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q4: Write a vectorized function, vec_addition, which adds two vectors of the same size


def slow_addition(arr1, arr2):
    assert len(arr1) == len(arr2)
    out = [a1 + a2 for a1, a2 in zip(arr1, arr2)]
    return np.array(out)


def vec_addition(arr1, arr2):
    #pass  # insert your code here
    return np.sum([arr1,arr2],axis=0)


def make_data1(size=1000):
    np.random.seed(4)
    array1 = np.random.rand(size, 1)  # np.array
    df1 = pd.DataFrame(array1)  # pd.DataFrame
    return df1


def test_addition(size=1000):
    print("\n\nQ4: Running test_addition...\n")
    input_1 = make_data1(size=size)
    input_2 = make_data1(size=size)
    output_slow = pd.DataFrame(slow_addition(input_1[0], input_2[0]))
    output_vec = vec_addition(input_1[0], input_2[0])
    #print('slow',output_slow,'\n')
    #print('vec',output_vec)
    if output_vec is not None:
        output_vec_df = pd.DataFrame(output_vec)
        pd.testing.assert_frame_equal(output_slow, output_vec_df, check_dtype=False)
        timings, _ = time_funcs(
            [slow_addition, vec_addition],
            [(input_1[0], input_2[0]), (input_1[0], input_2[0])],
            ["slow_addition", "vec_addition"],
            reps=20,
        )
        print_time_results(timings, size)
    else:
        print("  vec_addition is not implemented")



In [12]:
test_addition()



Q4: Running test_addition...

n_reps=20; data_size= 1000 records
slow_addition:  0.0007s +/- 0.0006s
vec_addition:  0.0003s +/- 0.0009s
speedup:  5.9X  +/- 4.4X



In [17]:

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q5: Write a vectorized function, vec_grade, which has the same arguments and return as slow_grade.
# Note that when evaluating multiple boolean conditions over a vector, bitwise operators must be used
# Example: indices_of_nums_bt_1_and_5 = (some_other_vector > 1) & (some_other_vector < 5)


def slow_grade(grades):
    letter_grades = []
    for grade in grades:
        if grade >= 90:
            letter_grades.append("A")
        elif 80 <= grade < 90:
            letter_grades.append("B")
        elif 70 <= grade < 80:
            letter_grades.append("C")
        elif grade < 70:
            letter_grades.append("F")

    return np.array(letter_grades)


def vec_grade(grades):
    #pass  # insert your code here
    return np.where(grades<70,'F',np.where(grades<80,'C',np.where(grades<90,'B','A')) )


def random_grades(num_grades: int):
    return np.random.randint(0, 100, size=num_grades)


def test_grades(num_grades: int = 1000):
    print("\n\nQ5: Running test_grades...\n")
    input = random_grades(num_grades)
    output_slow = pd.DataFrame(slow_grade(input))
    output_vec = vec_grade(input)
    if output_vec is not None:
        output_vec_df = pd.DataFrame(output_vec)
        pd.testing.assert_frame_equal(output_slow, output_vec_df, check_dtype=False)
        timings, _ = time_funcs(
            [slow_grade, vec_grade],
            [[input], [input]],
            ["slow_grade", "vec_grade"],
            reps=20,
        )
        print_time_results(timings, num_grades)
    else:
        print("  vec_grade is not implemented")



In [18]:
test_grades()



Q5: Running test_grades...

n_reps=20; data_size= 1000 records
slow_grade:  0.0008s +/- 0.0003s
vec_grade:  0.0001s +/- 0.0001s
speedup:  17.5X  +/- 4.6X



In [37]:

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q6: Write a vectorized function, vec_pass_fail, which has the same arguments and returns as slow_pass_fail.
# (Hint: try using np.where and np.isin)

#this function is wrong...we are inputting in numbers below, but function is expecting a letter like from above step
def slow_pass_fail(grades):
    pass_fail_grades = []
    for grade in grades:
        if grade == "A":
            pass_fail_grades.append("P")
        elif grade == "B":
            pass_fail_grades.append("P")
        elif grade == "C":
            pass_fail_grades.append("P")
        else:
            pass_fail_grades.append("F")
    return np.array(pass_fail_grades)


def vec_pass_fail(grades):
    #pass  # insert your code here
    return np.where(np.isin(grades, ["A","B","C"]),"P","F")

def random_grades(num_grades: int):
    return np.random.randint(0, 100, size=num_grades)

#MKO NOTE: this function is wrong...we are inputting in numbers below, but function is expecting a letter like from above step
def test_pass_fail(num_grades: int = 1000):
    print("\n\nQ6: Running test_pass_fail...\n")

    input = random_grades(num_grades)
    #MKO NOTE: Code has error in it - I believe they wanted the ABCF letters to go into this new P/F function
    #output_slow = pd.DataFrame(slow_pass_fail(input))
    output_slow = pd.DataFrame(slow_pass_fail(slow_grade(input)))
    #output_vec = vec_pass_fail(input)
    output_vec = vec_pass_fail(vec_grade(input))
    if output_vec is not None:
        output_vec_df = pd.DataFrame(output_vec)
        pd.testing.assert_frame_equal(output_slow, output_vec_df, check_dtype=False)
        timings, _ = time_funcs(
            [slow_pass_fail, vec_pass_fail],
            [[input], [input]],
            ["slow_pass_fail", "vec_pass_fail"],
            reps=20,
        )
        print_time_results(timings, num_grades)
    else:
        print("  vec_pass_fail is not implemented")



In [38]:
test_pass_fail()




Q6: Running test_pass_fail...

n_reps=20; data_size= 1000 records
slow_pass_fail:  0.0057s +/- 0.0002s
vec_pass_fail:  0.0001s +/- 0.0s
speedup:  107.8X  +/- 45.8X



In [157]:

import math
import time

import numpy as np
import pandas as pd

#from util import print_time_results, time_funcs

# Q7: Write a vectorized function vec_insurance_factor which has the same arguments are returns as slow_insurance_factor_wrapper.


def slow_insurance_factor(car_df: pd.DataFrame):

    def slow_insurance_factor_row(year: int, color: str, model: str, mileage: float):
        """Return insurance factor based on car attributes."""
        factor = 1
        if year < 2000:
            factor *= 4
        elif year < 2015:
            factor *= 3
        elif year < 2020:
            factor *= 2
        if color in ["Red"]:
            factor *= 2
        if color == "Blue":
            factor *= 1.5
        factor += max((mileage - 100000) // 1000, 1)
        return factor

    return car_df.apply(
        lambda row: slow_insurance_factor_row(
            row["year"], row["color"], row["model"], row["mileage"]
        ),
        axis=1,
    )


def vec_insurance_factor(car_df: pd.DataFrame):
    #pass  # insert your code here
    return np.sum(np.stack((np.multiply(np.where(car_df["year"]<2000,4,np.where(car_df["year"]<2015,3,np.where(car_df["year"]<2020,2,1))),
                                        np.where(np.isin(car_df["color"],["Red"]),2,np.where(np.isin(car_df["color"],["Blue"]),1.5,1))),
                            np.max(np.column_stack(((car_df['mileage'] - 100000) // 1000,
                                                       np.ones(len(car_df)))),axis=1)),axis=1),axis=1)


  #  return np.sum(myp,axis=1)

  #  return np.sum(np.multipy(p1,p2,axis=1),p3,axis=1)
 #   return np.sum(
 #       np.multiply(
 #         np.where(car_df[0]<2000,4,np.where(car_df[0]<2015,3,np.where(car_df[0]<2020,2,1)))
 #         ,
 #         np.where(np.isin(car_df[1],["Red"]),2,np.isin(car_df[1],["Blue"],1.5,1))
  #        ,axis=1),
 #       max((car_df[3] - 100000) // 1000, 1),
 #       axis=1)


class Car:
    def __init__(self, model: str, year: int, color: str, mileage: float):
        self.model = model
        self.year = year
        self.color = color
        self.mileage = mileage


def make_car_list(n=1000):
    models = ["Sienna", "Corolla", "Forester", "Jetta", "Civic", "Escape", "Escalade"]
    colors = ["Black", "White", "Red", "Grey", "Blue"]
    years = range(1980, 2025)
    return [
        Car(
            model=np.random.choice(models),
            color=np.random.choice(colors),
            year=np.random.choice(years),
            mileage=np.random.random() * 200000,
        )
        for i in range(n)
    ]


def make_car_data(size: int = 1000):
    cars = make_car_list(n=size)
    df = pd.concat(
        [
            pd.DataFrame.from_dict(cars[i].__dict__, orient="index").transpose()
            for i in range(len(cars))
        ]
    )
    df.set_index(np.array(range(0,len(df))),inplace=True)
    return df


def test_insurance_factor(size: int = 1000):
    print("\n\nQ7: Running test_insurance_factor...\n")
    input = make_car_data(size)
    output_slow = pd.DataFrame(slow_insurance_factor(input))
    print('output_slow=',output_slow)
    output_vec_raw = vec_insurance_factor(input)
    print('output_vec_raw=',pd.DataFrame(output_vec_raw))
    if output_vec_raw is not None:
        output_vec = pd.DataFrame(output_vec_raw)
        pd.testing.assert_frame_equal(output_slow, output_vec, check_dtype=False)
        timings, _ = time_funcs(
            [slow_insurance_factor, vec_insurance_factor],
            [[input], [input]],
            ["slow_insurance_factor", "vec_insurance_factor"],
            reps=20,
        )
        print_time_results(timings, size)
    else:
        print("  vec_insurance_factor is not implemented")


In [158]:
test_insurance_factor()



Q7: Running test_insurance_factor...

output_slow=          0
0     11.0
1      5.0
2     34.0
3      5.0
4      9.0
..     ...
995   69.0
996   48.0
997    4.0
998    5.0
999  101.0

[1000 rows x 1 columns]
output_vec_raw=          0
0     11.0
1      5.0
2     34.0
3      5.0
4      9.0
..     ...
995   69.0
996   48.0
997    4.0
998    5.0
999  101.0

[1000 rows x 1 columns]
n_reps=20; data_size= 1000 records
slow_insurance_factor:  0.0145s +/- 0.0012s
vec_insurance_factor:  0.0018s +/- 0.0002s
speedup:  7.9X  +/- 0.6X

