In [2]:
import random
import requests
import pandas as pd

In [9]:
# Download the station list; each non-blank line is treated as one station ID.
# requests has no default timeout, so without one this cell could hang forever
# if the server stops responding.
r = requests.get("https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt", timeout=30)
r.raise_for_status()
# splitlines() also copes with "\r\n" line endings; blank lines are dropped.
stations = [line.strip() for line in r.text.splitlines() if line.strip()]

# Use a dedicated, seeded generator so Restart & Run All rebuilds the same
# workload (and therefore comparable hit rates) without touching the global
# `random` state.
WORKLOAD_SEED = 544
rng = random.Random(WORKLOAD_SEED)

# Keep 10 stations, then draw 100 requests with a skewed popularity:
# the first station gets 30% of requests, the second 20%, and the
# remaining eight share the other 50% equally.
stations = rng.sample(stations, k=10)
workload = rng.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

In [8]:
# Peek at the first ten requests of the generated workload.
workload[:10]

['US1WIPP0001',
 'US1WIPP0001',
 'USC00470516',
 'US1WIPP0001',
 'US1WIDA0063',
 'USW00004891',
 'USC00473332',
 'USC00473332',
 'US1WIPP0001',
 'US1WIPP0001']

In [30]:
import time

# Demo: measure how long a piece of code takes, in milliseconds.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock, which can be adjusted while
# the measurement is running.
start = time.perf_counter()
time.sleep(2)
end = time.perf_counter()
print((end-start)*1000, "ms")

2000.3397464752197 ms


In [34]:
import numpy as np

In [35]:
# Sanity check of np.quantile: the 0.5 quantile (median) of [1, 2, 3].
np.quantile([1,2,3], 0.5)

np.float64(2.0)

In [40]:
# FIFO (first-in, first-out) cache state
cache_size = 3          # most stations the cache may hold after a request
cache = dict()          # station name -> DataFrame with weather
cache_order = list()    # insertion order: leftmost is evicted first, rightmost is newest

# Per-request measurements, one entry appended by every get_station() call
hits, latency_ms = list(), list()   # hits: 1 for hit, 0 for miss; latency_ms: elapsed ms

def get_station(station):
    """Return the weather DataFrame for `station`, going through a FIFO cache.

    On a hit the DataFrame stored in the global `cache` is returned and the
    eviction order is left untouched.  On a miss the station's gzipped CSV is
    downloaded with pandas, stored in `cache`, and the station is appended to
    `cache_order`; if the cache then holds more than `cache_size` entries, the
    station that was inserted earliest (leftmost in `cache_order`) is evicted.

    Side effects: every call appends 1 (hit) or 0 (miss) to `hits` and the
    elapsed time of the call, in milliseconds, to `latency_ms`.

    Parameters
    ----------
    station
        Station ID; it is interpolated into the download URL.

    Returns
    -------
    pandas.DataFrame
        The station's data with columns named
        station, date, element, value, m, q, s, obs.

    Any exception raised by `pd.read_csv` propagates; in that case nothing is
    recorded in `hits`, `latency_ms`, or the cache.
    """
    # perf_counter() is monotonic and high resolution, which matters here
    # because cache hits finish in a tiny fraction of a millisecond;
    # time.time() tracks the wall clock and can jump if the clock is adjusted.
    start = time.perf_counter()
    if station in cache:
        #print("HIT", station)
        df = cache[station]
        hits.append(1)
    else:
        #print("MISS", station)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                    names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)
        cache[station] = df
        cache_order.append(station)
        hits.append(0)

        # Insert first, then evict: the cache briefly holds cache_size + 1 entries.
        if len(cache) > cache_size:
            #print("EVICT!")
            victim = cache_order.pop(0) # pop from the left.  This is slow!  O(N)
            cache.pop(victim)
    #print("CACHE STATE:", cache_order)
    end = time.perf_counter()
    latency_ms.append((end-start)*1000)
    return df

# Replay the whole workload through the cache; only the recorded
# hits and latencies are used below.
for station in workload:
    df = get_station(station)

# Summarize cache effectiveness and per-request latency (milliseconds).
n_requests = len(hits)
n_hits = sum(hits)
print("HITS", n_hits)
print("HIT RATE", n_hits / n_requests)

total_latency_ms = sum(latency_ms)
print("AVG LATENCY", total_latency_ms / len(latency_ms))

# Median and 99th-percentile latency; only the tail latency is printed.
p50, p99 = np.quantile(latency_ms, [0.5, 0.99])
print("p99 latency", p99)

HITS 79
HIT RATE 0.79
AVG LATENCY 4.649014472961426
p99 latency 30.29000997543379


In [41]:
# LRU (least recently used) cache state
cache_size = 3          # most stations the cache may hold after a request
cache = dict()          # station name -> DataFrame with weather
cache_order = list()    # recency order: leftmost is evicted first, rightmost was used last

# Per-request measurements, one entry appended by every get_station() call
hits, latency_ms = list(), list()   # hits: 1 for hit, 0 for miss; latency_ms: elapsed ms

def get_station(station):
    """Return the weather DataFrame for `station`, going through an LRU cache.

    NOTE: this redefines the FIFO `get_station` from the earlier cell and
    uses the same global names (`cache`, `cache_order`, `hits`, `latency_ms`).

    On a hit the DataFrame stored in the global `cache` is returned and the
    station is moved to the right end of `cache_order` (most recently used).
    On a miss the station's gzipped CSV is downloaded with pandas, stored in
    `cache`, and appended to `cache_order`; if the cache then holds more than
    `cache_size` entries, the leftmost (least recently used) station is
    evicted.

    Side effects: prints a HIT/MISS/EVICT trace and the cache order, and on
    every call appends 1 (hit) or 0 (miss) to `hits` and the elapsed time of
    the call, in milliseconds, to `latency_ms`.  The print calls sit inside
    the timed region, so they are included in the recorded latency.

    Parameters
    ----------
    station
        Station ID; it is interpolated into the download URL.

    Returns
    -------
    pandas.DataFrame
        The station's data with columns named
        station, date, element, value, m, q, s, obs.

    Any exception raised by `pd.read_csv` propagates; in that case nothing is
    recorded in `hits`, `latency_ms`, or the cache.
    """
    # perf_counter() is monotonic and high resolution, which matters here
    # because cache hits finish in a tiny fraction of a millisecond;
    # time.time() tracks the wall clock and can jump if the clock is adjusted.
    start = time.perf_counter()
    if station in cache:
        print("HIT", station)
        df = cache[station]
        hits.append(1)

        # Mark as most recently used: move the station to the right end.
        # list.remove() is a linear scan, fine for a cache this small.
        cache_order.remove(station)
        cache_order.append(station)
    else:
        print("MISS", station)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                    names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)
        cache[station] = df
        cache_order.append(station)
        hits.append(0)

        # Insert first, then evict: the cache briefly holds cache_size + 1 entries.
        if len(cache) > cache_size:
            print("EVICT!")
            victim = cache_order.pop(0) # pop from the left.  This is slow!  O(N)
            cache.pop(victim)
    print("CACHE STATE:", cache_order)
    end = time.perf_counter()
    latency_ms.append((end-start)*1000)
    return df

# Replay only the first ten requests so the printed trace stays readable.
for station in workload[:10]:
    df = get_station(station)

# Summarize cache effectiveness and per-request latency (milliseconds).
n_requests = len(hits)
n_hits = sum(hits)
print("HITS", n_hits)
print("HIT RATE", n_hits / n_requests)

total_latency_ms = sum(latency_ms)
print("AVG LATENCY", total_latency_ms / len(latency_ms))

# Median and 99th-percentile latency; only the tail latency is printed.
p50, p99 = np.quantile(latency_ms, [0.5, 0.99])
print("p99 latency", p99)

MISS US1WIBR0038
CACHE STATE: ['US1WIBR0038']
MISS US1WIDD0006
CACHE STATE: ['US1WIBR0038', 'US1WIDD0006']
MISS US1WIWD0004
CACHE STATE: ['US1WIBR0038', 'US1WIDD0006', 'US1WIWD0004']
HIT US1WIWD0004
CACHE STATE: ['US1WIBR0038', 'US1WIDD0006', 'US1WIWD0004']
HIT US1WIDD0006
CACHE STATE: ['US1WIBR0038', 'US1WIWD0004', 'US1WIDD0006']
HIT US1WIBR0038
CACHE STATE: ['US1WIWD0004', 'US1WIDD0006', 'US1WIBR0038']
HIT US1WIDD0006
CACHE STATE: ['US1WIWD0004', 'US1WIBR0038', 'US1WIDD0006']
MISS US1WIOZ0009
EVICT!
CACHE STATE: ['US1WIBR0038', 'US1WIDD0006', 'US1WIOZ0009']
HIT US1WIDD0006
CACHE STATE: ['US1WIBR0038', 'US1WIOZ0009', 'US1WIDD0006']
HIT US1WIDD0006
CACHE STATE: ['US1WIBR0038', 'US1WIOZ0009', 'US1WIDD0006']
HITS 6
HIT RATE 0.6
AVG LATENCY 6.311511993408203
p99 latency 22.04059362411499
