In [1]:
import random
from collections import deque

import numpy as np
import pandas as pd
import requests

In [2]:
# Build a skewed, reproducible request stream over a random subset of the
# Wisconsin weather stations.
STATIONS_URL = "https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt"
NUM_STATIONS = 10    # distinct stations the workload touches (must be >= 3)
NUM_REQUESTS = 100   # length of the request stream
WORKLOAD_SEED = 544  # fixed seed so Restart & Run All yields the same workload

# Private generator: reproducible without reseeding the global `random` state.
rng = random.Random(WORKLOAD_SEED)

# requests waits forever by default; a timeout keeps the cell from hanging
# if the server stops responding.
r = requests.get(STATIONS_URL, timeout=30)
r.raise_for_status()
# One station ID per line. splitlines() also handles "\r\n" endings, which
# split("\n") would leave as a trailing "\r" on every ID.
stations = r.text.strip().splitlines()
stations = rng.sample(stations, k=NUM_STATIONS)

# Two "hot" stations get 30% and 20% of the requests; the remaining stations
# share the other 50% evenly.
cold_count = NUM_STATIONS - 2
workload = rng.choices(
    stations,
    k=NUM_REQUESTS,
    weights=[0.3, 0.2] + [0.5 / cold_count] * cold_count,
)

In [4]:
# Preview the first ten requests in the workload.
workload[:10]

['USC00478080',
 'US1WIMW0028',
 'USC00478349',
 'USC00478080',
 'US1WIMW0031',
 'USC00478080',
 'US1WIMW0031',
 'US1WICR0004',
 'US1WIMW0031',
 'USC00478080']

In [15]:
import time

In [32]:
# Sanity-check the timing approach: a 3-second sleep should measure ~3000 ms.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# (e.g. by NTP) in the middle of a measurement.
start = time.perf_counter()
time.sleep(3)
end = time.perf_counter()
(end-start)*1000

3000.316381454468

In [44]:
# FIFO cache holding at most `cache_size` station DataFrames
cache = {} # key=station name, value=station DataFrame
cache_order = deque()  # insertion order: evict from the left, newest on the right
cache_size = 3

hits = [] # 1 for hit, 0 for miss
latency_ms = []  # elapsed time of every get_station() call, in milliseconds

def get_station(station):
    """Return the DataFrame for `station`, served through the FIFO cache.

    On a hit the cached DataFrame is returned and the eviction order is left
    untouched (FIFO ignores recency).  On a miss the station's CSV is read
    from the course server, stored in the cache, and, if the cache now holds
    more than `cache_size` entries, the oldest inserted entry is evicted.

    Every completed call appends 1 (hit) or 0 (miss) to `hits` and the elapsed
    time in milliseconds to `latency_ms`.

    Parameters:
        station: station ID used both as the cache key and in the CSV URL.

    Returns:
        The station's pandas DataFrame (the cached object itself, not a copy).
    """
    # perf_counter() is monotonic, so the measured latency can never be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    if station in cache:
        df = cache[station]
        hits.append(1)
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                    names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)
        hits.append(0)
        cache[station] = df
        cache_order.append(station)
        if len(cache) > cache_size:
            # deque.popleft() is O(1); list.pop(0) shifts every element (O(N)).
            victim = cache_order.popleft()
            cache.pop(victim)
    end = time.perf_counter()
    latency_ms.append((end-start)*1000)
    return df

# Replay the whole workload through the cache.  Only the recorded hit/latency
# statistics matter here, so the returned DataFrames are discarded rather than
# rebinding the loop variable to them.
for station in workload:
    get_station(station)

# Summarize the run: hit count and rate, then mean, median and
# 99th-percentile latency in milliseconds.
median_latency, p99_latency = np.quantile(latency_ms, [0.5, 0.99])
print("HITS", sum(hits))
print("HIT RATE", sum(hits)/len(hits))
print("AVG LATENCY", sum(latency_ms)/len(latency_ms))
print("MED LATENCY", median_latency)
print("p99", p99_latency)

HITS 32
HIT RATE 0.32
AVG LATENCY 24.601354598999023
MED LATENCY 13.72218132019043
p99 175.90838909149173


In [46]:
# LRU cache holding at most `cache_size` station DataFrames
cache = {} # key=station name, value=station DataFrame
cache_order = deque()  # recency order: least recently used on the left, most recent on the right
cache_size = 3

hits = [] # 1 for hit, 0 for miss
latency_ms = []  # elapsed time of every get_station() call, in milliseconds

def get_station(station):
    """Return the DataFrame for `station`, served through the LRU cache.

    On a hit the cached DataFrame is returned and the station is moved to the
    most-recently-used end of `cache_order`.  On a miss the station's CSV is
    read from the course server, stored in the cache, and, if the cache now
    holds more than `cache_size` entries, the least recently used entry is
    evicted.

    Every completed call appends 1 (hit) or 0 (miss) to `hits` and the elapsed
    time in milliseconds to `latency_ms`.

    Parameters:
        station: station ID used both as the cache key and in the CSV URL.

    Returns:
        The station's pandas DataFrame (the cached object itself, not a copy).
    """
    # perf_counter() is monotonic, so the measured latency can never be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    if station in cache:
        df = cache[station]
        # Refresh recency: move the station to the right end.  remove() scans
        # the deque, which is O(cache_size).
        cache_order.remove(station)
        cache_order.append(station)
        hits.append(1)
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                    names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)
        hits.append(0)
        cache[station] = df
        cache_order.append(station)
        if len(cache) > cache_size:
            # deque.popleft() is O(1); list.pop(0) shifts every element (O(N)).
            victim = cache_order.popleft()
            cache.pop(victim)
    end = time.perf_counter()
    latency_ms.append((end-start)*1000)
    return df

# Replay the whole workload through the cache.  Only the recorded hit/latency
# statistics matter here, so the returned DataFrames are discarded rather than
# rebinding the loop variable to them.
for station in workload:
    get_station(station)

# Summarize the run: hit count and rate, then mean, median and
# 99th-percentile latency in milliseconds.
median_latency, p99_latency = np.quantile(latency_ms, [0.5, 0.99])
print("HITS", sum(hits))
print("HIT RATE", sum(hits)/len(hits))
print("AVG LATENCY", sum(latency_ms)/len(latency_ms))
print("MED LATENCY", median_latency)
print("p99", p99_latency)

HITS 35
HIT RATE 0.35
AVG LATENCY 38.46561670303345
MED LATENCY 16.175150871276855
p99 209.5409107208278
