1.4 Predicting high temperatures at SFO

In [33]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [13]:
# Read the SFO weather table and keep only the rows that have a recorded
# daily high; rows where 'temphigh' is NA cannot be used as regression targets.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the CSV carries a saved index -- consider read_csv(..., index_col=0)
# once it is confirmed that no other cell relies on that column.
df = pd.read_csv('simplified-sfo-weather.csv').dropna(subset=['temphigh'])
df

Unnamed: 0,year,day,precip,templow,temphigh
0,1960,1,0.00,34.0,48.0
1,1960,2,0.00,29.0,47.0
2,1960,3,0.00,38.0,53.0
3,1960,4,0.00,33.0,56.0
4,1960,5,0.00,35.0,52.0
...,...,...,...,...,...
22277,2020,363,0.02,47.0,56.0
22278,2020,364,0.00,41.0,61.0
22279,2020,365,0.04,40.0,57.0
22280,2020,366,0.03,47.0,60.0


In [29]:
def generate(data):
    """Build the design matrix and response vector for the seasonal model.

    The model is ``temphigh ~ b0 + b1*sin(w*(day-1)) + b2*cos(w*(day-1))``
    with ``w = 2*pi/365``: an intercept plus a single annual harmonic.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``'day'`` column and a ``'temphigh'`` column.

    Returns
    -------
    X : numpy.ndarray, shape (N, 3)
        Columns are ``[1, sin(phase), cos(phase)]``, one row per row of
        ``data``.
    Y : numpy.ndarray, shape (N,)
        The ``'temphigh'`` values in row order.

    Notes
    -----
    The period is fixed at 365, so day 366 gets phase ``2*pi`` and lands
    (up to floating-point rounding) on the same point of the cycle as day 1.
    """
    day_of_year = data['day'].to_numpy()
    n_rows = data.shape[0]

    # Phase of each observation within the annual cycle; day 1 maps to 0.
    phase = 2 * np.pi / 365 * (day_of_year - 1)

    # Column 0 is the intercept, columns 1-2 are the harmonic terms.
    X = np.empty((n_rows, 3))
    X[:, 0] = 1.0
    X[:, 1] = np.sin(phase)
    X[:, 2] = np.cos(phase)

    Y = data['temphigh'].to_numpy()
    return X, Y

# Training set: every observation from years strictly before 1990.
data = df[df['year'].lt(1990)]
X, Y = generate(data)

# Ordinary least squares through the normal equations (X^T X) beta = X^T Y.
# beta_hat holds [intercept, sin coefficient, cos coefficient].
gram = X.T @ X
moment = X.T @ Y
beta_hat = np.linalg.solve(gram, moment)

In [32]:
# Mean residual (actual high minus the seasonal prediction) per ten-year
# window starting at 1961, 1971, ..., 2011. A positive mean indicates the
# window ran warmer than the fitted curve predicts.
# NOTE(review): beta_hat is fit on years before 1990, so the windows up to
# 1981-1990 are almost entirely in-sample; only the later ones are held out.
for i in range(1961, 2012, 10):
    in_window = df['year'].ge(i) & df['year'].lt(i + 10)
    test_data = df[in_window]
    test_X, test_Y = generate(test_data)
    residuals = test_Y - test_X @ beta_hat
    print(f'mean actual high temp minus predicted for decade {i}-{i+9}:', np.mean(residuals))

mean actual high temp minus predicted for decade 1961-1970: -0.4275654508959994
mean actual high temp minus predicted for decade 1971-1980: -0.4115153262151383
mean actual high temp minus predicted for decade 1981-1990: 0.9015692698049863
mean actual high temp minus predicted for decade 1991-2000: 0.5917696450413631
mean actual high temp minus predicted for decade 2001-2010: 0.31942250091122953
mean actual high temp minus predicted for decade 2011-2020: 1.8144859407148064
