In [150]:
#Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
from geopy.geocoders import Nominatim
from sklearn import preprocessing
import pickle
import math
import copy

#Path to datafile
datapath = "All_zip.csv"
WINDOW_SIZE = 50
LEARNING_RATE = 0.000025
N_EPOCHS = 200
HIDDEN_SIZE = 100
EPOCHS_EARLY = 20

In [151]:
#Data cleaning
class PriceData():

    def __init__(self, filename):
        self.df = self.generateDF(filename)

    def generateDF(self, filename):
        
        def lat(row):
            return geolocator.geocode({"postalcode":row["RegionName"]})[1][0]

        def long(row):
            return geolocator.geocode({"postalcode":row["RegionName"]})[1][1]

        
        orig_df = pd.read_csv(filename)
        filtered_df = orig_df.loc[(orig_df['State'] == "CA")]
        
        

        columns = filtered_df.columns
        

        cal_df = filtered_df \
            .dropna() \
            .drop(columns=columns[3:9]) \
            .drop(columns=columns[0:2]) \
            .rename(columns={'RegionName': 'ZipCode'}) \
            .reset_index(drop=True)
        
        
        cal_df.set_index("ZipCode", inplace = True)

#         for zip_code in cal_df.index:
#             print(zip_code)
#             cal_df["windows"] = cal_df.apply(lambda row: createBatches(row["prices"]), axis = 1)
        

        return cal_df

In [152]:
class LSTM(nn.Module):
    
    def __init__(self,input_size = 1, window_size = WINDOW_SIZE, hidden_size = HIDDEN_SIZE, out_size = 1, learning_rate = LEARNING_RATE, epochs = N_EPOCHS):

        super().__init__()
        
        #Model
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size,out_size)
        self.hidden = (torch.zeros(1,1,hidden_size),torch.zeros(1,1,hidden_size))
        self.window_size = None
        self.prices = None
        self.window_size = window_size
        
        #Data processing
        self.scaler = None
        
        #Training parameters
        self.criterion = nn.MSELoss()
        self.optimiser = torch.optim.Adam(self.parameters(), lr=learning_rate)
        
        
        #Hyperparameters
        self.epochs = epochs
        
        
        
    
    def _preprocessor(self, ySeries):
        y = ySeries.values

        
        
        window_size = self.window_size

        out = []
        self.prices = y[:window_size].reshape(window_size)
        for i in range(len(y) - window_size):
            scaler = preprocessing.MinMaxScaler()
            y_scaled = scaler.fit_transform(y[i:i+window_size+1].reshape(-1, 1))
            self.scaler = scaler
            window = torch.tensor(y_scaled[:window_size]).to(torch.float32)
            label = torch.tensor(y_scaled[window_size:window_size+1]).to(torch.float32)
            self.prices = np.append(self.prices, y[i+window_size:i+window_size+1][0])
            out.append((window, label))
        return out
    
    def fit(self, y, y_val):
        y_train = self._preprocessor(y)
        
        epochs_wo_imp = 0
        lowest_rmse = math.inf
        val_rmse = math.inf
        
        
        for epoch in range(self.epochs):
            for seq, y_hat in y_train:
                self.optimiser.zero_grad()
                self.hidden = (torch.zeros(1,1,self.hidden_size), torch.zeros(1,1,self.hidden_size))

                y_pred = self(seq)
                loss = self.criterion(y_hat[0], y_pred)
                loss.backward() 
                self.optimiser.step()
            
            val_rmse = self.score(y_val)[0]
            
            if (val_rmse < lowest_rmse):
                best_model = copy.deepcopy(self)
                best_model.prices = np.append(best_model.prices, y_val)
                lowest_rmse = val_rmse
                epochs_wo_imp = 0
            else:
                epochs_wo_imp += 1
                
            if (epochs_wo_imp == EPOCHS_EARLY):
                return best_model, lowest_rmse
            
            print(f'Epoch: {epoch+1:2} Test Loss: {loss.item():10.8f} Val RMSE: {val_rmse}')
            
        return (best_model, lowest_rmse)
                
            
    def predict(self, future_instances):
        
        self.eval()
        for i in range(future_instances):
            scaler = preprocessing.MinMaxScaler()
            seq = torch.FloatTensor(scaler.fit_transform(self.prices[-self.window_size:].reshape(-1, 1)))
            self.scaler = scaler
            with torch.no_grad():
                self.hidden = (torch.zeros(1,1,self.hidden_size),
                                torch.zeros(1,1,self.hidden_size))
                self.prices = np.append(self.prices, self.scaler.inverse_transform(np.array(self(seq).item()).reshape(-1, 1)))
        
        predictions = self.prices[-future_instances:]
        self.prices = self.prices[:-future_instances]
        return predictions
    
    def predict_inplace(self, future_instances):
        
        self.eval()
        for i in range(future_instances):
            scaler = preprocessing.MinMaxScaler()
            seq = torch.FloatTensor(scaler.fit_transform(self.prices[-self.window_size:].reshape(-1, 1)))
            self.scaler = scaler
            with torch.no_grad():
                self.hidden = (torch.zeros(1,1,self.hidden_size),
                                torch.zeros(1,1,self.hidden_size))
                self.prices = np.append(self.prices, self.scaler.inverse_transform(np.array(self(seq).item()).reshape(-1, 1)))
        
        predictions = self.prices[-future_instances:]
        return predictions
    
    def score(self, y):
        y_pred = self.predict(len(y))
        rmse = np.sqrt(np.mean(((np.array(y_pred)-np.array(y)))**2))
        return rmse, y_pred

        
        
    def forward(self,seq):
        lstm_out, self.hidden = self.lstm(seq.view(len(seq),1,-1), self.hidden)
        pred = self.linear(lstm_out.view(len(seq),-1))
        return pred[-1]

In [153]:
def split_dataset(y, valid_proportion, test_proportion):

   
    
    n_test = round(len(y) * test_proportion)
    n_valid = round(len(y) * valid_proportion)
    n_train = (len(y) - n_test) - n_valid
    
    y_train = y[:n_train]
    y_valid = y[n_train:n_train + n_valid]
    y_test = y[n_train + n_valid:]
    
    return (y_train, y_valid, y_test)


def save_regressor(trained_model, zipcode): 
    """ 
    Utility function to save the trained regressor model in part2_model.pickle.
    """
    # If you alter this, make sure it works in tandem with load_regressor
    with open(str(zipcode) + '_model.pickle', 'wb') as target:
        pickle.dump(trained_model, target)
    print(f"\nSaved model in {zipcode}_model.pickle\n")


def load_regressor(zipcode): 
    """ 
    Utility function to load the trained regressor model in part2_model.pickle.
    """
    # If you alter this, make sure it works in tandem with save_regressor
    with open(str(zipcode) + '_model.pickle', 'rb') as target:
        trained_model = pickle.load(target)
    print(f"\nLoaded model in {zipcode}_model.pickle\n")
    return trained_model

    

In [None]:
def hyperparameterSearch( window_sizes = [10, 20, 40, 80, 100], hidden_sizes = [10, 20, 40, 60, 80, 100, 150], zipcode = 91331, df = cal_df):
    lowest_rmse = math.inf
    val_rmse = math.inf
    

    for window_size in window_sizes:
        WINDOW_SIZE = window_size
        for hidden_size in hidden_sizes:
            HIDDEN_SIZE = hidden_size
            
            print(f"Fitting for window_size of {WINDOW_SIZE} and hidden size of {HIDDEN_SIZE}")
            y = df.loc[zipcode]

            y_train, y_valid, y_test = split_dataset(y, 0.1, 0.1)

            model = LSTM(window_size = WINDOW_SIZE, hidden_size = HIDDEN_SIZE)
            model, val_rmse = model.fit(y_train, y_valid)
            print(f"Model trained for window_size of {model.window_size} and hidden size of {model.hidden_size}")


            if (val_rmse < lowest_rmse):
                best_model = copy.deepcopy(model)
                lowest_rmse = val_rmse
                
    save_regressor(best_model,zipcode)
    return best_model

def evaluate(zipcode):
    model = load_regressor(zipcode)
    print("Window size:", model.window_size, "Hidden size:", model.hidden_size)
    y = cal_df.loc[zipcode]
    y_train, y_valid, y_test = split_dataset(y, 0.1, 0.1)
    test_rmse, y_preds = model.score(y_test)
    print("RMSE:", test_rmse)


    plt.plot(np.append(model.prices, y_preds))
    plt.plot(np.append(y_train, np.append(y_valid, y_test)))
                

if __name__ == "__main__":
    cal_df = PriceData(datapath).df
    zip_codes = cal_df.index[:10]
    for zip_code in zip_codes:
        print(f"Training Model for ZipCode {zip_code}")
        hyperparameterSearch(zipcode = zip_code, df = cal_df)

        print(f"Evaluating Model for ZipCode {zip_code}")
        evaluate(zip_code)

Training Model for ZipCode 91331
Fitting for window_size of 10 and hidden size of 10
Epoch:  1 Test Loss: 0.54288834 Val RMSE: 53803.49783892542
Epoch:  2 Test Loss: 0.50027466 Val RMSE: 52416.04993946533
Epoch:  3 Test Loss: 0.45984966 Val RMSE: 51060.538186458514
Epoch:  4 Test Loss: 0.42151058 Val RMSE: 49739.28999504545
Epoch:  5 Test Loss: 0.38514471 Val RMSE: 48458.85692610428
Epoch:  6 Test Loss: 0.35064951 Val RMSE: 47218.17037789746
Epoch:  7 Test Loss: 0.31793797 Val RMSE: 46010.99460238595
Epoch:  8 Test Loss: 0.28696826 Val RMSE: 44852.39534790395
Epoch:  9 Test Loss: 0.25776991 Val RMSE: 43740.71303747185
Epoch: 10 Test Loss: 0.23044603 Val RMSE: 42677.693999607654
Epoch: 11 Test Loss: 0.20513573 Val RMSE: 41670.63316118327
Epoch: 12 Test Loss: 0.18196914 Val RMSE: 40749.11147209886
Epoch: 13 Test Loss: 0.16103907 Val RMSE: 39903.35058080892
Epoch: 14 Test Loss: 0.14238365 Val RMSE: 39136.58976230024
Epoch: 15 Test Loss: 0.12597829 Val RMSE: 38457.73527090033
Epoch: 16 Tes

Epoch: 58 Test Loss: 0.01391969 Val RMSE: 34927.62240286275
Epoch: 59 Test Loss: 0.01176443 Val RMSE: 35077.699457266455
Epoch: 60 Test Loss: 0.00965686 Val RMSE: 35263.466137007876
Epoch: 61 Test Loss: 0.00764596 Val RMSE: 35482.81765438434
Epoch: 62 Test Loss: 0.00578538 Val RMSE: 35756.132850887356
Epoch: 63 Test Loss: 0.00412864 Val RMSE: 36102.41715321133
Epoch: 64 Test Loss: 0.00272293 Val RMSE: 36241.997887894206
Epoch: 65 Test Loss: 0.00160311 Val RMSE: 36330.987082462525
Epoch: 66 Test Loss: 0.00078660 Val RMSE: 36442.39938310546
Epoch: 67 Test Loss: 0.00027021 Val RMSE: 36560.26908682617
Epoch: 68 Test Loss: 0.00002960 Val RMSE: 36403.983009496784
Epoch: 69 Test Loss: 0.00002252 Val RMSE: 36062.65164563662
Epoch: 70 Test Loss: 0.00019499 Val RMSE: 35732.63836739863
Epoch: 71 Test Loss: 0.00048901 Val RMSE: 35464.002212238134
Model trained for window_size of 10 and hidden size of 20
Fitting for window_size of 10 and hidden size of 40
Epoch:  1 Test Loss: 0.49781933 Val RMSE: 5

Epoch: 20 Test Loss: 0.02007427 Val RMSE: 34453.30700682975
Epoch: 21 Test Loss: 0.01435105 Val RMSE: 34568.794153366005
Epoch: 22 Test Loss: 0.00885230 Val RMSE: 34822.524071585
Epoch: 23 Test Loss: 0.00425539 Val RMSE: 35263.408117173225
Epoch: 24 Test Loss: 0.00121712 Val RMSE: 35430.22771144594
Epoch: 25 Test Loss: 0.00003029 Val RMSE: 35192.37378772701
Epoch: 26 Test Loss: 0.00041311 Val RMSE: 34231.96993375574
Epoch: 27 Test Loss: 0.00166319 Val RMSE: 33501.01193134094
Epoch: 28 Test Loss: 0.00307242 Val RMSE: 33053.203788445506
Epoch: 29 Test Loss: 0.00422566 Val RMSE: 32801.01548782108
Epoch: 30 Test Loss: 0.00499607 Val RMSE: 32687.13303601007
Epoch: 31 Test Loss: 0.00540687 Val RMSE: 32630.793068602692
Epoch: 32 Test Loss: 0.00552991 Val RMSE: 32428.264976648632
Epoch: 33 Test Loss: 0.00544021 Val RMSE: 32232.11438659309
Epoch: 34 Test Loss: 0.00520062 Val RMSE: 32094.805962855124
Epoch: 35 Test Loss: 0.00485962 Val RMSE: 32028.847453719543
Epoch: 36 Test Loss: 0.00445387 Val

Epoch:  7 Test Loss: 0.06665197 Val RMSE: 35657.70329412074
Epoch:  8 Test Loss: 0.06303789 Val RMSE: 35552.83397130359
Epoch:  9 Test Loss: 0.05941722 Val RMSE: 35449.20967170489
Epoch: 10 Test Loss: 0.05559086 Val RMSE: 35340.00309865827
Epoch: 11 Test Loss: 0.05138844 Val RMSE: 35219.555696474716
Epoch: 12 Test Loss: 0.04664468 Val RMSE: 35081.41605380744
Epoch: 13 Test Loss: 0.04119417 Val RMSE: 34921.7879819793
Epoch: 14 Test Loss: 0.03488135 Val RMSE: 34735.89820949756
Epoch: 15 Test Loss: 0.02760890 Val RMSE: 34553.99770454303
Epoch: 16 Test Loss: 0.01947960 Val RMSE: 34439.03647988288
Epoch: 17 Test Loss: 0.01110714 Val RMSE: 34565.43968915745
Epoch: 18 Test Loss: 0.00399465 Val RMSE: 34934.60407243185
Epoch: 19 Test Loss: 0.00026309 Val RMSE: 35033.3616807676
Epoch: 20 Test Loss: 0.00073101 Val RMSE: 33115.08299773251
Epoch: 21 Test Loss: 0.00328781 Val RMSE: 31712.089871383305
Epoch: 22 Test Loss: 0.00542578 Val RMSE: 31491.87102814283
Epoch: 23 Test Loss: 0.00668190 Val RMSE

Epoch: 64 Test Loss: 0.02226522 Val RMSE: 34238.95223341143
Epoch: 65 Test Loss: 0.02182591 Val RMSE: 34143.953859058594
Epoch: 66 Test Loss: 0.02137258 Val RMSE: 34049.197755259214
Epoch: 67 Test Loss: 0.02090414 Val RMSE: 33955.60479497938
Epoch: 68 Test Loss: 0.02041966 Val RMSE: 33864.31252494941
Epoch: 69 Test Loss: 0.01991789 Val RMSE: 33776.57589361207
Epoch: 70 Test Loss: 0.01939755 Val RMSE: 33693.83094147392
Epoch: 71 Test Loss: 0.01885743 Val RMSE: 33617.66855947126
Epoch: 72 Test Loss: 0.01829649 Val RMSE: 33549.8427178219
Epoch: 73 Test Loss: 0.01771337 Val RMSE: 33431.673779292825
Epoch: 74 Test Loss: 0.01710677 Val RMSE: 33309.86047009787
Epoch: 75 Test Loss: 0.01647523 Val RMSE: 33015.00142418969
Epoch: 76 Test Loss: 0.01581776 Val RMSE: 32657.619240609085
Epoch: 77 Test Loss: 0.01513339 Val RMSE: 32251.557082402043
Epoch: 78 Test Loss: 0.01442130 Val RMSE: 31822.491838511363
Epoch: 79 Test Loss: 0.01368095 Val RMSE: 31340.56369309485
Epoch: 80 Test Loss: 0.01291240 Val

Epoch: 86 Test Loss: 0.00044274 Val RMSE: 16839.02164626893
Epoch: 87 Test Loss: 0.00043251 Val RMSE: 16892.59922388818
Epoch: 88 Test Loss: 0.00042259 Val RMSE: 16947.501188772407
Epoch: 89 Test Loss: 0.00041299 Val RMSE: 17003.17518114193
Epoch: 90 Test Loss: 0.00040369 Val RMSE: 17059.116655959147
Epoch: 91 Test Loss: 0.00039467 Val RMSE: 17115.007921624958
Epoch: 92 Test Loss: 0.00038592 Val RMSE: 17170.541232603584
Epoch: 93 Test Loss: 0.00037744 Val RMSE: 17225.510652989367
Epoch: 94 Test Loss: 0.00036920 Val RMSE: 17279.73297426878
Epoch: 95 Test Loss: 0.00036121 Val RMSE: 17333.13986852128
Epoch: 96 Test Loss: 0.00035343 Val RMSE: 17385.552743858276
Model trained for window_size of 20 and hidden size of 20
Fitting for window_size of 20 and hidden size of 40
Epoch:  1 Test Loss: 1.42535675 Val RMSE: 2253224.015187576
Epoch:  2 Test Loss: 1.27563107 Val RMSE: 687562.2731673368
Epoch:  3 Test Loss: 1.11465311 Val RMSE: 210644.4106661223
Epoch:  4 Test Loss: 0.91295010 Val RMSE: 84

Epoch: 63 Test Loss: 0.00199844 Val RMSE: 16431.740081438875
Epoch: 64 Test Loss: 0.00192551 Val RMSE: 16540.27109727234
Epoch: 65 Test Loss: 0.00185711 Val RMSE: 16652.790891558576
Epoch: 66 Test Loss: 0.00179283 Val RMSE: 16769.125626397876
Epoch: 67 Test Loss: 0.00173233 Val RMSE: 16888.978565718156
Epoch: 68 Test Loss: 0.00167530 Val RMSE: 17007.382536103654
Epoch: 69 Test Loss: 0.00162150 Val RMSE: 17129.269960745478
Epoch: 70 Test Loss: 0.00157064 Val RMSE: 17254.460174160315
Epoch: 71 Test Loss: 0.00152251 Val RMSE: 17382.599823409175
Model trained for window_size of 20 and hidden size of 60
Fitting for window_size of 20 and hidden size of 80
Epoch:  1 Test Loss: 1.04213071 Val RMSE: 121133.10713075043
Epoch:  2 Test Loss: 0.75439876 Val RMSE: 76823.14662576994
Epoch:  3 Test Loss: 0.11304074 Val RMSE: 41410.456688489765
Epoch:  4 Test Loss: 0.09157392 Val RMSE: 40154.220262423274
Epoch:  5 Test Loss: 0.08342589 Val RMSE: 39683.66584963269
Epoch:  6 Test Loss: 0.07897518 Val RMS

Epoch: 13 Test Loss: 0.17473365 Val RMSE: 65530.803844720795
Epoch: 14 Test Loss: 0.13966124 Val RMSE: 61955.75219958067
Epoch: 15 Test Loss: 0.11169599 Val RMSE: 58718.96374597927
Epoch: 16 Test Loss: 0.09010515 Val RMSE: 55800.79154851411
Epoch: 17 Test Loss: 0.07380769 Val RMSE: 53276.76937097259
Epoch: 18 Test Loss: 0.06165823 Val RMSE: 51163.7472348996
Epoch: 19 Test Loss: 0.05262998 Val RMSE: 49432.31599801762
Epoch: 20 Test Loss: 0.04589031 Val RMSE: 48028.060751455
Epoch: 21 Test Loss: 0.04080480 Val RMSE: 46889.81396031441
Epoch: 22 Test Loss: 0.03690699 Val RMSE: 45960.17795135864
Epoch: 23 Test Loss: 0.03386142 Val RMSE: 45190.26068502089
Epoch: 24 Test Loss: 0.03142899 Val RMSE: 44540.4747574218
Epoch: 25 Test Loss: 0.02944040 Val RMSE: 43979.899961510775
Epoch: 26 Test Loss: 0.02777534 Val RMSE: 43484.78611175959
Epoch: 27 Test Loss: 0.02634864 Val RMSE: 43037.194701680215
Epoch: 28 Test Loss: 0.02509953 Val RMSE: 42623.65109259866
Epoch: 29 Test Loss: 0.02398524 Val RMSE:

Epoch: 20 Test Loss: 0.03184095 Val RMSE: 43021.67259226572
Epoch: 21 Test Loss: 0.03069704 Val RMSE: 42656.740363780205
Epoch: 22 Test Loss: 0.02961858 Val RMSE: 42295.122398245774
Epoch: 23 Test Loss: 0.02859640 Val RMSE: 41935.045831130104
Epoch: 24 Test Loss: 0.02762436 Val RMSE: 41575.524425015006
Epoch: 25 Test Loss: 0.02669827 Val RMSE: 41216.05347682268
Epoch: 26 Test Loss: 0.02581475 Val RMSE: 40856.333029881775
Epoch: 27 Test Loss: 0.02497110 Val RMSE: 40496.25170478836
Epoch: 28 Test Loss: 0.02416520 Val RMSE: 40135.78803301115
Epoch: 29 Test Loss: 0.02339487 Val RMSE: 39774.85414316429
Epoch: 30 Test Loss: 0.02265816 Val RMSE: 39413.42193240144
Epoch: 31 Test Loss: 0.02195352 Val RMSE: 39051.54397064728
Epoch: 32 Test Loss: 0.02127928 Val RMSE: 38689.292065729256
Epoch: 33 Test Loss: 0.02063411 Val RMSE: 38326.89034400251
Epoch: 34 Test Loss: 0.02001624 Val RMSE: 37964.5154229962
Epoch: 35 Test Loss: 0.01942392 Val RMSE: 37602.25651616542
Epoch: 36 Test Loss: 0.01885471 Val

Epoch: 64 Test Loss: 0.00305980 Val RMSE: 25567.168404755772
Epoch: 65 Test Loss: 0.00293385 Val RMSE: 28482.8725662181
Epoch: 66 Test Loss: 0.00280474 Val RMSE: 31506.228667198702
Epoch: 67 Test Loss: 0.00267092 Val RMSE: 34619.11815610967
Epoch: 68 Test Loss: 0.00253188 Val RMSE: 37820.22328667733
Epoch: 69 Test Loss: 0.00238818 Val RMSE: 41146.67727043762
Epoch: 70 Test Loss: 0.00224112 Val RMSE: 44656.446392284204
Epoch: 71 Test Loss: 0.00209263 Val RMSE: 48275.53440400652
Epoch: 72 Test Loss: 0.00194492 Val RMSE: 51993.92402872607
Epoch: 73 Test Loss: 0.00180029 Val RMSE: 55786.32853580105
Epoch: 74 Test Loss: 0.00166091 Val RMSE: 59614.12338909267
Epoch: 75 Test Loss: 0.00152867 Val RMSE: 63423.52843022643
Model trained for window_size of 40 and hidden size of 40
Fitting for window_size of 40 and hidden size of 60
Epoch:  1 Test Loss: 0.97992855 Val RMSE: 244813.55166496555
Epoch:  2 Test Loss: 0.79246360 Val RMSE: 126554.11190548625
Epoch:  3 Test Loss: 0.43485010 Val RMSE: 9475