In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
from geopy.geocoders import Nominatim
from sklearn import preprocessing
import pickle
import math
import copy

# CSV file handed to PriceData in the __main__ block.
datapath = "All_zip.csv"
# Default number of consecutive prices in one LSTM input window.
WINDOW_SIZE = 50
# Default learning rate passed to the Adam optimiser built in LSTM.__init__.
LEARNING_RATE = 0.000025
# Default upper bound on the number of training epochs in LSTM.fit.
N_EPOCHS = 100
# Default size of the LSTM hidden state.
HIDDEN_SIZE = 100
# LSTM.fit stops early once this many considered epochs pass without a
# lower validation RMSE.
EPOCHS_EARLY = 20

class PriceData():
    """Load a price CSV and keep only complete California rows, indexed by zip code.

    Attributes:
        df: the frame produced by ``generateDF`` for the file given to the
            constructor.
    """

    def __init__(self, filename):
        """Read ``filename`` and store the cleaned frame on ``self.df``."""
        self.df = self.generateDF(filename)

    def generateDF(self, filename):
        """Build the zip-code-indexed price frame from a CSV file.

        Steps, in order:
          1. read the CSV and keep the rows whose ``State`` column is "CA";
          2. drop every row that contains a missing value in any column;
          3. drop the columns at positions 0-1 and 3-8, keeping position 2
             (expected to be ``RegionName``) and everything from position 9 on;
          4. rename ``RegionName`` to ``ZipCode`` and use it as the index.

        Args:
            filename: path (or anything ``pandas.read_csv`` accepts) of the CSV.

        Returns:
            A DataFrame indexed by ``ZipCode``.

        NOTE(review): the positional column drops assume a fixed column
        layout in the input file (metadata in the first nine columns,
        presumably one price column per date afterwards) - confirm against
        the data source.
        """
        orig_df = pd.read_csv(filename)
        filtered_df = orig_df.loc[orig_df['State'] == "CA"]

        # Positions are taken from the unfiltered layout, which the row
        # filter above does not change.
        columns = filtered_df.columns

        # dropna() runs before the metadata columns are removed, so a gap in
        # any of those columns discards the row as well.
        cal_df = (
            filtered_df
            .dropna()
            .drop(columns=columns[3:9])
            .drop(columns=columns[0:2])
            .rename(columns={'RegionName': 'ZipCode'})
            .set_index("ZipCode")
        )

        return cal_df
    
    
    
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer for one-step price forecasts.

    The model is trained on sliding windows of a price series; every window
    is min-max scaled on its own. Forecasting is autoregressive: each
    predicted price is appended to ``self.prices`` and becomes part of the
    next input window.

    Attributes:
        hidden_size: size of the LSTM hidden state.
        window_size: number of consecutive prices fed to the LSTM at once.
        hidden: (h, c) state tuple handed to ``nn.LSTM``; reset to zeros
            before every sequence in ``fit`` and before every forecast step.
        prices: 1-D array of unscaled prices used as forecasting history
            (``None`` until ``fit`` / ``_preprocessor`` has run).
        scaler: the most recently fitted ``MinMaxScaler``.
        criterion: MSE loss.
        optimiser: Adam over all model parameters.
        epochs: maximum number of training epochs.
    """

    def __init__(self, input_size=1, window_size=WINDOW_SIZE, hidden_size=HIDDEN_SIZE, out_size=1, learning_rate=LEARNING_RATE, epochs=N_EPOCHS):
        """Create the network, its loss and its optimiser.

        Args:
            input_size: number of features per time step.
            window_size: number of time steps per input window.
            hidden_size: size of the LSTM hidden state.
            out_size: number of outputs of the final linear layer.
            learning_rate: learning rate for Adam.
            epochs: maximum number of epochs ``fit`` will run.
        """
        super().__init__()

        # Model
        self.hidden_size = hidden_size
        self.window_size = window_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size, out_size)
        self.hidden = self._zero_hidden()
        self.prices = None

        # Data processing
        self.scaler = None

        # Training parameters
        self.criterion = nn.MSELoss()
        self.optimiser = torch.optim.Adam(self.parameters(), lr=learning_rate)

        # Hyperparameters
        self.epochs = epochs

    def _zero_hidden(self):
        """Return a fresh all-zero (h, c) state for one layer and batch size 1."""
        return (torch.zeros(1, 1, self.hidden_size),
                torch.zeros(1, 1, self.hidden_size))

    def _preprocessor(self, ySeries):
        """Turn a price series into scaled (window, label) training pairs.

        Also stores an unscaled 1-D copy of the whole series on
        ``self.prices`` so forecasting can continue from its end.

        Args:
            ySeries: object exposing ``.values`` (e.g. a pandas Series) with
                at least ``window_size`` prices.

        Returns:
            A list of ``(window, label)`` float32 tensor pairs. ``window`` has
            ``window_size`` rows and one column; ``label`` is the single
            following value. The list is empty when the series holds exactly
            ``window_size`` prices.

        Raises:
            ValueError: if the series is shorter than ``window_size``.
        """
        y = ySeries.values
        window_size = self.window_size

        if len(y) < window_size:
            raise ValueError(
                f"need at least {window_size} prices to build a window, got {len(y)}"
            )

        # The history is simply the full series; copying it once replaces the
        # element-by-element np.append the loop used to perform.
        self.prices = np.array(y).reshape(-1)

        out = []
        for start in range(len(y) - window_size):
            # NOTE(review): the scaler is fitted on the window *and* its
            # label, whereas forecasting fits it on the window alone.
            scaler = preprocessing.MinMaxScaler()
            y_scaled = scaler.fit_transform(y[start:start + window_size + 1].reshape(-1, 1))
            self.scaler = scaler
            window = torch.tensor(y_scaled[:window_size]).to(torch.float32)
            label = torch.tensor(y_scaled[window_size:window_size + 1]).to(torch.float32)
            out.append((window, label))
        return out

    def _snapshot(self, y_val):
        """Return a deep copy of the model whose history also contains ``y_val``.

        The hidden state is zeroed first: it is reset before every use anyway,
        and tensors that still carry autograd history cannot be deep-copied.
        """
        self.hidden = self._zero_hidden()
        snapshot = copy.deepcopy(self)
        snapshot.prices = np.append(snapshot.prices, y_val)
        return snapshot

    def fit(self, y, y_val):
        """Train on ``y`` and keep the snapshot with the lowest validation RMSE.

        After each epoch the model forecasts ``len(y_val)`` steps and the RMSE
        against ``y_val`` is computed. Epochs with index 0-2 are never
        considered for the best snapshot. Training stops early after
        ``EPOCHS_EARLY`` considered epochs without a lower RMSE.

        Args:
            y: training series (object exposing ``.values``).
            y_val: validation prices that directly follow ``y``.

        Returns:
            ``(best_model, lowest_rmse)``. ``best_model`` is a deep copy whose
            ``prices`` history has ``y_val`` appended. If no epoch ever
            qualified (for example ``epochs <= 3``), the final state of the
            model and its last validation RMSE are returned instead.

        Raises:
            ValueError: if ``y`` does not yield a single training window.
        """
        y_train = self._preprocessor(y)
        if not y_train:
            raise ValueError(
                "training series must be longer than window_size to produce training windows"
            )

        epochs_wo_imp = 0
        lowest_rmse = math.inf
        val_rmse = math.inf
        best_model = None

        for epoch in range(self.epochs):
            # score() below leaves the module in eval mode; switch back.
            self.train()
            for seq, y_hat in y_train:
                self.optimiser.zero_grad()
                # Windows are independent samples, so start from a zero state.
                self.hidden = self._zero_hidden()

                y_pred = self(seq)
                loss = self.criterion(y_hat[0], y_pred)
                loss.backward()
                self.optimiser.step()

            val_rmse = self.score(y_val)[0]

            if epoch > 2:
                if val_rmse < lowest_rmse:
                    best_model = self._snapshot(y_val)
                    lowest_rmse = val_rmse
                    epochs_wo_imp = 0
                else:
                    epochs_wo_imp += 1

                if epochs_wo_imp == EPOCHS_EARLY:
                    break

            # The printed loss is the loss of the last training window.
            print(f'Epoch: {epoch+1:2} Test Loss: {loss.item():10.8f} Val RMSE: {val_rmse}')

        if best_model is None:
            # No epoch qualified; fall back to the current weights rather
            # than failing on an unbound variable.
            best_model = self._snapshot(y_val)
            lowest_rmse = val_rmse

        return (best_model, lowest_rmse)

    def _extend_history(self, future_instances):
        """Append ``future_instances`` autoregressive forecasts to ``self.prices``."""
        self.eval()
        for _ in range(future_instances):
            scaler = preprocessing.MinMaxScaler()
            seq = torch.FloatTensor(scaler.fit_transform(self.prices[-self.window_size:].reshape(-1, 1)))
            self.scaler = scaler
            with torch.no_grad():
                self.hidden = self._zero_hidden()
                scaled_prediction = self(seq).item()
            # Undo the per-window scaling before storing the price.
            price = scaler.inverse_transform(np.array(scaled_prediction).reshape(-1, 1))
            self.prices = np.append(self.prices, price)

    def predict(self, future_instances):
        """Forecast the next prices without changing the stored history.

        Args:
            future_instances: number of steps to forecast.

        Returns:
            1-D array with ``future_instances`` predicted prices (empty when
            ``future_instances`` is 0).
        """
        history_length = len(self.prices)
        self._extend_history(future_instances)

        # Slicing by the remembered length (instead of [-n:] / [:-n]) keeps
        # the history intact when n == 0.
        predictions = self.prices[history_length:]
        self.prices = self.prices[:history_length]
        return predictions

    def predict_inplace(self, future_instances):
        """Forecast the next prices and keep them in ``self.prices``.

        Args:
            future_instances: number of steps to forecast.

        Returns:
            1-D array with the ``future_instances`` newly appended prices.
        """
        history_length = len(self.prices)
        self._extend_history(future_instances)
        return self.prices[history_length:]

    def score(self, y):
        """Forecast ``len(y)`` steps and compare them with ``y``.

        Returns:
            ``(rmse, y_pred)``: the root-mean-square error and the forecasts.
        """
        y_pred = self.predict(len(y))
        rmse = np.sqrt(np.mean((np.array(y_pred) - np.array(y)) ** 2))
        return rmse, y_pred

    def forward(self, seq):
        """Run one sequence through the network.

        ``seq`` is reshaped to (len(seq), 1, features), i.e. a batch of one.
        ``self.hidden`` is used as the initial state and overwritten with the
        final state.

        Returns:
            The linear layer's output for the last time step.
        """
        lstm_out, self.hidden = self.lstm(seq.view(len(seq), 1, -1), self.hidden)
        pred = self.linear(lstm_out.view(len(seq), -1))
        return pred[-1]
    
    
def split_dataset(y, valid_proportion, test_proportion):
    """Cut ``y`` into consecutive train / validation / test slices.

    The validation and test sizes are the rounded proportions of ``len(y)``;
    the training slice receives whatever is left at the front.

    Returns:
        A tuple ``(y_train, y_valid, y_test)`` of slices of ``y``.
    """
    total = len(y)
    valid_count = round(total * valid_proportion)
    test_count = round(total * test_proportion)

    # Slice boundaries: [0, valid_start) train, [valid_start, test_start)
    # validation, [test_start, end) test.
    valid_start = (total - test_count) - valid_count
    test_start = valid_start + valid_count

    return (y[:valid_start], y[valid_start:test_start], y[test_start:])


def save_regressor(trained_model, zipcode):
    """
    Pickle a trained model to ``<zipcode>_model.pickle`` in the working directory.

    The file name must stay in step with the one ``load_regressor`` opens.
    """
    filename = str(zipcode) + '_model.pickle'
    serialised = pickle.dumps(trained_model)
    with open(filename, 'wb') as target:
        target.write(serialised)
    print(f"\nSaved model in {zipcode}_model.pickle\n")


def load_regressor(zipcode):
    """
    Unpickle the model stored in ``<zipcode>_model.pickle`` and return it.

    The file name must stay in step with the one ``save_regressor`` writes.
    """
    filename = str(zipcode) + '_model.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(filename, 'rb') as target:
        payload = target.read()
    trained_model = pickle.loads(payload)
    print(f"\nLoaded model in {zipcode}_model.pickle\n")
    return trained_model


def hyperparameterSearch(window_sizes=(10, 20, 40, 80, 100), hidden_sizes=(10, 20, 40, 60, 80, 100, 150), zipcode=91331, df=None):
    """Grid-search window and hidden sizes for one zip code and save the winner.

    One LSTM is trained for every (window_size, hidden_size) pair on the
    first 80% of the zip code's series and validated on the following 10%.
    The model with the lowest validation RMSE is pickled via
    ``save_regressor`` and returned.

    Args:
        window_sizes: iterable of window sizes to try.
        hidden_sizes: iterable of hidden-state sizes to try.
        zipcode: index label of the row of ``df`` to model.
        df: frame indexed by zip code. When omitted, the module-level
            ``cal_df`` is looked up at call time (the former default
            ``df = cal_df`` was evaluated at definition time and raised
            NameError whenever ``cal_df`` did not exist yet).

    Returns:
        The best model found.

    Raises:
        ValueError: if either grid is empty, or if ``df`` is omitted and no
            module-level ``cal_df`` exists.
    """
    # Materialise once so generators work and emptiness can be checked.
    window_sizes = list(window_sizes)
    hidden_sizes = list(hidden_sizes)
    if not window_sizes or not hidden_sizes:
        raise ValueError("window_sizes and hidden_sizes must each contain at least one value")

    if df is None:
        df = globals().get("cal_df")
        if df is None:
            raise ValueError("no DataFrame given and no module-level cal_df is defined")

    # The series and its split do not depend on the hyperparameters.
    y = df.loc[zipcode]
    y_train, y_valid, _ = split_dataset(y, 0.1, 0.1)

    best_model = None
    lowest_rmse = math.inf

    for window_size in window_sizes:
        for hidden_size in hidden_sizes:
            print(f"Fitting for window_size of {window_size} and hidden size of {hidden_size}")

            model = LSTM(window_size=window_size, hidden_size=hidden_size)
            model, val_rmse = model.fit(y_train, y_valid)
            print(f"Model trained for window_size of {model.window_size} and hidden size of {model.hidden_size}")

            # Always accept the first candidate so a result exists even when
            # no RMSE compares below infinity (e.g. NaN). fit() already
            # returns an independent snapshot, so no further copy is needed.
            if best_model is None or val_rmse < lowest_rmse:
                best_model = model
                lowest_rmse = val_rmse

    save_regressor(best_model, zipcode)
    return best_model

def evaluate(zipcode, cal_df):
    """Score the saved model for ``zipcode`` on the test split and plot the result.

    Loads the pickled model, prints its window and hidden sizes, forecasts
    as many steps as the test slice holds, prints the RMSE, and draws two
    curves: the model's history followed by its forecasts, and the full
    actual series.
    """
    model = load_regressor(zipcode)
    print("Window size:", model.window_size, "Hidden size:", model.hidden_size)

    series = cal_df.loc[zipcode]
    train_part, valid_part, test_part = split_dataset(series, 0.1, 0.1)

    test_rmse, forecasts = model.score(test_part)
    print("RMSE:", test_rmse)

    # Model history plus forecasts, flattened into one curve.
    forecast_curve = np.concatenate((np.ravel(model.prices), np.ravel(forecasts)))
    # Ground truth: the three consecutive slices stitched back together.
    actual_curve = np.concatenate(
        (np.ravel(train_part), np.ravel(valid_part), np.ravel(test_part))
    )

    plt.plot(forecast_curve)
    plt.plot(actual_curve)
    
# Script entry point: train and evaluate one model per selected zip code.
if __name__ == "__main__":
    # cal_df and zip_code become module-level names here; the notebook cell
    # further down reads both of them.
    cal_df = PriceData(datapath).df
    # The [10:11] slice selects a single zip code (the one at position 10).
    zip_codes = cal_df.index[10:11]
    for zip_code in zip_codes:
        print(f"Training Model for ZipCode {zip_code}")
        # One-element grids: a single fit with window size 50 and hidden size 100.
        hyperparameterSearch( window_sizes = [50], hidden_sizes = [100], zipcode = zip_code, df = cal_df)

        print(f"Evaluating Model for ZipCode {zip_code}")
        evaluate(zip_code, cal_df)


Training Model for ZipCode 91342
Fitting for window_size of 50 and hidden size of 100
Epoch:  1 Test Loss: 0.93648630 Val RMSE: 145848.70541209934
Epoch:  2 Test Loss: 0.71711433 Val RMSE: 124605.04052143593
Epoch:  3 Test Loss: 0.16873655 Val RMSE: 68582.11572668009
Epoch:  4 Test Loss: 0.09283120 Val RMSE: 55677.11870716107
Epoch:  5 Test Loss: 0.10093425 Val RMSE: 58704.58399920813
Epoch:  6 Test Loss: 0.09235884 Val RMSE: 57610.72533828434
Epoch:  7 Test Loss: 0.08046671 Val RMSE: 55442.74324564358
Epoch:  8 Test Loss: 0.07162179 Val RMSE: 53750.075327539336
Epoch:  9 Test Loss: 0.06448394 Val RMSE: 52316.788032330165
Epoch: 10 Test Loss: 0.05827413 Val RMSE: 50992.72633898256
Epoch: 11 Test Loss: 0.05258998 Val RMSE: 49689.40890725855
Epoch: 12 Test Loss: 0.04721517 Val RMSE: 48348.85654217488
Epoch: 13 Test Loss: 0.04202900 Val RMSE: 46926.78967463009
Epoch: 14 Test Loss: 0.03696821 Val RMSE: 45385.478331923114
Epoch: 15 Test Loss: 0.03201133 Val RMSE: 43691.58614665234


In [None]:
# Re-run the evaluation for the zip code left in `zip_code` by the cell above;
# relies on that cell's `zip_code` and `cal_df` and on the pickle written by
# hyperparameterSearch.
print(f"Evaluating Model for ZipCode {zip_code}")
evaluate(zip_code, cal_df)