In [14]:
#Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
from geopy.geocoders import Nominatim
from sklearn import preprocessing

#Path to datafile
datapath = "All_zip.csv"
WINDOW_SIZE = 50
LEARNING_RATE = 0.0001
N_EPOCHS = 10

In [15]:
#Data cleaning
class PriceData():

    def __init__(self, filename):
        self.df = self.generateDF(filename)

    def generateDF(self, filename):
        
        def lat(row):
            return geolocator.geocode({"postalcode":row["RegionName"]})[1][0]

        def long(row):
            return geolocator.geocode({"postalcode":row["RegionName"]})[1][1]

        
        orig_df = pd.read_csv(filename)
        filtered_df = orig_df.loc[(orig_df['State'] == "CA")]
        
        

        columns = filtered_df.columns
        

        cal_df = filtered_df \
            .dropna() \
            .drop(columns=columns[3:9]) \
            .drop(columns=columns[0:2]) \
            .rename(columns={'RegionName': 'ZipCode'}) \
            .reset_index(drop=True)
        
        
        cal_df.set_index("ZipCode", inplace = True)

#         for zip_code in cal_df.index:
#             print(zip_code)
#             cal_df["windows"] = cal_df.apply(lambda row: createBatches(row["prices"]), axis = 1)
        

        return cal_df

In [16]:
class LSTM(nn.Module):
    
    def __init__(self,input_size = 1, hidden_size = 100, out_size = 1, learning_rate = LEARNING_RATE, epochs = N_EPOCHS):

        super().__init__()
        
        #Model
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size,out_size)
        self.hidden = (torch.zeros(1,1,hidden_size),torch.zeros(1,1,hidden_size))
        self.window_size = None
        self.prices = None
        
        #Data processing
        self.scaler = None
        
        #Training parameters
        self.criterion = nn.MSELoss()
        self.optimiser = torch.optim.Adam(self.parameters(), lr=learning_rate)
        
        
        #Hyperparameters
        self.epochs = epochs
        
        
        
    
    def _preprocessor(self, ySeries, window_size = WINDOW_SIZE):
        y = ySeries.values

        scaler = preprocessing.MinMaxScaler()
        y_scaled = scaler.fit_transform(y.reshape(-1, 1))
        self.scaler = scaler
        self.window_size = window_size

        out = []
        self.prices = y_scaled[:window_size].reshape(window_size)
        for i in range(len(y_scaled) - window_size):
            window = torch.tensor(y_scaled[i:i+window_size]).to(torch.float32)
            label = torch.tensor(y_scaled[i+window_size:i+window_size+1]).to(torch.float32)
            self.prices = np.append(self.prices, y_scaled[i+window_size:i+window_size+1][0][0])
            out.append((window, label))
        return out
    
    def fit(self, y):
        y_train = self._preprocessor(y)
        
        for epoch in range(self.epochs):
            for seq, y_hat in y_train:
                self.optimiser.zero_grad()
                self.hidden = (torch.zeros(1,1,self.hidden_size), torch.zeros(1,1,self.hidden_size))

                y_pred = self(seq)
#                 print("y_pred:", y_pred, "y_hat:", y_hat)
                loss = self.criterion(y_hat[0], y_pred)
                loss.backward() 
                self.optimiser.step()
            print(f'Epoch: {epoch+1:2} Loss: {loss.item():10.8f}')
        
    def predict(self, future_instances):
        
        self.eval()
        for i in range(future_instances):
            seq = torch.FloatTensor(self.prices[-self.window_size:])
            print("Predict window", i, ":", seq)
            with torch.no_grad():
                self.hidden = (torch.zeros(1,1,self.hidden_size),
                                torch.zeros(1,1,self.hidden_size))
                self.prices = np.append(self.prices, self(seq).item())
        
        predictions = self.scaler.inverse_transform(np.array(self.prices[self.window_size:]).reshape(-1, 1))
        print(self.prices[self.window_size])
        return predictions
    
    def score(self, y):
        y_pred = self.predict(len(y))
        rmse = np.sqrt(np.mean(((np.array(y_pred)-np.array(y)))**2))
        return rmse

        
        
    def forward(self,seq):
        lstm_out, self.hidden = self.lstm(seq.view(len(seq),1,-1), self.hidden)
        pred = self.linear(lstm_out.view(len(seq),-1))
        return pred[-1]

In [17]:
def split_dataset(ySeries, test_proportion):

    
    n_test = round(len(ySeries) * test_proportion)
    n_train = (len(ySeries) - n_test)

    y_train = y.iloc[:n_train]
    

    y_test = y.iloc[n_train :]
    
    #print (x_train, x_valid, x_test, y_train, y_valid, y_test)
    return (y_train, y_test)


if __name__ == "__main__":
    cal_df = PriceData(datapath).df

In [18]:
# plt.figure(figsize = (12,4))
# plt.title('House Prices')
# plt.xlabel('Time')
# plt.ylabel('Price')
# plt.grid(True)
# plt.autoscale(axis='x',tight=True)
# plt.plot(cal_df.loc[91331],color='#8000ff')
# plt.show()

In [19]:
y = cal_df.loc[91331]

y_train, y_test = split_dataset(y, 0.1)

model = LSTM()
model.fit(y_train)

preds = model.predict(len(y_test))
print("Predictions:", preds)
print("Golden:", y_test)

Epoch:  1 Loss: 0.00067032
Epoch:  2 Loss: 0.00600693
Epoch:  3 Loss: 0.00010316
Epoch:  4 Loss: 0.00051821
Epoch:  5 Loss: 0.00113403
Epoch:  6 Loss: 0.00104663
Epoch:  7 Loss: 0.00067235
Epoch:  8 Loss: 0.00036965
Epoch:  9 Loss: 0.00021266
Epoch: 10 Loss: 0.00015679
Predict window 0 : tensor([0.6729, 0.6774, 0.6850, 0.6948, 0.7017, 0.7054, 0.7106, 0.7149, 0.7197,
        0.7210, 0.7260, 0.7328, 0.7406, 0.7481, 0.7546, 0.7615, 0.7718, 0.7811,
        0.7920, 0.8056, 0.8197, 0.8318, 0.8396, 0.8482, 0.8565, 0.8634, 0.8700,
        0.8758, 0.8802, 0.8789, 0.8769, 0.8724, 0.8681, 0.8681, 0.8690, 0.8725,
        0.8687, 0.8703, 0.8738, 0.8855, 0.8954, 0.9097, 0.9242, 0.9430, 0.9616,
        0.9772, 0.9785, 0.9777, 0.9819, 1.0000])
Predict window 1 : tensor([0.6774, 0.6850, 0.6948, 0.7017, 0.7054, 0.7106, 0.7149, 0.7197, 0.7210,
        0.7260, 0.7328, 0.7406, 0.7481, 0.7546, 0.7615, 0.7718, 0.7811, 0.7920,
        0.8056, 0.8197, 0.8318, 0.8396, 0.8482, 0.8565, 0.8634, 0.8700, 0.8758,
   

In [None]:
y_train