In [0]:
# Break dataset into multiple dataset per unique geohash
def splitDataset(data):
  """Split a demand table into one DataFrame per unique geohash.

  Parameters
  ----------
  data : pandas.DataFrame
      Table containing a 'geohash6' column.

  Returns
  -------
  (dict, numpy.ndarray)
      The dict maps each geohash to the rows of `data` carrying it (original
      index labels and row order are kept). The array lists the geohashes in
      order of first appearance.
  """
  # Rows with a missing geohash are left out of both return values; they
  # cannot be named in the progress message and have no model to map to.
  alluniqueGeohash = data['geohash6'].dropna().unique()

  # Group once instead of scanning the whole frame with a boolean mask for
  # every geohash (which costs rows x geohashes comparisons).
  grouped = data.groupby('geohash6', sort=False)

  dict_of_df = {}
  for keyName in alluniqueGeohash:
    dict_of_df[keyName] = grouped.get_group(keyName)
    print( keyName + " dataset created")
  return dict_of_df, alluniqueGeohash

In [0]:
# Convert day & timestamp into date
def parseToDateTime( geospecificdata):
  """Return a copy of the frame with a 'Datetime' column, sorted by it.

  The 'day' column is a relative day number. The largest day is mapped to
  yesterday (today minus one day) and smaller days to correspondingly earlier
  dates; the 'timestamp' column ("H:M" text) supplies the time of day.

  Parameters
  ----------
  geospecificdata : pandas.DataFrame
      Frame with 'day' and 'timestamp' columns. It is not modified.

  Returns
  -------
  pandas.DataFrame
      New frame with the extra 'Datetime' column appended as the last column,
      rows ordered by ascending 'Datetime'.
  """
  # Work on a copy: the input is usually a slice of the full dataset, and
  # writing into it would silently change the caller's data.
  frame = geospecificdata.copy()

  currentDate = date.today()
  days = frame['day'].tolist()
  # default=0 keeps an empty frame from raising inside max().
  maxDay = max(days, default=0) + 1

  # Build every timestamp first and assign the column once, rather than
  # growing it cell by cell through iterrows()/.at.
  stamps = [
      datetime.datetime.strptime(
          str(currentDate - timedelta(days=maxDay - day)) + clock,
          "%Y-%m-%d%H:%M")
      for day, clock in zip(days, frame['timestamp'].tolist())
  ]
  frame['Datetime'] = pd.to_datetime(stamps)

  return frame.sort_values(by=['Datetime'])

In [0]:
# Split into train and test sets
def splitTrainTest( data, train_ratio=None, n_inputs=None ):
  """Split a frame chronologically and build sliding-window training samples.

  Parameters
  ----------
  data : pandas.DataFrame
      Rows in the order they should be split; the target series is read
      positionally from column index 3.
  train_ratio : float, optional
      Fraction of rows used for training. Defaults to the module-level
      `trainRatio`.
  n_inputs : int, optional
      Number of consecutive past values per sample. Defaults to the
      module-level `nrofinputs`.

  Returns
  -------
  (X_train, y_train, trainData, testData)
      X_train has shape (samples, n_inputs, 1) when at least one window fits,
      otherwise it is an empty array. y_train holds the value following each
      window. trainData / testData are the leading and trailing row slices.
  """
  # Fall back to the notebook-wide settings only when no override is given.
  ratio = trainRatio if train_ratio is None else train_ratio
  window = nrofinputs if n_inputs is None else n_inputs

  train_size = int(len(data) * ratio)
  test_size = len(data) - train_size
  trainData, testData = data[:train_size], data.tail(test_size)

  # Target series of the training part.
  training_set = trainData.iloc[:, 3].values

  # Each sample is `window` consecutive values; its label is the next value.
  X_train = np.array([training_set[i - window:i]
                      for i in range(window, len(training_set))])
  y_train = np.array([training_set[i]
                      for i in range(window, len(training_set))])

  # Add the trailing feature axis only when there is something to reshape.
  if len(X_train) > 0:
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
  return X_train,y_train,trainData,testData

In [0]:
# Plot demand vs DateTime
def plotData( geospecificdata):
  """Draw demand against Datetime on a new 20x10 figure.

  The figure title is the value in the first row, first column of the frame.
  """
  times = geospecificdata['Datetime']
  demand = geospecificdata['demand']

  plt.figure(figsize=(20, 10))
  plt.plot(times, demand, linestyle='solid', marker='None')

  heading = geospecificdata.iloc[0, 0]
  plt.title(heading)

In [0]:
def validate( regressor, trainData, testData, isLSTM, n_inputs=None ):
    """Recursive multi-step forecast covering the length of `testData`.

    The last `n_inputs` training values seed the input window. After every
    step the oldest value is dropped and the new prediction is appended, so
    later predictions are based on earlier predictions.

    Parameters
    ----------
    regressor : object
        Fitted model exposing `predict`.
    trainData : pandas.DataFrame
        Training rows; the series is read positionally from column index 3.
    testData : pandas.DataFrame
        Only its length is used, to decide how many steps to forecast.
    isLSTM : bool
        When true the window is passed as (1, n_inputs, 1), otherwise as
        (1, n_inputs).
    n_inputs : int, optional
        Window length. Defaults to the module-level `nrofinputs`.

    Returns
    -------
    list
        One entry per step, each being `regressor.predict(...)[0]`.
    """
    window = nrofinputs if n_inputs is None else n_inputs

    # Seed with the most recent training values.
    history = np.array(trainData.iloc[:, 3].values[-window:])

    # At least one prediction is always made, even for an empty testData.
    steps = max(len(testData), 1)

    results = []
    for _ in range(steps):
        features = history.reshape(-1, window)
        if isLSTM:
            # The feature axis must be added after the 2-D reshape; doing it
            # on the flat window fails and the 2-D reshape would undo it.
            features = features.reshape(features.shape[0], features.shape[1], 1)
        predictedValues = regressor.predict(features)
        results.append(predictedValues[0])
        # Slide the window: drop the oldest value, append the prediction.
        history = np.append(history[1:], predictedValues[0])
    return results

In [0]:
def compileAndVisualizeResult( data, predictedValues, isVisualize, name ):
  """Score predictions against the hold-out tail of `data` and store them.

  The hold-out size is len(data) minus int(len(data) * trainRatio). The RMSE
  is printed, and a DataFrame with the actual values, predicted values and
  the RMSE is saved in the module-level `dict_of_results` under `name`.
  When `isVisualize` is truthy both series are plotted against 'Datetime'.
  """
  # Rows that were not part of the training slice.
  holdout = len(data) - int(len(data) * trainRatio)
  actualValues = data['demand'].tail(holdout).values

  rmse = sqrt(mean_squared_error(actualValues, predictedValues))
  print('Test RMSE: %.3f' % rmse)

  # Keep the comparison for later inspection; the scalar RMSE is repeated
  # on every row of the frame.
  summary = pd.DataFrame()
  summary["ActualValues"] = actualValues
  summary["PredictedValues"] = predictedValues
  summary['RMSE'] = rmse
  dict_of_results[name] = summary

  if not isVisualize:
    return

  timeline = data['Datetime'].tail(holdout)
  plt.plot(timeline, actualValues, color='red', label='Actual demand')
  plt.plot(timeline, predictedValues, color='blue', label='Predicted demand')
  plt.title('Actual vs predicted')
  plt.xlabel('Datetime')
  plt.ylabel('Demand')
  plt.legend()
  plt.show()

In [0]:
def buildGradientBoostingRegressor( X_train, y_train ):
    """Fit a gradient boosting regressor on sliding-window samples.

    Parameters
    ----------
    X_train : numpy.ndarray
        Samples along the first axis, e.g. (samples, window, 1).
    y_train : array-like
        One target per sample.

    Returns
    -------
    GradientBoostingRegressor
        The fitted model (200 estimators, depth 3, random_state 23).
    """
    # Flatten each sample using the array's own shape, so the feature count
    # follows the data rather than the global `nrofinputs`.
    features = X_train.reshape(X_train.shape[0], -1)
    clf = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=23)
    clf.fit(features, y_train)
    return clf

In [0]:
def buildRandomForestRegressor( X_train, y_train ):
    """Fit a random forest regressor on sliding-window samples.

    Parameters
    ----------
    X_train : numpy.ndarray
        Samples along the first axis, e.g. (samples, window, 1).
    y_train : array-like
        One target per sample.

    Returns
    -------
    RandomForestRegressor
        The fitted model (200 trees, all cores, random_state 23).
    """
    # Flatten each sample using the array's own shape, so the feature count
    # follows the data rather than the global `nrofinputs`.
    features = X_train.reshape(X_train.shape[0], -1)
    clf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=23)
    clf.fit(features, y_train)
    return clf

In [0]:
def buildLinearRegressor( X_train, y_train):
    """Fit an ordinary least-squares linear regression on sliding-window samples.

    Parameters
    ----------
    X_train : numpy.ndarray
        Samples along the first axis, e.g. (samples, window, 1).
    y_train : array-like
        One target per sample.

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    # LinearRegression comes from the notebook's import cell; no local import
    # is needed here.
    # Flatten each sample using the array's own shape, so the feature count
    # follows the data rather than the global `nrofinputs`.
    features = X_train.reshape(X_train.shape[0], -1)
    model = LinearRegression()
    model.fit(features, y_train)
    return model

In [0]:
def exportModel( model,name, isKeras, isLR, isGBR ):
  """Persist a trained model under MODEL_FILE_PATH.

  Keras models are written with `model.save` to "<name>.h5". Every other
  model is written with `joblib.dump` to "<name>.sav"; `isLR` and `isGBR`
  do not change the file name.
  """
  if isKeras:
    target = MODEL_FILE_PATH + name + ".h5"
    model.save(target)
    return

  # Linear, gradient-boosting and all remaining models share one extension.
  target = MODEL_FILE_PATH + name + ".sav"
  joblib.dump(model, target)

In [0]:
# Get data for testing
def processTestData( data, nrofinputs ):
  """Build the single input window used to start a forecast.

  Parameters
  ----------
  data : pandas.DataFrame
      History rows; the series is read positionally from column index 3.
  nrofinputs : int
      Number of most recent values to take.

  Returns
  -------
  numpy.ndarray
      Array of shape (1, nrofinputs, 1) holding the last `nrofinputs` values.

  Raises
  ------
  ValueError
      If `data` has fewer than `nrofinputs` rows.
  """
  testing_set = data.iloc[:, 3].values
  available = len(testing_set)

  # Without this check the slice start would go negative and wrap around,
  # silently producing a window of the wrong length.
  if nrofinputs > available:
    raise ValueError(
        "processTestData needs %d rows of history but only %d are available"
        % (nrofinputs, available))

  X_test = np.array([testing_set[available - nrofinputs:available]])
  return np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [0]:
def getNrofInputs(data):
    """Return seven times the average (rounded down) number of rows per distinct day.

    `data` must have a 'day' column; the average is len(data) divided by the
    number of unique day values, truncated to an integer.
    """
    distinct_days = data.day.unique()
    rows_per_day = int(len(data) / len(distinct_days))
    return rows_per_day * 7

In [0]:
def predict( testFilepath, modelFolderpath, isVisualizeResult, nroftimesteppredicted=5 ):
    """Forecast the next time steps for every geohash in a CSV file.

    For each geohash the saved model "<modelFolderpath><geohash>.sav" and the
    input size recorded in 'nrofinputs.pkl' are loaded, the most recent
    history window is taken from the file, and a recursive multi-step
    forecast is run: each prediction is fed back as the newest input.

    Geohashes without a recorded input size, without a saved model, or with
    less history than the model needs are reported and skipped.

    Parameters
    ----------
    testFilepath : str
        CSV file to read.
    modelFolderpath : str
        Prefix (including trailing separator) of the saved model files.
    isVisualizeResult : bool
        When truthy, plot the forecast of every geohash.
    nroftimesteppredicted : int, optional
        Number of steps to forecast per geohash (default 5).

    Returns
    -------
    dict
        Maps geohash -> list of predicted values.
    """
    dict_of_test_results = {}

    data = pd.read_csv( testFilepath )

    # Split test file into one dataset per geohash
    dict_of_test_df, alluniqueTestGeohash = splitDataset(data)

    # NOTE: pickle.load can execute arbitrary code; only load the file that
    # Train() wrote.
    with open('nrofinputs.pkl', 'rb') as f:
        dict_of_nr_inputs = pickle.load(f)

    # Loop through all geohashes
    for count, geoHash in enumerate(alluniqueTestGeohash, start=1):
        print("Testing-----------------------------", count, geoHash)

        if geoHash not in dict_of_nr_inputs:
            print(geoHash + " skipped: no input size recorded in nrofinputs.pkl")
            continue
        nrofinputs = dict_of_nr_inputs[geoHash]

        # Get the pre-trained model based on geohash code. Train() exports no
        # model for geohashes with too little data, so a missing file is an
        # expected case rather than a fatal error.
        filename = modelFolderpath + geoHash + ".sav"
        try:
            model = joblib.load(filename)
        except FileNotFoundError:
            print(geoHash + " skipped: no trained model at " + filename)
            continue

        df = parseToDateTime( dict_of_test_df[geoHash] )
        if len(df) < nrofinputs:
            print(geoHash + " skipped: not enough history for the model input")
            continue

        # Most recent `nrofinputs` values, flattened to a 1-D window
        X_test = processTestData( df, nrofinputs )
        window = np.array(X_test).reshape(X_test.shape[1])

        # Perform recursive multi-step forecast:
        # each predicted value is used as input to predict the next value
        results = []
        for _ in range(nroftimesteppredicted):
            predictedValues = model.predict(window.reshape(1, window.shape[0]))
            results.append(predictedValues[0])

            # Discard the oldest input and append the predicted value
            window = np.append(window[1:], predictedValues[0])

        dict_of_test_results[geoHash] = results

        # Plot the results
        if isVisualizeResult:
            # NOTE(review): the x values are the timestamps of the last
            # len(results) history rows, not future timestamps - confirm
            # this is the intended axis.
            plt.figure(figsize=(20,10))
            plt.plot(df.iloc[len(df) - len(results):len(df)]['Datetime'], results, linestyle = 'solid', marker = 'None')
            plt.title( geoHash)
    return dict_of_test_results

In [0]:
def ImportData(filepath):
  """Load the CSV at `filepath` and split it per geohash.

  Returns the pair produced by splitDataset: a dict of per-geohash
  DataFrames and the collection of unique geohashes.
  """
  raw = pd.read_csv(filepath)
  return splitDataset(raw)

In [0]:
def Train( dict_of_df, alluniqueGeohash ):
  """Train, validate and export one linear-regression model per geohash.

  For every geohash the input window length is derived with getNrofInputs
  and recorded in the module-level `dict_of_nrofinputs`. If the geohash has
  enough rows to form at least one training sample, a model is fitted,
  validated on the hold-out tail, scored, and exported. Finally the recorded
  window lengths are written to 'nrofinputs.pkl' for use at prediction time.

  Parameters
  ----------
  dict_of_df : dict
      Maps geohash -> DataFrame of that geohash's rows.
  alluniqueGeohash : iterable
      Geohashes to train, in processing order.
  """
  # splitTrainTest and validate read the window length from the module-level
  # `nrofinputs`. It therefore has to be rebound per geohash; a plain local
  # would leave those helpers on the default and the exported models would
  # not match the sizes saved in nrofinputs.pkl.
  global nrofinputs
  default_nrofinputs = nrofinputs

  try:
    for count, geoHash in enumerate(alluniqueGeohash, start=1):
      print("Linear Regression----------------------------", count, geoHash)

      # Convert day & timestamp into datetime
      df = parseToDateTime( dict_of_df[geoHash] )

      # Calculate number of inputs per geohash
      nrofinputs = getNrofInputs( dict_of_df[geoHash] )
      dict_of_nrofinputs[geoHash] = nrofinputs

      # Visualize the data, can be disabled to speed up the training
      if isVisualizeResult:
        plotData(df)

      # Split the dataset into train - test
      X_train, y_train, trainData, testData = splitTrainTest(df)

      # Geohashes too short to yield a single sample get no model.
      if len(X_train) > 0 and len(y_train) > 0:

        # Build Linear regression model
        dict_of_model_lr[geoHash] = buildLinearRegressor( X_train, y_train)

        # Validate the results
        predictedValues_lr = validate( dict_of_model_lr[geoHash], trainData, testData, isKeras )

        # Compute results in RMSE and visualize the results
        compileAndVisualizeResult( df, predictedValues_lr, isVisualizeResult, geoHash )

        # Export models to local filepath
        exportModel( dict_of_model_lr[geoHash], geoHash, isKeras, isLR, isGBR )
  finally:
    # Leave the notebook-wide default as it was found.
    nrofinputs = default_nrofinputs

  # Export expected number of inputs; `with` closes the file even on error.
  with open("nrofinputs.pkl","wb") as f:
    pickle.dump(dict_of_nrofinputs,f)

In [0]:
# Main execution block

import numpy as np
import pandas as pd
import datetime 
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.externals import joblib
from math import sqrt
from datetime import date, timedelta

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# ---- File locations -------------------------------------------------------
# NOTE(review): the testing path is the same file as the training path.
TRAINING_FILE_PATH = 'C:/Users/PXY/Desktop/training.csv'
TESTING_FILE_PATH = 'C:/Users/PXY/Desktop/training.csv'
MODEL_FILE_PATH = "C:/Users/PXY/Desktop/Models_LR/"

# ---- Training settings ----------------------------------------------------
trainRatio = 0.8   # fraction of each geohash's rows used for training
nrofinputs = 50    # default sliding-window length read by the helpers

# ---- Containers filled while the pipeline runs ----------------------------
dict_of_model_arima = dict()
dict_of_model_lr = dict()
dict_of_model_gbr = dict()
dict_of_model_rfr = dict()
dict_of_model_lstm = dict()
dict_of_results = dict()
dict_of_nrofinputs = dict()
dict_of_predicted_results = dict()

# ---- Model-type and plotting switches -------------------------------------
isKeras = False
isLSTM = False
isLR = True
isGBR = False
isVisualizeResult = False

# ---- Run: load, train, then forecast --------------------------------------
dict_of_df, alluniqueGeohash = ImportData(TRAINING_FILE_PATH)
Train(dict_of_df, alluniqueGeohash)
dict_of_predicted_results = predict(TESTING_FILE_PATH, MODEL_FILE_PATH, isVisualizeResult)

In [0]:
# Run the forecast again on its own, with plotting switched off.
dict_of_predicted_results = predict(
    TESTING_FILE_PATH,
    MODEL_FILE_PATH,
    isVisualizeResult=False,
)