In [492]:
# Split the dataset into one DataFrame per unique geohash
def splitDataset(data, maxGeohash=100):
  """Split ``data`` into one DataFrame per unique ``geohash6`` value.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing a ``geohash6`` column.
  maxGeohash : int or None, optional
      Maximum number of geohashes (in order of first appearance) to split
      out. Defaults to 100, the cap this function has always applied; pass
      ``None`` to split out every geohash.

  Returns
  -------
  dict_of_df : dict
      Maps each selected geohash to the rows of ``data`` that carry it.
  selectedGeohash : array-like
      The geohashes that have an entry in ``dict_of_df``, in order of first
      appearance.
  """
  alluniqueGeohash = data.geohash6.unique()
  # Return exactly the keys that end up in the dictionary. Returning the
  # full unique list while only splitting a capped subset makes callers that
  # loop over the list and index the dictionary fail with a KeyError.
  if maxGeohash is None:
    selectedGeohash = alluniqueGeohash
  else:
    selectedGeohash = alluniqueGeohash[:maxGeohash]

  # Group once instead of re-scanning the whole frame for every geohash.
  grouped = data.groupby('geohash6', sort=False)
  dict_of_df = {}
  for keyName in selectedGeohash:
    dict_of_df[keyName] = grouped.get_group(keyName)
    print( keyName + " dataset created")
  return dict_of_df, selectedGeohash

In [23]:
# Convert day & timestamp into date
def parseToDateTime( geospecificdata):
  """Return a copy of the frame with a ``Datetime`` column, sorted by it.

  The ``day`` column is turned into a calendar date counted back from
  today: the largest ``day`` value maps to yesterday, the one before it to
  the day before yesterday, and so on. The ``timestamp`` column (``H:M``
  text) supplies the time of day.

  Parameters
  ----------
  geospecificdata : pandas.DataFrame
      Frame with a numeric ``day`` column and a ``timestamp`` column of
      strings parseable with the ``%H:%M`` format.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is left untouched) with an added ``Datetime``
      column, ordered by ascending ``Datetime``.
  """
  # Work on a copy so the caller's frame (usually a slice of the full
  # dataset) is not modified behind its back.
  newdf = geospecificdata.copy()
  if len(newdf) == 0:
    # Nothing to convert; max() on an empty column would raise.
    newdf['Datetime'] = pd.Series(index=newdf.index, dtype='datetime64[ns]')
    return newdf

  currentDate = pd.Timestamp(date.today())
  maxDay = newdf['day'].max() + 1
  dayOffset = pd.to_timedelta(maxDay - newdf['day'], unit='D')

  # Parse each distinct "H:M" string once with strptime (same parser as the
  # row-by-row version) and express it as an offset from midnight.
  midnight = datetime.datetime.strptime("0:0", "%H:%M")
  timeOffsets = {
      stamp: datetime.datetime.strptime(stamp, "%H:%M") - midnight
      for stamp in newdf['timestamp'].unique()
  }
  timeOfDay = pd.to_timedelta(newdf['timestamp'].map(timeOffsets))

  newdf['Datetime'] = currentDate - dayOffset + timeOfDay
  return newdf.sort_values(by='Datetime')

In [449]:
# Split into train and test sets
def splitTrainTest( data ):
  """Split ``data`` chronologically and build sliding-window training arrays.

  The first ``trainRatio`` share of the rows becomes the training frame and
  the remaining rows the test frame. Column index 3 of the training frame
  is cut into windows of ``nrofinputs`` consecutive values, each paired
  with the value that follows it. ``trainRatio`` and ``nrofinputs`` are read
  from module scope.

  Returns
  -------
  X_train : numpy.ndarray
      Windows shaped ``(samples, nrofinputs, 1)``; an empty 1-D array when
      the training frame is too short to form a single window.
  y_train : numpy.ndarray
      The value following each window.
  trainData, testData : pandas.DataFrame
      The two row-wise slices of ``data``.
  """
  import numpy as np

  n_train = int(len(data) * trainRatio)
  n_test = len(data) - n_train
  trainData = data[:n_train]
  testData = data.tail(n_test)

  # Values of the fourth column of the training rows
  series = trainData.iloc[:, 3].values

  # One window per position that has a full history behind it
  window_ends = range(nrofinputs, len(series))
  X_train = np.array([series[end - nrofinputs:end] for end in window_ends])
  y_train = np.array([series[end] for end in window_ends])

  if len(X_train) > 0:
    n_samples, n_steps = X_train.shape[0], X_train.shape[1]
    X_train = np.reshape(X_train, (n_samples, n_steps, 1))
  return X_train,y_train,trainData,testData

In [447]:
# Plot demand vs DateTime
def plotData( geospecificdata):
  """Draw the ``demand`` column against ``Datetime`` on a new 20x10 figure.

  The figure title is the value in the first row, first column of the frame.
  """
  plt.figure(figsize=(20,10))
  x_values = geospecificdata['Datetime']
  y_values = geospecificdata['demand']
  plt.plot(x_values, y_values, linestyle = 'solid', marker = 'None')
  plt.title( geospecificdata.iloc[0,0])

In [446]:
def validate( regressor, trainData, testData, isLSTM, windowSize=None ):
    """Recursively forecast one value per row of ``testData``.

    The forecast is seeded with the last ``windowSize`` values of column
    index 3 of ``trainData``. After every step the oldest value is dropped
    from the window and the new prediction is appended, so later steps are
    fed earlier predictions rather than actual observations.

    Parameters
    ----------
    regressor : object
        Fitted model exposing ``predict``.
    trainData : pandas.DataFrame
        Training rows; column index 3 supplies the seed window.
    testData : sized
        Only its length is used: one prediction is produced per row.
    isLSTM : bool
        When true the model input is shaped ``(1, windowSize, 1)``;
        otherwise ``(1, windowSize)``.
    windowSize : int, optional
        Number of lagged values fed to the model. Defaults to the
        module-level ``nrofinputs``.

    Returns
    -------
    list
        ``len(testData)`` entries, each the first element of the model's
        prediction for that step.

    Raises
    ------
    ValueError
        If ``windowSize`` is not positive or ``trainData`` holds fewer than
        ``windowSize`` rows.
    """
    if windowSize is None:
        windowSize = nrofinputs

    history = trainData.iloc[:, 3].values
    if windowSize < 1 or len(history) < windowSize:
        raise ValueError(
            "validate needs at least windowSize=%d training values, got %d"
            % (windowSize, len(history)))

    # Rolling 1-D window of the most recent values
    window = np.array(history[-windowSize:])

    results = []
    for _ in range(len(testData)):
        # Shape the flat window for the model. Reshaping from the 1-D window
        # on every step keeps the LSTM layout valid for all steps, not just
        # the first one.
        if isLSTM:
            X_test = window.reshape(1, windowSize, 1)
        else:
            X_test = window.reshape(1, windowSize)
        predictedValues = regressor.predict(X_test)
        results.append( predictedValues[0])
        # Slide the window: drop the oldest value, append the prediction
        window = np.append(window[1:], predictedValues[0])
    return results

In [445]:
def compileAndVisualizeResult( data, predictedValues, isVisualize, name ):
  """Score predictions against the held-out tail of ``data`` and record them.

  The last ``len(data) - int(len(data) * trainRatio)`` values of the
  ``demand`` column are taken as the actual values. The RMSE between them
  and ``predictedValues`` is printed, and a frame holding actual values,
  predictions and the RMSE is stored in the module-level ``dict_of_results``
  under ``name``. When ``isVisualize`` is true, both series are plotted
  against the matching ``Datetime`` values.
  """
  # Size of the held-out tail
  n_train = int(len(data) * trainRatio )
  n_test = len(data) - n_train
  actualValues = data['demand'].tail(n_test).values

  # Root mean squared error
  rmse = sqrt(mean_squared_error(actualValues, predictedValues ))
  print('Test RMSE: %.3f' % rmse)

  # Keep the per-geohash outcome for later inspection
  result = pd.DataFrame({
      "ActualValues": actualValues,
      "PredictedValues": predictedValues,
  })
  result['RMSE'] = rmse
  dict_of_results[name] = result

  if not isVisualize:
      return

  # Actual vs. predicted demand over the test period
  plt.plot(data['Datetime'].tail(n_test), actualValues, color = 'red', label = 'Actual demand')
  plt.plot(data['Datetime'].tail(n_test), predictedValues, color = 'blue', label = 'Predicted demand')
  plt.title('Actual vs predicted')
  plt.xlabel('Datetime')
  plt.ylabel('Demand')
  plt.legend()
  plt.show()

In [440]:
def buildGradientBoostingRegressor( X_train, y_train ):
    """Fit and return a GradientBoostingRegressor on the windowed training data.

    ``X_train`` is flattened to rows of ``nrofinputs`` (module-level) values
    before fitting. The model uses 200 estimators, depth 3 and a fixed
    random state of 23.
    """
    features = X_train.reshape(-1,nrofinputs)
    model = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=23)
    model.fit(features, y_train)
    return model

In [441]:
def buildRandomForestRegressor( X_train, y_train ):
    """Fit and return a RandomForestRegressor on the windowed training data.

    ``X_train`` is flattened to rows of ``nrofinputs`` (module-level) values
    before fitting. The forest has 200 trees, a fixed random state of 23 and
    is built with ``n_jobs=-1``.
    """
    features = X_train.reshape(-1,nrofinputs)
    model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=23)
    model.fit(features, y_train)
    return model

In [442]:
def buildLinearRegressor( X_train, y_train):
    """Fit and return a LinearRegression model on the windowed training data.

    ``X_train`` is flattened to rows of ``nrofinputs`` (module-level) values
    before fitting.
    """
    from sklearn.linear_model import LinearRegression

    features = X_train.reshape(-1,nrofinputs)
    regressor = LinearRegression()
    regressor.fit(features, y_train)
    return regressor

In [443]:
def exportModel( model,name, isKeras, isLR, isGBR ):
  """Persist ``model`` to disk under a path derived from ``name``.

  Keras models are written with ``model.save`` as ``<name>.h5``. All other
  models are dumped with joblib as ``<name>.sav`` into a folder picked by
  the flags: the linear-regression folder when ``isLR`` is true, else the
  gradient-boosting folder when ``isGBR`` is true, else the random-forest
  folder.
  """
  if isKeras:
    model.save( "gdrive/My Drive/Machine Learning Playground/Models/" + name + ".h5" )
    return

  # Pick the destination folder for joblib-serialised models
  if isLR:
    folder = "C:/Users/PXY/Desktop/Models_LR/"
  elif isGBR:
    folder = "gdrive/My Drive/Machine Learning Playground/Models_GBR/"
  else:
    folder = "gdrive/My Drive/Machine Learning Playground/Models_RFR/"
  joblib.dump( model, folder + name + ".sav" )

In [497]:
# Get data for testing
def processTestData( data, nrofinputs ):
  """Return the last ``nrofinputs`` values of column index 3 as model input.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame whose column at position 3 holds the series to forecast from.
  nrofinputs : int
      Number of trailing values to take.

  Returns
  -------
  numpy.ndarray
      Array shaped ``(1, nrofinputs, 1)``.

  Raises
  ------
  ValueError
      If ``data`` has fewer than ``nrofinputs`` rows. Without this check the
      slice start goes negative and wraps around, silently producing a
      window of the wrong length.
  """
  testing_set = data.iloc[:, 3].values
  available = len(testing_set)
  if nrofinputs > available:
    raise ValueError(
        "processTestData needs %d values but only %d are available"
        % (nrofinputs, available))

  # Single sample made of the trailing window
  X_test = np.array([testing_set[available - nrofinputs:available]])
  return np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [467]:
def getNrofInputs(data):
    """Return seven times the average number of rows per distinct ``day``.

    The average is the row count divided by the number of unique values in
    the ``day`` column, truncated to an integer.
    """
    distinctDays = data.day.unique()
    rowsPerDay = int( len(data) / len(distinctDays) )
    return rowsPerDay * 7

In [504]:
def predict( testFilepath, modelFolderpath, nroftimesteppredicted=5 ):
    """Forecast the next time steps of demand for every geohash in a CSV file.

    For each geohash split out of the file, the pre-trained model
    ``<modelFolderpath><geohash>.sav`` is loaded and used for a recursive
    multi-step forecast: each predicted value is appended to the input
    window and used to predict the following step.

    Parameters
    ----------
    testFilepath : str
        Path of the CSV file to forecast from.
    modelFolderpath : str
        Folder prefix (including the trailing separator) holding the
        ``.sav`` model files.
    nroftimesteppredicted : int, optional
        Number of steps to forecast per geohash. Defaults to 5.

    Returns
    -------
    dict
        Maps each geohash that has a stored model to its list of predicted
        values. Geohashes without a recorded window length or without a
        model file are reported and skipped.
    """
    data = pd.read_csv( testFilepath )

    # Split the file into one dataset per geohash. Iterate over the
    # dictionary itself so only geohashes that were actually split out are
    # visited.
    dict_of_test_df, _ = splitDataset(data)

    # Window length used at training time, per geohash.
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # nrofinputs.pkl that was produced by this notebook.
    with open('nrofinputs.pkl', 'rb') as f:
        dict_of_nr_inputs = pickle.load(f)

    dict_of_test_results = {}
    for count, (geoHash, geoData) in enumerate(dict_of_test_df.items(), start=1):
        print("Testing-----------------------------", count, geoHash)

        if geoHash not in dict_of_nr_inputs:
            print("Skipping", geoHash, "- no window length recorded during training")
            continue
        nrofinputs = dict_of_nr_inputs[geoHash]

        # Get the pre-trained model based on geohash code. Training does not
        # export a model for geohashes with too little history, so a missing
        # file is expected for those.
        filename = modelFolderpath + geoHash + ".sav"
        try:
            model = joblib.load(filename)
        except FileNotFoundError:
            print("Skipping", geoHash, "- no trained model at", filename)
            continue

        df = parseToDateTime( geoData )

        # Most recent nrofinputs observations, flattened to a 1-D window
        X_test = processTestData( df, nrofinputs )
        window = np.array(X_test).reshape(X_test.shape[1])

        # Perform recursive multi-step forecast:
        # each predicted value is fed back as input for the next step
        results = []
        for _ in range( nroftimesteppredicted ):
            inputRow = window.reshape(1, len(window))
            predictedValues = model.predict(inputRow )
            results.append(predictedValues[0])
            window = np.append( window[1:], predictedValues[0] )

        dict_of_test_results[geoHash] = results
    return dict_of_test_results

In [32]:
# Import data
import numpy as np
import pandas as pd
import datetime 
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the bundled
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from math import sqrt
from datetime import date, timedelta

# Silence pandas' SettingWithCopyWarning for the per-geohash slices
pd.options.mode.chained_assignment = None

#Mount google drive
#from google.colab import drive 
#drive.mount('/content/gdrive', force_remount=True)

#Read the file from the local disk (switch to the gdrive path when on Colab)
data = pd.read_csv( 'C:/Users/PXY/Desktop/training.csv' )
#data = pd.read_csv( 'gdrive/My Drive/Machine Learning Playground/training.csv' )

# Split into multiple datasets based on Hash
dict_of_model = {}
dict_of_df, alluniqueGeohash = splitDataset(data)

qp03wc dataset created
qp03pn dataset created
qp09sw dataset created
qp0991 dataset created
qp090q dataset created
qp03tu dataset created
qp096d dataset created
qp03nr dataset created
qp093r dataset created
qp03r2 dataset created
qp09kb dataset created
qp03rp dataset created
qp03w9 dataset created
qp096m dataset created
qp091u dataset created
qp03md dataset created
qp099z dataset created
qp0990 dataset created
qp03mf dataset created
qp09fu dataset created
qp091w dataset created
qp0dhw dataset created
qp09fh dataset created
qp0961 dataset created
qp03zr dataset created
qp09de dataset created
qp09g8 dataset created
qp0d0k dataset created
qp09m0 dataset created
qp03yx dataset created
qp09u8 dataset created
qp09df dataset created
qp0djh dataset created
qp03rs dataset created
qp03q7 dataset created
qp03wp dataset created
qp09u3 dataset created
qp094k dataset created
qp09bd dataset created
qp03wm dataset created
qp0d0j dataset created
qp093d dataset created
qp09kz dataset created
qp06nc data

qp099m dataset created
qp03x5 dataset created
qp03q3 dataset created
qp09en dataset created
qp0953 dataset created
qp099v dataset created
qp08cy dataset created
qp0d4f dataset created
qp099h dataset created
qp09gs dataset created
qp03rd dataset created
qp03qs dataset created
qp0997 dataset created
qp03zz dataset created
qp0948 dataset created
qp09cj dataset created
qp03tf dataset created
qp06nf dataset created
qp09jv dataset created
qp09md dataset created
qp0909 dataset created
qp09er dataset created
qp03ws dataset created
qp09dn dataset created
qp09vy dataset created
qp09b3 dataset created
qp097h dataset created
qp093g dataset created
qp0958 dataset created
qp0dh8 dataset created
qp03xz dataset created
qp0djm dataset created
qp094h dataset created
qp092w dataset created
qp03re dataset created
qp06pu dataset created
qp09ft dataset created
qp0d0g dataset created
qp095k dataset created
qp0946 dataset created
qp03mq dataset created
qp03zs dataset created
qp09k2 dataset created
qp09ed data

qp03zv dataset created
qp095y dataset created
qp0dhh dataset created
qp03tt dataset created
qp094g dataset created
qp06p0 dataset created
qp098m dataset created
qp097s dataset created
qp0d5e dataset created
qp099w dataset created
qp0d4s dataset created
qp09es dataset created
qp03xj dataset created
qp09t4 dataset created
qp09v2 dataset created
qp09uf dataset created
qp03qn dataset created
qp03yu dataset created
qp09ku dataset created
qp09hh dataset created
qp0d4d dataset created
qp09gr dataset created
qp0d06 dataset created
qp08gp dataset created
qp095e dataset created
qp03wk dataset created
qp09th dataset created
qp03z2 dataset created
qp0dj8 dataset created
qp03wf dataset created
qp099d dataset created
qp0947 dataset created
qp0d1c dataset created
qp0983 dataset created
qp09gt dataset created
qp09k9 dataset created
qp0968 dataset created
qp0d1m dataset created
qp095t dataset created
qp09eq dataset created
qp06nu dataset created
qp09fw dataset created
qp08cp dataset created
qp09e8 data

qp0dj3 dataset created
qp0d55 dataset created
qp09cz dataset created
qp09v4 dataset created
qp0d14 dataset created
qp091r dataset created
qp08cu dataset created
qp08bv dataset created
qp09tt dataset created
qp03zq dataset created
qp02zg dataset created
qp0d1b dataset created
qp09h4 dataset created
qp09hz dataset created
qp08be dataset created
qp0908 dataset created
qp02zu dataset created
qp03p7 dataset created
qp0d0y dataset created
qp095g dataset created
qp0d50 dataset created
qp03tq dataset created
qp06ng dataset created
qp0dje dataset created
qp09sv dataset created
qp09kr dataset created
qp09cb dataset created
qp08gv dataset created
qp0938 dataset created
qp09me dataset created
qp09y1 dataset created
qp03mx dataset created
qp06nt dataset created
qp0djb dataset created
qp09jx dataset created
qp09gm dataset created
qp03yz dataset created
qp090y dataset created
qp09c2 dataset created
qp06p8 dataset created
qp0dhm dataset created
qp03p0 dataset created
qp08bu dataset created
qp0d1q data

In [498]:
# GlobalParameter
# nrofinputs is re-bound for every geohash in the loop below; the helper
# functions (splitTrainTest, validate, build*Regressor) read it from module scope.
nrofinputs = 50
trainRatio = 0.8

dict_of_model_arima = {}
dict_of_model_lr = {}
dict_of_model_gbr = {}
dict_of_model_rfr = {}
dict_of_model_lstm = {}
dict_of_results = {}
dict_of_nrofinputs = {}

isKeras = False
isLSTM = False
isLR = True
isGBR = False
isVisualizeResult = False

# LinearRegressor
# Iterate over the geohashes that actually have a dataset in dict_of_df;
# splitDataset may have split out fewer geohashes than the file contains.
for count, geoHash in enumerate(dict_of_df, start=1):
    print("Linear Regression----------------------------", count, geoHash)
    df = parseToDateTime( dict_of_df[geoHash] )
    nrofinputs = getNrofInputs(dict_of_df[geoHash] )
    dict_of_nrofinputs[geoHash] = nrofinputs
    if isVisualizeResult:
        plotData(df)
    X_train, y_train, trainData, testData = splitTrainTest(df)
    # Geohashes with too little history yield no training windows and are
    # skipped: no model is fitted or exported for them.
    if len(X_train) > 0 and len(y_train) > 0:
        dict_of_model_lr[geoHash] = buildLinearRegressor( X_train, y_train)
        # validate's fourth argument is the LSTM flag, so pass isLSTM
        predictedValues_lr = validate( dict_of_model_lr[geoHash], trainData, testData, isLSTM )
        compileAndVisualizeResult( df, predictedValues_lr, isVisualizeResult, geoHash )
        exportModel( dict_of_model_lr[geoHash], geoHash, isKeras, isLR, isGBR )

# Export expected number of inputs; the context manager closes the file
# even if pickling fails.
with open("nrofinputs.pkl","wb") as f:
    pickle.dump(dict_of_nrofinputs,f)

Linear Regression---------------------------- 1 qp03wc


NameError: name 'dict_of_nrofinput' is not defined

In [505]:
# Forecast every geohash in the CSV with the exported linear-regression models
predict(
    testFilepath="C:/Users/PXY/Desktop/training.csv",
    modelFolderpath="C:/Users/PXY/Desktop/Models_LR/",
)

qp03wc dataset created
qp03pn dataset created
qp09sw dataset created
qp0991 dataset created
qp090q dataset created
qp03tu dataset created
qp096d dataset created
qp03nr dataset created
qp093r dataset created
qp03r2 dataset created
qp09kb dataset created
qp03rp dataset created
qp03w9 dataset created
qp096m dataset created
qp091u dataset created
qp03md dataset created
qp099z dataset created
qp0990 dataset created
qp03mf dataset created
qp09fu dataset created
qp091w dataset created
qp0dhw dataset created
qp09fh dataset created
qp0961 dataset created
qp03zr dataset created
qp09de dataset created
qp09g8 dataset created
qp0d0k dataset created
qp09m0 dataset created
qp03yx dataset created
qp09u8 dataset created
qp09df dataset created
qp0djh dataset created
qp03rs dataset created
qp03q7 dataset created
qp03wp dataset created
qp09u3 dataset created
qp094k dataset created
qp09bd dataset created
qp03wm dataset created
qp0d0j dataset created
qp093d dataset created
qp09kz dataset created
qp06nc data

KeyboardInterrupt: 