In [None]:
#@title
# Upload the following data files to /content before running the notebook:
# synthetic_control_data.csv
# airline_passengers.csv
# concrete_compressive_strength.csv

# 6.6. K-Nearest Neighbors (KNN)

In [None]:
#@title 6.6.1. Import some necessary packages
import numpy as np
import pandas as pd

import sklearn.preprocessing as pg
import keras.utils           as ku 
from sklearn.model_selection import train_test_split

import matplotlib
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsClassifier as knnc
from sklearn.neighbors import KNeighborsRegressor  as knnr

import tensorflow as tf
from tensorflow.keras import layers, models


In [None]:
#@title 6.6.2. Classification example: data preprocessing
data = pd.read_csv('/content/synthetic_control_data.csv')

# every column except the last is an input; the last column is kept 2-D as the target
datain, dataou = data.iloc[:, :-1], data.iloc[:, -1:]

# hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable
datain_tr, datain_te, dataou_tr, dataou_te = train_test_split(
    datain, dataou, test_size = 0.2, random_state = 42)

print("training inputs: \n\n {} \n\n".format(datain_tr))
print("testing  inputs: \n\n {} \n\n".format(datain_te))

# switch from DataFrames to plain NumPy arrays
datain_tr, dataou_tr = datain_tr.values, dataou_tr.values
datain_te, dataou_te = datain_te.values, dataou_te.values

# input calibration: MinMaxScaler rescales each input column (feature) using the
# min/max learned from the training split only; training inputs land in [0, 1],
# testing inputs are mapped with the same training ranges
fun_calibration_in = pg.MinMaxScaler(feature_range=(0,1)).fit(datain_tr)

datain_tr_calibrated = fun_calibration_in.transform(datain_tr)
datain_te_calibrated = fun_calibration_in.transform(datain_te)


In [None]:
#@title 6.6.3. Classification example: train and test the KNN model

# k-nearest-neighbours classifier that votes over the 3 closest training samples
model = knnc(n_neighbors=3)
# scikit-learn expects a 1-D label vector; passing an (n, 1) column makes fit()
# emit a DataConversionWarning, so flatten the labels first
model.fit(datain_tr_calibrated, np.ravel(dataou_tr))

# predicted class labels for both splits (1-D arrays, one label per sample)
dataes_tr = model.predict(datain_tr_calibrated)
dataes_te = model.predict(datain_te_calibrated)

# add a trailing axis so the predictions become (n_samples, 1) columns
dataes_tr = np.expand_dims(dataes_tr, axis = 1)
dataes_te = np.expand_dims(dataes_te, axis = 1)


In [None]:
#@title 6.6.4. Classification example: compute accuracy
def fun_accuracy(dataes, dataou):
  """Return the fraction of estimated labels that equal their reference label.

  Args:
    dataes: estimated labels (array-like); any shape holding one label per sample,
      e.g. (n,) or (n, 1).
    dataou: reference labels (array-like) with the same number of elements.

  Returns:
    The accuracy as a float between 0 and 1.

  Raises:
    ValueError: if the inputs are empty or do not hold the same number of labels.
  """
  # Flatten first: subtracting/comparing an (n,) array with an (n, 1) array would
  # broadcast to an (n, n) matrix and silently produce a meaningless error count.
  dataes = np.ravel(dataes)
  dataou = np.ravel(dataou)

  if dataes.size != dataou.size:
    raise ValueError(
        'dataes and dataou must hold the same number of labels, got {} and {}'.format(
            dataes.size, dataou.size))
  if dataou.size == 0:
    raise ValueError('cannot compute the accuracy of an empty set of labels')

  # "!=" instead of a difference so that non-numeric labels are supported as well
  num_err = np.count_nonzero(dataes != dataou)
  accuracy = 1 - num_err/dataou.size
  return accuracy

# report the share of correctly classified samples, in percent, for each split
accuracy_tr = fun_accuracy(dataes_tr, dataou_tr)
print('training accuracy: {} %'.format(accuracy_tr * 100))

accuracy_te = fun_accuracy(dataes_te, dataou_te)
print('testing  accuracy: {} %'.format(accuracy_te * 100))


In [None]:
#@title 6.6.5. Regression example: data preprocessing
data = pd.read_csv('/content/concrete_compressive_strength.csv')

# inputs: all columns except the last; target: the last column, kept as an (n, 1) array
datain = data.iloc[:, :-1].values
dataou = data.iloc[:, -1:].values

# 90 % / 10 % train/test split with a fixed seed
datain_tr, datain_te, dataou_tr, dataou_te = train_test_split(
    datain, dataou, test_size = 0.1, random_state = 42)

# separate min-max scalers for inputs and target, each fitted on the training split only
scalerin = pg.MinMaxScaler(feature_range=(0,1)).fit(datain_tr)
scalerou = pg.MinMaxScaler(feature_range=(0,1)).fit(dataou_tr)

# apply the training-derived scaling to both splits
datain_tr_calibrated, dataou_tr_calibrated = scalerin.transform(datain_tr), scalerou.transform(dataou_tr)
datain_te_calibrated, dataou_te_calibrated = scalerin.transform(datain_te), scalerou.transform(dataou_te)


In [None]:
#@title 6.6.6. Regression example: train and test the KNN
# k-nearest-neighbours regressor built on the 3 closest training samples
model = knnr(n_neighbors=3)
model.fit(datain_tr_calibrated, dataou_tr_calibrated)

dataes_tr_calibrated = model.predict(datain_tr_calibrated)
dataes_te_calibrated = model.predict(datain_te_calibrated)

# rc settings are only guaranteed to apply to axes created afterwards, so the
# tick label size is configured once, before the first figure is opened
matplotlib.rc('xtick', labelsize = 20)
matplotlib.rc('ytick', labelsize = 20)

def _plot_real_vs_estimated(real, estimated, title):
  """Scatter estimated against real calibrated targets with the y = x reference line."""
  plt.figure(figsize=[5,5])
  plt.plot(real, estimated, '.', markersize = 1)
  # points on this red line are perfect estimations
  plt.plot([0,1],[0,1], '-r', linewidth = 2)
  plt.xlabel('Real', fontsize = 20)
  plt.ylabel('Estimated', fontsize = 20)
  plt.title(title, fontsize = 20)
  plt.legend(['datapoint','bi-sector'], fontsize = 20)

# plot training real vs estimation
_plot_real_vs_estimated(dataou_tr_calibrated, dataes_tr_calibrated, 'Training Results | KNN')

# plot testing real vs estimation
_plot_real_vs_estimated(dataou_te_calibrated, dataes_te_calibrated, 'Testing Results | KNN')

# 6.7. Long Short-Term Memory (LSTM)

In [None]:
#@title 6.7.1. Data preprocessing
# keep only the last CSV column as a 1-D array (plotted below as the number of passengers)
data = pd.read_csv('/content/airline_passengers.csv').iloc[:,-1].values

plt.figure(figsize = [8,8])
plt.plot(data, 'k-')
plt.xlabel('records over time'    , fontsize = 20)
plt.ylabel('number of passengers' , fontsize = 20)

# chronological split: the first 80 % of the records train, the remainder tests
size_tr = int(0.8 * len(data))

# turn both slices into (n, 1) columns, the 2-D layout the scaler is fitted on
data_tr = data[:size_tr, np.newaxis]
data_te = data[size_tr:, np.newaxis]

# min-max scaling learned from the training slice only, then applied to both slices
fun_calibration = pg.MinMaxScaler(feature_range=(0,1)).fit(data_tr)

data_tr_calibrated = fun_calibration.transform(data_tr)
data_te_calibrated = fun_calibration.transform(data_te)

# LSTM data preparation function 
def fun_dataprocessing_lstm(data, lag = 1, size_in = 5, size_ou = 1):
  """Slice a series into sliding (input window, output window) pairs.

  Args:
    data: 2-D array whose rows are consecutive records; only column 0 is read.
    lag: step between the start rows of two consecutive windows.
    size_in: number of consecutive rows in each input window.
    size_ou: number of rows, taken right after the input window, in each output window.

  Returns:
    (datain, dataou): arrays of shape (n_windows, size_in) and (n_windows, size_ou).
    n_windows is 0 when the series is shorter than size_in + size_ou.
  """
  datain = []
  dataou = []

  # "+ 1" keeps the final window, i.e. the one whose output ends on the last row
  # of `data`; without it the most recent sample is always discarded
  num_starts = data.shape[0] - size_in - size_ou + 1

  for i0 in range(0, num_starts, lag):
    i1 = i0 + size_in

    datain.append(data[i0 : i1, 0].tolist())
    dataou.append(data[i1 : i1 + size_ou, 0].tolist())

  # the reshape is a no-op for non-empty results and keeps the result 2-D when no
  # window fits, so downstream code can still rely on the second axis
  datain = np.asarray(datain).reshape(-1, size_in)
  dataou = np.asarray(dataou).reshape(-1, size_ou)

  return datain, dataou

# number of past records fed to the network for each prediction
size_in = 10

# sliding windows: size_in consecutive values as input, the following single value as output
datain_tr_calibrated, dataou_tr_calibrated = fun_dataprocessing_lstm(
    data_tr_calibrated, lag = 1, size_in = size_in, size_ou = 1)
datain_te_calibrated, dataou_te_calibrated = fun_dataprocessing_lstm(
    data_te_calibrated, lag = 1, size_in = size_in, size_ou = 1)

# add a trailing axis so the inputs are laid out as
# [datapoints or samples, time steps, features]; the only feature is the number of passengers
datain_tr_calibrated = np.expand_dims(datain_tr_calibrated, 2)
datain_te_calibrated = np.expand_dims(datain_te_calibrated, 2)

In [None]:
#@title 6.7.2. Create and compile an LSTM model
# LSTM layer -> three shrinking ReLU dense layers, each followed by 10 % dropout
# -> single linear output unit
model = models.Sequential([
    layers.LSTM(units = 8, input_shape=(size_in, 1)),
    layers.Dense(25, activation='relu'),
    layers.Dropout(rate = 0.1),
    layers.Dense(10 , activation='relu'),
    layers.Dropout(rate = 0.1),
    layers.Dense(5 , activation='relu'),
    layers.Dropout(rate = 0.1),
    layers.Dense(1  , activation='linear'),
])

# mean squared error loss, minimised with the Adam optimizer
model.compile(loss = "mean_squared_error", optimizer='adam')

model.summary()

# Some good points from https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/
# The LSTM input layer must be 3D.
# The three input dimensions are, in order: samples, time steps, and features.
# The LSTM input layer is defined by the input_shape argument on the first hidden layer.
# The input_shape argument takes a tuple of two values that define the number of time steps and features.
# The number of samples is assumed to be 1 or more.
# The reshape() function on NumPy arrays can be used to reshape your 1D or 2D data to be 3D.
# The reshape() function takes a tuple as an argument that defines the new shape.


In [None]:
#@title 6.7.3. Train and test the LSTM model
# fit on the training windows; the (n, 1) targets are passed as a flat vector and
# a tenth of the training data is held out for validation
history = model.fit(
    datain_tr_calibrated,
    dataou_tr_calibrated[:,0],
    validation_split = 0.1,
    shuffle = True,
    batch_size = 1,
    epochs = 100,
    verbose = 1,
)

# estimations (still in calibrated units) for both splits
dataes_tr_calibrated = model.predict(datain_tr_calibrated)
dataes_te_calibrated = model.predict(datain_te_calibrated)


In [None]:
#@title 6.7.4. Plots
# rc settings are only guaranteed to apply to axes created afterwards, so the
# tick label size is configured once, before the first figure is opened
matplotlib.rc('xtick', labelsize = 20)
matplotlib.rc('ytick', labelsize = 20)

def _scatter_real_vs_estimated(real, estimated, title, upper):
  """Scatter estimated against real calibrated values with the y = x line drawn from 0 to `upper`."""
  plt.figure(figsize=[5,5])
  plt.plot(real, estimated, '*', markersize = 3)
  # points on this red line are perfect estimations
  plt.plot([0,upper],[0,upper], '-r', linewidth = 2)
  plt.xlabel('Real', fontsize = 20)
  plt.ylabel('Estimated', fontsize = 20)
  plt.title(title, fontsize = 20)
  plt.legend(['datapoint','bi-sector'], fontsize = 20)

# real vs estimation scatter plots; the testing plot uses a longer reference line (up to 1.5)
_scatter_real_vs_estimated(dataou_tr_calibrated[:,0], dataes_tr_calibrated[:,0], 'Training Results | LSTM', 1)
_scatter_real_vs_estimated(dataou_te_calibrated[:,0], dataes_te_calibrated[:,0], 'Testing Results | LSTM', 1.5)

# learning curves recorded by model.fit
plt.figure(figsize=[8,8])
plt.plot(history.history['loss'],'--')
plt.plot(history.history['val_loss'],'-')
plt.title('model loss', fontsize = 20)
plt.ylabel('loss', fontsize = 20)
plt.xlabel('epoch', fontsize = 20)
plt.legend(['training loss','validation loss'], fontsize = 20)
plt.show()

# real series (training targets followed by testing targets) against both estimations;
# the testing estimation is shifted so it starts right after the training targets
num_tr = dataou_tr_calibrated.shape[0]
num_te = dataou_te_calibrated.shape[0]

plt.figure(figsize=[8,8])
plt.plot(np.concatenate((dataou_tr_calibrated, dataou_te_calibrated), axis = 0),'k-')
plt.plot(dataes_tr_calibrated,'b--')
plt.plot(np.arange(num_tr, num_tr + num_te), dataes_te_calibrated,'r:')
# this figure shows the series itself, not the loss (the old title was a copy-paste leftover)
plt.title('real vs estimated passengers', fontsize = 20)
plt.ylabel('calibrated number of passengers', fontsize = 20)
plt.xlabel('records over time', fontsize = 20)
plt.legend(['real', 'training estimation','testing estimation'], fontsize = 20)
plt.show()
