<a href="https://colab.research.google.com/github/nedlecky/CSC485B/blob/main/CSC485_140_PythagorasPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSC 485B Spring 2023: CSC485_140_PythagorasPipeline using MLP
## Using the Pythagoras problem to illustrate pipelines
### Input the length of the two sides, ML computes hypotenuse, perimeter, and area
* SUNY Plattsburgh, Spring 2023
* Dr. Ned Lecky
* nleck001@plattsburgh.edu
* ned@lecky.com

In [1]:
# Create our output directories
from pathlib import Path

OUTPUT_PATH = Path() / "pipeline"
IMAGES_PATH = Path() / OUTPUT_PATH / "images"

In [2]:
# Setup and Support Functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import random

# This makes us reproducible (and we can adjust fixed_seed to get different results)
fixed_seed = 1

# Return n random floats between lo and hi as 1-column NumPy matrix
def rand_nlohi(n=1, lo=0, hi=1):
  # This is just a uniform distribution from lo to hi... we can adjust if appropriate in the future
  return (np.random.rand(n) * (hi - lo) + lo).reshape(-1,1)

# Often a good idea as long as we are keeping values near +/- 1... don't need exponential notation
np.set_printoptions(floatmode='fixed', precision=4, suppress=True)
# This will get us all 400 rows printed... which fails past 40 x 2 columns
np.set_printoptions(threshold=sys.maxsize)

# Simple numpy array print with optional push to file
def nprint(m, name='', also_write_file=False):
  print(f"{name} {m.shape} {m.dtype}")
  print(m)
  if also_write_file and name != '':
    fprint(m, name)

# Print numpy array to file (needs name)
def fprint(m, name='', path=OUTPUT_PATH):
  if name != '':
    with open(path /  name, 'w') as f:
      print(f"{name} {m.shape} {m.dtype}", file=f)
      print(m, file=f)
  else:
    print('fprint needs a name!')

# Remove a file and don't complain if it doesn't exist
def remove_file(name):
  try:
    os.remove(name)
  except:
    return

# Delete a directory, recursively removing files and subdirectories
def delete_directory(path):
  if not OUTPUT_PATH.exists():
    return

  print(f"delete_directory({path})")
  for file_name in os.listdir(path):
    # construct full file path
    file = path / file_name
    if os.path.isdir(file):
        print('  found subdirectory', file)
        delete_directory(file)
    elif os.path.isfile(file):
        print('  deleting file', file)
        os.remove(file)
  os.rmdir(path)

# Save a matplotlib figure to a png file
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Makes default plots a bit cleaner
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

# Compare a Y with a Y_pred
def compare_results(Y, Y_pred):
  print(f"Mean squared error: {mean_squared_error(Y, Y_pred):.2f}")
  print(f"Mean absolute error: {mean_absolute_error(Y, Y_pred):.2f}")
  print(f"Mean absolute percentage error: {mean_absolute_percentage_error(Y, Y_pred):.2f}")

  # Add the Pandas describe()
  df = pd.DataFrame(data = Y_pred - Y)
  print(df.describe())


In [3]:
# Setup (and clear) output directories
delete_directory(OUTPUT_PATH)

OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

delete_directory(pipeline)
  deleting file pipeline/X_train
  deleting file pipeline/Y_test
  deleting file pipeline/Y_test_pred1
  deleting file pipeline/Y_pred1
  deleting file pipeline/X
  deleting file pipeline/X_test
  deleting file pipeline/Y_train_pred1
  deleting file pipeline/XY
  deleting file pipeline/Y_train
  deleting file pipeline/X_raw
  deleting file pipeline/Y:Y_pred2
  deleting file pipeline/Y:Y_pred1
  deleting file pipeline/Y
  found subdirectory pipeline/images
delete_directory(pipeline/images)
  deleting file pipeline/X:X_scaled


# Make Test Data



## X is triangles with side1 side2 spread from 2 to 2000 cm

In [4]:
# This is the full test input data for right triangles
# Reminder: Final input is the length of the two sides, output is length of hypotenuse, perimeter, and area
# x = [side1, side2]
# y = [hypotenuse, perimeter, area]

np.random.seed(fixed_seed)

# Setup what you want to generate
N = 1000
shortest_side = 2
longest_side = 2000
raw_scale = np.array([0.01, 0.01])

# Generate X
side1 = rand_nlohi(N, shortest_side, longest_side)
side2 = rand_nlohi(N, shortest_side, longest_side)

X_raw = np.hstack([side1, side2])
fprint(X_raw,'X_raw')
X = X_raw * raw_scale

fprint(X,'X')


## Y can be computed from X

In [5]:
# Now let's compute the FULL Y expected results
# Reminder: We tell you the length of the two sides, you compute length of hypotenuse, perimeter, and area
# x = [side1, side2]
# y = [hypotenuse, perimeter, area]

hypotenuse = np.sqrt(np.square(X[:,0:1]) + np.square(X[:,1:2]))
perimeter = X[:,0:1] + X[:,1:2] + hypotenuse
area = (X[:,0:1] * X[:,1:2]) / 2.
Y = np.hstack([hypotenuse, perimeter, area])
fprint(Y,'Y')
fprint(np.hstack([X,Y]), 'XY')

# Optional noise in Y
# Not tested yet!
# Just bump all up or down by up to 1%
#Y = Y * (100. + (np.random.rand(Y.shape[0],Y.shape[1])-0.5))/100.


# Split, Scale, Train, Test

## Without Pipeline

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Split
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, test_size=0.3, random_state=1)
fprint(X_train, 'X_train')
fprint(X_test, 'X_test')
fprint(Y_train, 'Y_train')
fprint(Y_test, 'Y_test')

# Scale
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
fprint(np.hstack([X, X_scaled]),'X:X_scaled')

# Train
mlp = MLPRegressor(solver='lbfgs', alpha=1e-5,
          hidden_layer_sizes=(20,20),
          activation='relu',
          max_iter=10000,
          random_state=1,
          verbose=True)
mlp.fit(X_train_scaled, Y_train)

# Test
Y_test_pred1 = mlp.predict(scaler.transform(X_test)).reshape(Y_test.shape[0],-1)
Y_train_pred1 = mlp.predict(scaler.transform(X_train)).reshape(Y_train.shape[0],-1)
Y_pred1 = mlp.predict(scaler.transform(X)).reshape(Y.shape[0],-1)
fprint(Y_test_pred1,'Y_test_pred1')
fprint(Y_train_pred1,'Y_train_pred1')
fprint(Y_pred1,'Y_pred1')

fprint(np.hstack([Y, Y_pred1]),'Y:Y_pred1')

compare_results(Y_test, Y_test_pred1)
compare_results(Y_train, Y_train_pred1)
compare_results(Y, Y_pred1)

Mean squared error: 0.05
Mean absolute error: 0.14
Mean absolute percentage error: 0.03
                0           1           2
count  300.000000  300.000000  300.000000
mean    -0.003450   -0.002841    0.022104
std      0.102420    0.103762    0.338390
min     -0.304946   -0.301717   -1.026736
25%     -0.060992   -0.060091   -0.185651
50%      0.006714    0.006038   -0.005809
75%      0.058255    0.060986    0.204326
max      0.309296    0.312034    0.963792
Mean squared error: 0.03
Mean absolute error: 0.12
Mean absolute percentage error: 0.01
                0           1           2
count  700.000000  700.000000  700.000000
mean    -0.000145   -0.000025    0.000199
std      0.087893    0.088099    0.275008
min     -0.309017   -0.305330   -0.747611
25%     -0.047334   -0.046788   -0.191338
50%      0.007440    0.005454   -0.012277
75%      0.055687    0.057641    0.156301
max      0.234268    0.231949    1.434314
Mean squared error: 0.03
Mean absolute error: 0.12
Mean absolute per

## With Pipeline

In [7]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('MLP', MLPRegressor(solver='lbfgs', alpha=1e-5,
          hidden_layer_sizes=(20,20),
          activation='relu',
          max_iter=10000,
          random_state=1,
          verbose=True))
    ])
pipe.fit(X_train, Y_train)
print(f"pipe.score = {100*pipe.score(X_test, Y_test):.4f}%")
Y_pred2 = pipe.predict(X)
Y_test_pred2 = pipe.predict(X_test)
Y_train_pred2 = pipe.predict(X_train)

fprint(np.hstack([Y, Y_pred2]),'Y:Y_pred2')

compare_results(Y_test, Y_test_pred2)
compare_results(Y_train, Y_train_pred2)
compare_results(Y, Y_pred2)

pipe.score = 99.9909%
Mean squared error: 0.03
Mean absolute error: 0.12
Mean absolute percentage error: 0.02
                0           1           2
count  300.000000  300.000000  300.000000
mean     0.004843    0.005319   -0.006210
std      0.085009    0.088187    0.299799
min     -0.346555   -0.339149   -0.979231
25%     -0.036678   -0.034487   -0.174367
50%      0.004430    0.003833   -0.026518
75%      0.039556    0.046481    0.175358
max      0.325738    0.430920    0.904748
Mean squared error: 0.02
Mean absolute error: 0.09
Mean absolute percentage error: 0.01
                0           1           2
count  700.000000  700.000000  700.000000
mean     0.000051    0.000057   -0.000253
std      0.063795    0.063696    0.226644
min     -0.296048   -0.320930   -0.617571
25%     -0.033548   -0.032011   -0.148816
50%      0.004504    0.001960    0.003413
75%      0.031778    0.034881    0.131749
max      0.217836    0.207927    0.958258
Mean squared error: 0.02
Mean absolute error: 