In [12]:
import warnings

import numpy as np
import pandas as pd
import pylab as plot
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import  SVR
from IPython.display import display

from module_utils.scoreRegression import scoreClassif, scoreClassifier
from module_utils.utils import head_shape, describe_plus, highlight_bests_metrics, train_test_df
from module_utils.preprocess import msc, snv, savgol_smoothing, norml, extract_pipeline_preprocess_poly
from module_utils.model_training import train_model, compute_losses_reg, training_models

# Notebook-wide display settings.
# Silence all warnings for the rest of the session (this also hides
# deprecation warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

# Seaborn theme first, then enlarge the legend text and handles on top of it.
sns.set(font_scale=1.2, style="darkgrid", palette="colorblind", color_codes=True)
params = {
    "legend.fontsize": 15,
    "legend.handlelength": 2,
}
plt.rcParams.update(params)

In [13]:
# Load the full dataset (comma-separated, human-readable column names).
df = pd.read_csv(
    '../All Data (human readable column names).csv',
    sep=',',
)

In [14]:
# Drop columns that are useless for modelling (record and device identifiers).
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so on a
# second execution the columns are already gone and a strict drop would raise.
drop_id = ['ID', 'Surface Scan Device ID', 'Juice Scan Device ID']
df = df.drop(columns=drop_id, errors="ignore")

# Target columns: every column whose name contains Brix, Antioxidants or Polyphenols.
target_variables = df.filter(regex="Brix|Antioxidants|Polyphenols", axis=1).columns

# Juice-scan columns, alone and together with the targets.
juice_col = df.filter(regex="Juice", axis=1).columns
juice_target = df.filter(regex="Juice|Antioxidants|Polyphenols|Brix", axis=1).columns

# Surface-scan columns. The 386nm column is relabelled 385nm first; `df` is
# rebound instead of using inplace=True so the step stays a plain assignment.
df = df.rename(columns={"Surface Scan 386nm": "Surface Scan 385nm"})
surface_col = df.columns[df.columns.str.startswith("Surface")]
surface_target = df.filter(regex="Surface|Antioxidants|Polyphenols|Brix", axis=1).columns

# Surface + juice scan columns, alone and together with the targets.
surface_juice_col = surface_col.append(juice_col)
surface_juice_target = surface_juice_col.append(target_variables)

## Preprocessing and Polynomial Features
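The goal of spectral preprocessing is to eliminate or minimize variability in the spectra that is unrelated to the target properties (for example, light-scattering effects), so that the models fit the relevant signal.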
http://www.models.life.ku.dk/sites/default/files/preproNIR.pdf

In [15]:
# init model objects
lin_regression = LinearRegression()
#pls = MultiOutputRegressor(PLSRegression(n_components=5))
svr_rbf = MultiOutputRegressor(SVR(kernel='rbf', C=10, gamma='auto', epsilon=.1, coef0=1))
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=0)
models_list = [lin_regression, svr_rbf, rf_regressor]
# create list of models used for training - for columns name
models_name = ['Linear regression', 'SVR RBF', 'RF regressor']

- #### Columns: Surface Scans
- #### Type: Kale
- #### Targets: Brix, Antioxidants, Polyphenols

### Multiplicative Scatter Correction
https://nirpyresearch.com/two-scatter-correction-techniques-nir-spectroscopy-python/

In [16]:
# Kale rows only, restricted to the surface-scan and target columns.
kale_surface = df.loc[df["Type"] == "kale", surface_target]

# Build features/targets with MSC as the preprocessing step and degree=2.
# NOTE(review): presumably preprocessing then polynomial expansion, judging by
# the helper's name — confirm in module_utils.preprocess.
x, y = extract_pipeline_preprocess_poly(
    df=kale_surface,
    target=target_variables,
    preprocess=msc,
    degree=2,
)

# 70/30 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

In [17]:
# Sanity check: report the shape of each split.
for split_name, split in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split.shape)

x_train (170, 66)
x_test (73, 66)
y_train (170, 3)
y_test (73, 3)


In [18]:
# Metrics table per model for each split.
# NOTE(review): `training_models` is called on the test split as well; if it
# fits the models on whatever data it receives, the "test" table is not a
# held-out evaluation — confirm in module_utils.model_training.
train_df = training_models(
    x=x_train,
    y=y_train,
    models=models_list,
    col_name=models_name,
)
test_df = training_models(
    x=x_test,
    y=y_test,
    models=models_list,
    col_name=models_name,
)
# Display the train metrics table (last expression of the cell).
highlight_bests_metrics(train_df)

Unnamed: 0,Linear regression,SVR RBF,RF regressor
MAE,3264.579402,1902.613903,710.120628
MSE,53772451.717962,17174578.098891,2288785.604326
R2,-0.68869,0.32563,0.874133
RMSE,7332.970184,4144.222255,1512.873294
Score Classifier,0.607843,0.998039,0.935294


In [19]:
# Side-by-side train/test metrics for the random forest.
train_test_df(
    df_1=train_df,
    df_2=test_df,
    column='RF regressor',
)

Unnamed: 0_level_0,Train,Test
RF regressor,Unnamed: 1_level_1,Unnamed: 2_level_1
MAE,710.120628,720.620322
MSE,2288785.604326,2595373.560109
R2,0.874133,0.864623
RMSE,1512.873294,1611.016313
Score Classifier,0.935294,0.894977


### Standard Normal Variate
https://nirpyresearch.com/two-scatter-correction-techniques-nir-spectroscopy-python/

In [20]:
# Kale rows only, restricted to the surface-scan and target columns.
kale_surface = df.loc[df["Type"] == "kale", surface_target]

# Same feature pipeline as the other sections, with SNV as the preprocessing step.
x, y = extract_pipeline_preprocess_poly(
    df=kale_surface,
    target=target_variables,
    preprocess=snv,
    degree=2,
)

In [21]:
# 70/30 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [22]:
# Metrics table per model for each split (keyword arguments, matching the
# MSC section's call style).
# NOTE(review): `training_models` also receives the test split; if it fits on
# its input, the "test" table is not a held-out evaluation — confirm.
train_df = training_models(
    x=x_train,
    y=y_train,
    models=models_list,
    col_name=models_name,
)
test_df = training_models(
    x=x_test,
    y=y_test,
    models=models_list,
    col_name=models_name,
)
# Display the train metrics table (last expression of the cell).
highlight_bests_metrics(train_df)

Unnamed: 0,Linear regression,SVR RBF,RF regressor
MAE,2459.643725,1906.780463,701.531636
MSE,29373094.396983,17200455.760614,2260612.693568
R2,-0.454512,0.019168,0.875334
RMSE,5419.695046,4147.343217,1503.533403
Score Classifier,0.605882,0.596078,0.935294


In [23]:
# Side-by-side train/test metrics for the random forest.
train_test_df(
    df_1=train_df,
    df_2=test_df,
    column='RF regressor',
)

Unnamed: 0_level_0,Train,Test
RF regressor,Unnamed: 1_level_1,Unnamed: 2_level_1
MAE,701.531636,700.688861
MSE,2260612.693568,2505675.054412
R2,0.875334,0.865635
RMSE,1503.533403,1582.932423
Score Classifier,0.935294,0.894977


### Savitzky-Golay
https://nirpyresearch.com/savitzky-golay-smoothing-method/

In [24]:
# Kale rows only, restricted to the surface-scan and target columns.
kale_surface = df.loc[df["Type"] == "kale", surface_target]

# Same feature pipeline as the other sections, with Savitzky-Golay smoothing
# as the preprocessing step.
x, y = extract_pipeline_preprocess_poly(
    df=kale_surface,
    target=target_variables,
    preprocess=savgol_smoothing,
    degree=2,
)

In [25]:
# 70/30 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [26]:
# Metrics table per model for each split (keyword arguments, matching the
# MSC section's call style).
# NOTE(review): `training_models` also receives the test split; if it fits on
# its input, the "test" table is not a held-out evaluation — confirm.
train_df = training_models(
    x=x_train,
    y=y_train,
    models=models_list,
    col_name=models_name,
)
test_df = training_models(
    x=x_test,
    y=y_test,
    models=models_list,
    col_name=models_name,
)
# Display the train metrics table (last expression of the cell).
highlight_bests_metrics(train_df)

Unnamed: 0,Linear regression,SVR RBF,RF regressor
MAE,2383.634858,1902.613903,701.352438
MSE,26889453.213933,17174578.098891,2266199.714985
R2,-0.205744,0.32563,0.87548
RMSE,5185.504143,4144.222255,1505.39022
Score Classifier,0.654902,0.998039,0.947059


In [27]:
# Side-by-side train/test metrics for the SVR model.
# NOTE(review): the MSC, SNV and normalisation sections compare 'RF regressor'
# here; confirm that switching to 'SVR RBF' for this section is intentional.
train_test_df(
    df_1=train_df,
    df_2=test_df,
    column='SVR RBF',
)

Unnamed: 0_level_0,Train,Test
SVR RBF,Unnamed: 1_level_1,Unnamed: 2_level_1
MAE,1902.613903,1951.5006
MSE,17174578.098891,20006936.10989
R2,0.32563,0.284951
RMSE,4144.222255,4472.911368
Score Classifier,0.998039,0.995434


### Spectral normalisation

In [28]:
# Kale rows only, restricted to the surface-scan and target columns.
kale_surface = df.loc[df["Type"] == "kale", surface_target]

# Same feature pipeline as the other sections, with spectral normalisation
# (`norml`) as the preprocessing step.
x, y = extract_pipeline_preprocess_poly(
    df=kale_surface,
    target=target_variables,
    preprocess=norml,
    degree=2,
)

In [29]:
# 70/30 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [30]:
# Metrics table per model for each split (keyword arguments, matching the
# MSC section's call style).
# NOTE(review): `training_models` also receives the test split; if it fits on
# its input, the "test" table is not a held-out evaluation — confirm.
train_df = training_models(
    x=x_train,
    y=y_train,
    models=models_list,
    col_name=models_name,
)
test_df = training_models(
    x=x_test,
    y=y_test,
    models=models_list,
    col_name=models_name,
)
# Display the train metrics table (last expression of the cell).
highlight_bests_metrics(train_df)

Unnamed: 0,Linear regression,SVR RBF,RF regressor
MAE,1968.003098,1909.727434,672.784496
MSE,18320908.514009,17212282.156921,2074370.878853
R2,0.112733,-0.012586,0.879276
RMSE,4280.293041,4148.768752,1440.267641
Score Classifier,0.680392,0.576471,0.935294


In [31]:
# Side-by-side train/test metrics for the random forest.
train_test_df(
    df_1=train_df,
    df_2=test_df,
    column='RF regressor',
)

Unnamed: 0_level_0,Train,Test
RF regressor,Unnamed: 1_level_1,Unnamed: 2_level_1
MAE,672.784496,809.686241
MSE,2074370.878853,3099858.48036
R2,0.879276,0.854657
RMSE,1440.267641,1760.641497
Score Classifier,0.935294,0.863014
