Data description
A total of 72 milk samples were analysed using hyperspectral imaging. The number of pixels considered for each sample ranges from 112 to 300. For each pixel, a spectrum containing 3,424 wavelengths in the range from 400 cm⁻¹ to 7,000 cm⁻¹ was generated. For each sample, the spectra were extracted from the pixels and combined into a single dataset; the spatial information was therefore lost. For each sample, the lactose concentration was measured and expressed in mg/mL. 
Data provided
Training set covariates: an Excel file with 64 different sheets, each one corresponding to a different sample. Each sheet contains a data matrix with n_i rows and p = 3,424 columns, holding the spectra of the n_i pixels considered for the i-th sample. 
Training set response: an excel file containing the information on the lactose content for the samples included in the training set. 
Test set covariates: an Excel file with 8 different sheets, with the same structure as the training set covariates. The lactose content of these samples is not provided. 
Task
Participants should develop prediction models that quantify the lactose content from the spectral information. Each participant should send a CSV file with the predicted lactose content of each sample in the test set. 
Tips
Each sample contains both outlier spectra and noisy spectral regions, which should be removed before developing the prediction model. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import seaborn as sns
#sns.set()
import os, sys
import time

#linear models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.cross_decomposition import PLSRegression

#ensembles
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#from lightgbm import LGBMRegressor
#from xgboost.sklearn import XGBRegressor

#knn
from sklearn.neighbors import KNeighborsRegressor

#neural networks
from sklearn.neural_network import MLPRegressor

#svm: try both linear kernel and rbf kernel
from sklearn.svm import SVR

#deep neural networks aka deep learning
#tbd: initial results by Ashish did not look good

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
#from patsy import dmatrices
from sklearn.utils import shuffle
#from pandas_profiling import ProfileReport

from sklearn import preprocessing
from scipy.interpolate import interp1d

import math
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.feature_selection import SelectKBest, f_regression


# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
#For showing plots directly in the notebook run the command below
%matplotlib inline
# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import plotly.express as px

# Font-size presets for the plots, from smallest to largest.
SMALL_SIZE, MEDIUM_SIZE, BIG_SIZE = 8, 10, 12

from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# Apply one font size (BIG_SIZE) to every text element of the figures by
# updating the matplotlib runtime configuration in a single call.
plt.rcParams.update({
    'font.size': BIG_SIZE,         # default text size
    'axes.titlesize': BIG_SIZE,    # axes title
    'axes.labelsize': BIG_SIZE,    # x and y axis labels
    'xtick.labelsize': BIG_SIZE,   # x tick labels
    'ytick.labelsize': BIG_SIZE,   # y tick labels
    'legend.fontsize': BIG_SIZE,   # legend entries
    'figure.titlesize': BIG_SIZE,  # figure title
})

In [None]:
def load_data_from_excel_multisheets(path_to_input_file, output_3d_npy, verbose=True):
    """Load every sheet of an Excel workbook into a pre-allocated 3-D array.

    Sheet number ``i`` (in workbook order) is read with the default
    ``pd.read_excel`` settings and written into ``output_3d_npy[i]``. A sheet
    with fewer rows than ``output_3d_npy.shape[1]`` is padded at the bottom
    with rows of zeros. Slots beyond the last sheet are left untouched.

    Args:
        path_to_input_file: Path of the workbook; it is opened with the
            ``openpyxl`` engine.
        output_3d_npy: Array of shape ``(max_sheets, max_rows, n_columns)``.
            It is filled in place.
        verbose: When true (the default), print the path, the sheet names and,
            for every sheet, its name, shape, first row and padding size.

    Returns:
        The same ``output_3d_npy`` object that was passed in.

    Raises:
        IndexError: If the workbook has more sheets than
            ``output_3d_npy.shape[0]``.
        ValueError: If a sheet has more rows than ``output_3d_npy.shape[1]``
            or a column count different from ``output_3d_npy.shape[2]``.
    """
    max_sheets, max_rows, n_columns = output_3d_npy.shape

    if verbose:
        print(path_to_input_file)

    # The context manager closes the workbook handle even when a sheet fails
    # to load; the handle would otherwise stay open.
    with pd.ExcelFile(path_to_input_file, engine='openpyxl') as xls:
        sheet_names = xls.sheet_names
        if verbose:
            print(sheet_names)

        # Fail before reading anything rather than part-way through the file.
        if len(sheet_names) > max_sheets:
            raise IndexError(
                f"workbook has {len(sheet_names)} sheets but the output array "
                f"only has room for {max_sheets}"
            )

        for idx, sheet_name in enumerate(sheet_names):
            df_2d = pd.read_excel(xls, sheet_name=sheet_name)
            if verbose:
                print(sheet_name)
                print(df_2d.shape)
                print(df_2d.head(1))

            n_rows, sheet_columns = df_2d.shape
            # Checked explicitly: NumPy would silently broadcast a 1-column
            # sheet across all columns of the output.
            if sheet_columns != n_columns:
                raise ValueError(
                    f"sheet {sheet_name!r} has {sheet_columns} columns, "
                    f"expected {n_columns}"
                )
            if n_rows > max_rows:
                raise ValueError(
                    f"sheet {sheet_name!r} has {n_rows} rows, more than the "
                    f"{max_rows} rows available in the output array"
                )

            n_missing = max_rows - n_rows
            if n_missing and verbose:
                print(n_missing)
                print((n_missing, sheet_columns))

            # Write the data and zero the remaining rows directly in the
            # output, so no temporary padded copy of the sheet is needed.
            output_3d_npy[idx, :n_rows] = df_2d.to_numpy()
            output_3d_npy[idx, n_rows:] = 0

    return output_3d_npy

In [None]:
# Read the training workbook (64 sheets) and then the test workbook (8 sheets)
# into zero-initialised buffers of 300 rows x 3424 columns per sheet.
train_X, test_X = (
    load_data_from_excel_multisheets(workbook, np.zeros((n_sheets, 300, 3424)))
    for workbook, n_sheets in (
        ('train_dataset_samples.xlsx', 64),
        ('test_dataset_samples.xlsx', 8),
    )
)


In [None]:
# Quick look at each loaded array: overall shape, shape of the first sample
# and the contents of the first sample.
for _cube in (train_X, test_X):
    print(_cube.shape)
    print(_cube[0].shape)
    print(_cube[0])

# Save both 3-D arrays to disk as .npy files.
np.save(file="train_X.npy", arr=train_X)
np.save(file="test_X.npy", arr=test_X)

# To load a 3-D array back from disk:
#test_X_tryload = np.load("test_X.npy")

#print(test_X_tryload.shape)
#print(test_X_tryload[7])

In [None]:
# Read the first sheet of the targets workbook
# (presumably the lactose content of the training samples - confirm).
train_y = pd.read_excel(
    'train_targets.xlsx',
    sheet_name=0,
    engine='openpyxl',
)