In [2]:
import pandas as pd
import numpy as np
import re
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression

In [3]:
gen_path = 'C:\\Users\\helga\\OneDrive\\Рабочий стол\\Lab\\2024\\wines\\' # path to the directory containing data and results of future calculations

path_to_data = gen_path + 'data\\wines_ir_cleared_spec.csv' # spectra
Y_data = gen_path + 'data\\wines_ir_cleared_conc.csv' #Y DATA

k_folds = [[42,12],[612,45],[72,172],[871,48],[52,134],[139,15],[287,403],[32,29]]

In [4]:
Y = pd.read_csv(Y_data, sep=',') # import data with ions concentrations

In [35]:
x = pd.read_csv(path_to_data)
k=1

ion_name = ['ethanol_pct','sugar_gL','acids_gL','glycerol_gL','sulfur_dioxide_gL']

for ion in ion_name:
    
    case_name = 'wines_ir_' + ion + '_PLS'
    case_path = gen_path + case_name+'\\'
    
    ION = Y.iloc[:,[0,k]]
    
    for fold in k_folds:
        
        MAE = {}
        R2 = {}

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '/'
        os.makedirs(split_path, exist_ok=True)

        Y_trn, Y_30 = train_test_split(ION, test_size=0.3, random_state=fold[0])
        Y_vld, Y_tst = train_test_split(Y_30, test_size = 0.3333, random_state=fold[1])

        a = ['pattern_number',ion]

        pd.DataFrame(Y_trn).to_csv(split_path + 'Y_trn.csv',sep=',', index=False, header = a)
        pd.DataFrame(Y_vld).to_csv(split_path + 'Y_vld.csv',sep=',', index=False, header = a)
        pd.DataFrame(Y_tst).to_csv(split_path + 'Y_tst.csv',sep=',', index=False, header = a) 

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '\\'

        scaler_x = MinMaxScaler()
        scaler_y = MinMaxScaler()

        y_trn = scaler_y.fit_transform(np.array((Y_trn.sort_values(by=['pattern_number'])).iloc[:,1:]))
        x_trn = scaler_x.fit_transform(np.array(x[x['pattern_number'].isin(Y_trn['pattern_number'])]))

        y_tst = scaler_y.transform(np.array((Y_tst.sort_values(by=['pattern_number'])).iloc[:,1:]))
        x_tst = scaler_x.transform(np.array(x[x['pattern_number'].isin(Y_tst['pattern_number'])]))

        for j in range(1,75):
            pls = PLSRegression(n_components=j)
            pls.fit(x_trn, y_trn)

            outs = pls.predict(x_tst)
            outs = scaler_y.inverse_transform(outs)
            outs[outs<0]=0

            MAE[j]=mean_absolute_error(outs,  scaler_y.inverse_transform(y_tst))
            R2[j]=r2_score(outs,  scaler_y.inverse_transform(y_tst))

        pd.DataFrame([MAE]).to_csv(split_path+'MAE_'+ion+'.csv')
        pd.DataFrame([R2]).to_csv(split_path+'R2_'+ion+'.csv')
    k+=1

In [8]:
x = pd.read_csv(path_to_data)
k=1

ion_name = {'ethanol_pct':31,'sugar_gL':11,'acids_gL':18,'glycerol_gL':19,'sulfur_dioxide_gL':48}

for ion in ion_name:
    
    case_name = 'wines_ir_' + ion + '_PLS'
    case_path = gen_path + case_name+'\\'
    
    ION = Y.iloc[:,[0,k]]
    
    mae = 0
    r2 = 0
    
    for fold in k_folds:

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '/'
        os.makedirs(split_path, exist_ok=True)

        Y_trn, Y_30 = train_test_split(ION, test_size=0.3, random_state=fold[0])
        Y_vld, Y_tst = train_test_split(Y_30, test_size = 0.3333, random_state=fold[1])

        a = ['pattern_number',ion]

        #pd.DataFrame(Y_trn).to_csv(split_path + 'Y_trn.csv',sep=',', index=False, header = a)
        #pd.DataFrame(Y_vld).to_csv(split_path + 'Y_vld.csv',sep=',', index=False, header = a)
        #pd.DataFrame(Y_tst).to_csv(split_path + 'Y_tst.csv',sep=',', index=False, header = a) 

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '\\'

        scaler_x = MinMaxScaler()
        scaler_y = MinMaxScaler()

        y_trn = scaler_y.fit_transform(np.array((Y_trn.sort_values(by=['pattern_number'])).iloc[:,1:]))
        x_trn = scaler_x.fit_transform(np.array(x[x['pattern_number'].isin(Y_trn['pattern_number'])]))

        y_tst = scaler_y.transform(np.array((Y_tst.sort_values(by=['pattern_number'])).iloc[:,1:]))
        x_tst = scaler_x.transform(np.array(x[x['pattern_number'].isin(Y_tst['pattern_number'])]))


        pls = PLSRegression(n_components=ion_name[ion])
        pls.fit(x_trn, y_trn)

        outs = pls.predict(x_trn)
        outs = scaler_y.inverse_transform(outs)
        outs[outs<0]=0
        
        mae += mean_absolute_error(outs,  scaler_y.inverse_transform(y_trn))
        r2 += r2_score(outs,  scaler_y.inverse_transform(y_trn))
        
    print(ion, mae/8, r2/8)
    k+=1

ethanol_pct 0.2245618880816618 0.9934505748070698
sugar_gL 3.901676390443183 0.9918005868161867
acids_gL 0.5617107620478065 0.934041834922675
glycerol_gL 1.3804029317382203 0.8910718450635815
sulfur_dioxide_gL 0.07745827252682774 0.8892155645804863


In [30]:
x = pd.read_csv(path_to_data)
x_wines = pd.read_csv(gen_path + 'data\\true_wine.csv')
wine_names = ['wine11', 'wine12', 'wine13', 'wine14', 'wine15', 'wine16', 'wine17', 'wine18', 'wine19', 'wine20', 'wine21']
k=1

ion_name = {'ethanol_pct':31,'sugar_gL':11,'acids_gL':18,'glycerol_gL':19,'sulfur_dioxide_gL':48}
fin = pd.DataFrame()

k_folds = [[42,12],[612,45],[72,172],[871,48],[52,134],[139,15],[287,403],[32,29]]

for ion in ion_name:
    
    case_name = 'wines_ir_' + ion + '_PLS'
    case_path = gen_path + case_name+'\\'
    
    ION = Y.iloc[:,[0,k]]
    
    predictions = []
    
    for fold in k_folds:
        
        MAE = {}
        R2 = {}

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '/'
        os.makedirs(split_path, exist_ok=True)

        Y_trn, Y_30 = train_test_split(ION, test_size=0.3, random_state=fold[0])

        a = ['pattern_number',ion]

        split_path = case_path+'split_'+ str(fold[0])+'_'+str(fold[1])+ '\\'

        scaler_x = MinMaxScaler()
        scaler_y = MinMaxScaler()

        y_trn = scaler_y.fit_transform(np.array((Y_trn.sort_values(by=['pattern_number'])).iloc[:,1:]))
        x_trn = scaler_x.fit_transform(np.array(x[x['pattern_number'].isin(Y_trn['pattern_number'])]))

        x_wines_scaled = scaler_x.transform(np.array(x_wines))

        pls = PLSRegression(n_components=ion_name[ion])
        pls.fit(x_trn, y_trn)

        outs = pls.predict(x_wines_scaled)
        outs = scaler_y.inverse_transform(outs)
        outs[outs<0]=0
        
        predictions.append(outs)
        
    predictions = np.array(predictions).reshape((11,8))
    target_res = pd.DataFrame(np.std(predictions,axis=1), columns = [ion], index = wine_names)
    fin = pd.concat([fin,target_res], axis=1)
    
    k+=1

In [50]:
wine_names = ['wine11', 'wine12', 'wine13', 'wine14', 'wine15', 'wine16', 'wine17', 'wine18', 'wine19', 'wine20', 'wine21']
aa = pd.DataFrame(predictions/8, columns = [ion], index = wine_names)

In [32]:
fin.to_csv(gen_path+'PLS_real_wine_predictions_std.csv')

In [31]:
fin

Unnamed: 0,ethanol_pct,sugar_gL,acids_gL,glycerol_gL,sulfur_dioxide_gL
wine11,1.912815,16.43433,1.697172,2.881089,0.837239
wine12,1.729346,18.52028,1.405053,2.676624,0.757104
wine13,0.527319,14.198893,1.024438,2.234413,0.283401
wine14,1.619899,18.045598,1.906949,3.381787,0.960198
wine15,1.617321,18.758394,1.823003,3.243251,0.988619
wine16,1.710739,19.007198,1.648549,2.694858,0.673784
wine17,0.541324,12.937097,1.006603,2.440831,0.270136
wine18,1.43585,16.504947,1.639673,2.814018,0.758932
wine19,1.808082,19.784855,1.432399,2.557169,0.91101
wine20,0.959509,17.89975,1.634683,1.819604,0.325077


In [9]:
z = np.array([])


In [25]:
np.std(predictions,axis=1).shape

(8, 1)