# General information:

The model was developed using Python 3.12.4.

The libraries used for model development, together with their versions,
are listed in the `requirements.txt` file.


Variable name suffixes used in this notebook:

- `_t` - training set
- `_v` - validation set
- `_std` - standardized values
- `_pred` - predicted values
- `_loo` - leave-one-out cross-validation

In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

## Load data and split into subsets

In [15]:
# Read the 'dataset' sheet of the workbook into a DataFrame,
# using the sheet's first column as the row index.
data_df = pd.read_excel(
    io='data.xlsx',
    sheet_name='dataset',
    index_col=0,
)

In [17]:
# Partition the rows by the 'Split' flag: 'T' rows form the training set,
# 'V' rows form the validation set.
# Features are every column except the target 'y' and the 'Split' flag itself.
X_t = data_df.loc[data_df['Split'] == 'T'].drop(columns=['y', 'Split'])
X_v = data_df.loc[data_df['Split'] == 'V'].drop(columns=['y', 'Split'])

# Targets are the 'y' column restricted to the same rows.
y_t = data_df.loc[data_df['Split'] == 'T', 'y']
y_v = data_df.loc[data_df['Split'] == 'V', 'y']

## Fit and predict

In [38]:
# Standardize the features. The scaler is fit on the training set only and
# the same fitted scaler is then applied to both subsets.
scaler = StandardScaler().fit(X_t)
X_t_std = pd.DataFrame(
    scaler.transform(X_t),
    index=X_t.index.values,
    columns=X_t.columns.values,
)
X_v_std = pd.DataFrame(
    scaler.transform(X_v),
    index=X_v.index.values,
    columns=X_v.columns.values,
)

# Fit the linear model on the standardized training data.
estimator = LinearRegression().fit(X_t_std, y_t)

# Predictions of the fitted model for both subsets.
y_v_pred = estimator.predict(X_v_std)
y_t_pred = estimator.predict(X_t_std)

# Leave-one-out cross validation: the number of folds equals the number of
# rows, so every fold holds out a single sample.
# NOTE(review): the second call is given only the validation rows, so those
# predictions come from models refit on the validation subset rather than
# from the estimator fitted above - confirm this is intended.
y_t_pred_loo = cross_val_predict(estimator, X=X_t_std, y=y_t, cv=len(X_t_std))
y_v_pred_loo = cross_val_predict(estimator, X=X_v_std, y=y_v, cv=len(X_v_std))

## Generate model statistics and errors

In [42]:
# Training-set figures: coefficient of determination for the fitted model
# and for the leave-one-out predictions, followed by the error measures.
r_2 = metrics.r2_score(y_true=y_t, y_pred=y_t_pred)
r_2_loo = metrics.r2_score(y_true=y_t, y_pred=y_t_pred_loo)
rmse_t = metrics.root_mean_squared_error(y_pred=y_t_pred, y_true=y_t)
mae_t = metrics.mean_absolute_error(y_pred=y_t_pred, y_true=y_t)

# Validation-set figures, computed the same way on the validation targets.
q_2 = metrics.r2_score(y_true=y_v, y_pred=y_v_pred)
q_2_loo = metrics.r2_score(y_true=y_v, y_pred=y_v_pred_loo)
rmse_v = metrics.root_mean_squared_error(y_pred=y_v_pred, y_true=y_v)
mae_v = metrics.mean_absolute_error(y_pred=y_v_pred, y_true=y_v)

In [50]:
# Report the goodness-of-fit statistics, one per line, rounded to 3 decimals.
print(
    '\nModel statistics:',
    f'R_square: \t{round(r_2,3)}',
    f'Q_square: \t{round(q_2,3)}',
    f'R_square LOO: \t{round(r_2_loo,3)}',
    f'Q_square LOO: \t{round(q_2_loo,3)}',
    sep='\n',
)

# Report the error measures for both subsets in the same layout.
print(
    '\nModel errors:',
    f'RMSE_t: \t{round(rmse_t,3)}',
    f'RMSE_v: \t{round(rmse_v,3)}',
    f'MAE_t:  \t{round(mae_t,3)}',
    f'MAE_v:  \t{round(mae_v,3)}',
    sep='\n',
)


Model statistics:
R_square: 	0.924
Q_square: 	0.888
R_square LOO: 	0.882
Q_square LOO: 	0.795

Model errors:
RMSE_t: 	0.583
RMSE_v: 	0.72
MAE_t:  	0.493
MAE_v:  	0.574
