In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline

import warnings
# Install a global 'ignore' filter: no Python warning is shown after this point.
# NOTE(review): this is a blanket filter, so any warning raised while fitting the
# estimators below is hidden as well — confirm that this is intended.
warnings.filterwarnings('ignore')

############################################## PREPROCESSING ##############################################
from sklearn.preprocessing import MinMaxScaler

################################################# METRICS #################################################
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

################################### MODEL SELECTION & OPTIMIZATION ########################################
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

######################################### MODELS ############################################
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR


In [4]:
# Load the modelling dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# will not run elsewhere as-is — consider a path relative to a configurable data dir.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, presumably a row
# index that was written into the CSV when it was saved — verify.
df= pd.read_csv(r"C:\Users\pedro\OneDrive\Desktop\DSProject Folder\src\Int DFs\df_mod.csv")

In [5]:
# Show the loaded frame as the cell output (pandas truncates it to the first/last rows).
df

Unnamed: 0,Effect of forex changes on cash,SG&A Expenses Growth,Receivables Turnover,5Y Revenue Growth (per Share),3Y Shareholders Equity Growth (per Share),Issuance (buybacks) of shares,eBITperRevenue,Net cash flow / Change in cash,priceSalesRatio,SG&A to Revenue,...,Operating Cash Flow,3Y Net Income Growth (per Share),returnOnCapitalEmployed,Tangible Book Value per Share,Investing Cash flow,Gross Margin,PFCF ratio,5Y Operating CF Growth (per Share),Class,Following Year Price Variation [%]
0,-1.564869e+07,1.7313,6.48354,0.06600,0.7973,1.767840e+06,0.050221,4.463169e+08,0.095727,0.0922,...,5.267456e+08,0.25206,0.00000,4.4939,-6.866936e+08,0.2487,1.3589,0.11996,0,-25.512193
1,0.000000e+00,0.0234,90.79370,0.10380,0.0789,-4.130000e+08,0.027578,1.630000e+08,0.000000,0.1545,...,3.573000e+09,0.18920,0.08590,25.7240,-4.771000e+09,0.2057,14.6302,0.09370,1,33.118297
2,0.000000e+00,-0.0060,27.17690,-0.02900,0.0000,3.321700e+07,0.026436,1.695400e+07,0.000000,0.2570,...,7.020460e+08,0.00000,0.10620,134.7850,-3.649240e+08,0.2869,17.2736,0.11640,1,2.752291
3,-2.920000e+07,-0.0220,12.22500,0.05670,0.0217,-1.637200e+09,0.168072,1.259000e+08,1.553911,0.1940,...,2.541000e+09,0.01770,0.10410,15.4290,-5.618000e+08,0.3557,17.6902,0.08280,1,12.897715
4,-3.760000e+08,0.0161,20.39100,0.09610,0.0000,-3.833000e+09,0.145332,-4.720000e+08,1.299472,0.0874,...,7.739000e+09,-0.00840,0.37520,15.3270,-9.960000e+08,0.2413,19.2150,0.03770,1,13.980937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19713,-2.295000e+03,-0.2602,0.00000,-0.04238,-0.1237,0.000000e+00,0.000000,6.238200e+04,5.319410,0.0000,...,-6.299496e+06,0.00000,-0.01998,2.2390,9.064000e+03,0.0000,0.0000,0.10706,0,-90.962099
19714,-2.500000e+04,-0.0993,3.98570,-0.61450,-0.5207,1.240000e+05,-16.492806,-1.144900e+07,14.827274,9.1205,...,-8.754000e+06,0.00000,-0.95620,1.3050,-2.788000e+06,1.0000,0.0000,0.00000,0,-77.922077
19715,3.354130e+05,0.8987,1.56780,0.11272,0.7411,-3.800000e+05,0.175924,-3.595886e+06,0.793525,0.1279,...,-2.234995e+06,-0.02510,0.12690,5.5440,-7.102750e+05,0.3333,0.0000,0.00190,0,-17.834400
19716,0.000000e+00,0.1457,0.00000,0.29168,-0.0397,1.976147e+06,-8.781190,-1.507882e+06,351.580968,0.0000,...,-1.114753e+07,0.00000,-6.06120,0.0640,-6.189970e+05,0.0000,0.0000,0.00000,0,-73.520000


In [6]:
# Feature matrix: every column except the targets and the saved row index.
#  - 'Following Year Price Variation [%]' is the regression target (y below).
#  - 'Class' is removed so it cannot leak into the features; in the displayed rows
#    it is 1 when the price variation is positive — presumably derived from it.
#  - 'Unnamed: 0' is the row index that was written into the CSV. It is not a
#    financial indicator and must not be used as a predictor. errors='ignore'
#    keeps this cell working if the column is absent (e.g. CSV read with index_col=0).
X = (
    df.drop(columns=['Following Year Price Variation [%]', 'Class'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['Following Year Price Variation [%]']

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
def apply_model_reg(X, y, model, n_splits=5, random_state=42):
    """Cross-validate a regressor inside a MinMaxScaler -> model pipeline.

    For every fold of a shuffled K-fold split, a fresh pipeline is fitted on the
    training part (so the scaler only ever sees training rows) and scored on
    both the training and the validation part.

    Parameters
    ----------
    X, y : objects supporting positional ``.iloc`` indexing
        Features and target (rows are selected with ``.iloc``).
    model : estimator
        Regressor to evaluate. It is cloned for every fold, so the object
        passed in is left untouched and no fitted state is shared between folds.
    n_splits : int, default 5
        Number of K-fold splits.
    random_state : int, default 42
        Seed used to shuffle the rows before splitting.

    Returns
    -------
    dict
        Keys "Train MAE", "Val MAE", "Train MSE", "Val MSE", "Train R2",
        "Val R2"; each value is the string "mean ± std" computed over the folds.
    """
    # Local import: `clone` is the only name used here that the notebook's
    # import cell does not already provide.
    from sklearn.base import clone

    metric_funcs = {
        "MAE": mean_absolute_error,
        "MSE": mean_squared_error,
        "R2": r2_score,
    }
    # Insertion order (Train/Val for each metric) fixes the order of the result keys.
    fold_scores = {
        f"{split} {metric}": []
        for metric in metric_funcs
        for split in ("Train", "Val")
    }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_idx, val_idx in kf.split(X):
        X_t, X_v = X.iloc[train_idx], X.iloc[val_idx]
        y_t, y_v = y.iloc[train_idx], y.iloc[val_idx]

        # A new pipeline per fold: safe=False falls back to a deep copy for
        # objects that are not scikit-learn estimators.
        pipe_reg = Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', clone(model, safe=False)),
        ])
        pipe_reg.fit(X_t, y_t)

        pred_train = pipe_reg.predict(X_t)
        pred_val = pipe_reg.predict(X_v)

        for metric, score in metric_funcs.items():
            fold_scores[f"Train {metric}"].append(score(y_t, pred_train))
            fold_scores[f"Val {metric}"].append(score(y_v, pred_val))

    return {
        label: f"{np.mean(values):.3f} ± {np.std(values):.2f}"
        for label, values in fold_scores.items()
    }

In [12]:
def model_assessment_reg(X, y, **models):
    """Evaluate several regressors and collect their metrics in one table.

    Parameters
    ----------
    X, y :
        Features and target, forwarded unchanged to ``apply_model_reg``.
    **models :
        ``display_name=estimator`` pairs; each estimator is evaluated with
        ``apply_model_reg``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model name (index name "Model"),
        one column per key returned by ``apply_model_reg``. When no model is
        given, an empty frame with that index name is returned.
    """
    names, rows = [], []

    for name, model in models.items():
        names.append(name)
        rows.append(apply_model_reg(X, y, model))  # one metrics dict per model

    # Build the index directly instead of set_index("Model"), which raises
    # KeyError when there are no rows (and therefore no "Model" column).
    return pd.DataFrame(rows, index=pd.Index(names, name="Model"))



In [14]:
# Candidate regressors, keyed by the name shown in the results table.
# The tree ensembles are randomised, so they get a fixed seed to make the
# reported scores repeatable from run to run; the other models keep their defaults.
reg_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "KNN Regressor": KNeighborsRegressor(),
    "SVR": SVR()
}

In [15]:
# Cross-validate every candidate on the training split only; the test split stays untouched.
results_rg = model_assessment_reg (X_train, y_train, **reg_models)
# Last expression -> rich table output (print() wraps the wide frame across several text blocks).
results_rg

                         Train MAE          Val MAE                 Train MSE  \
Model                                                                           
Linear Regression  138.137 ± 14.75  142.365 ± 38.81  4391068.073 ± 1446894.51   
Ridge Regression   126.566 ± 17.30  129.115 ± 33.60  4409865.071 ± 1436265.16   
Lasso Regression   117.104 ± 13.66  119.609 ± 39.65  4403997.618 ± 1448681.62   
Random Forest        39.861 ± 7.50  103.039 ± 43.39    768060.136 ± 298397.44   
Gradient Boosting    44.380 ± 3.43  107.884 ± 54.01      63237.814 ± 27579.90   
KNN Regressor        74.099 ± 9.68   90.500 ± 39.53   3024953.519 ± 925045.63   
SVR                  74.110 ± 9.94   74.155 ± 39.76  4518992.474 ± 1377276.52   

                                    Val MSE       Train R2           Val R2  
Model                                                                        
Linear Regression  5118611.632 ± 6095113.13   0.040 ± 0.05    -1.337 ± 2.02  
Ridge Regression   4883127.546 ± 587