In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

############################################## PREPROCESSING ##############################################
from sklearn.preprocessing import MinMaxScaler

################################################# METRICS #################################################
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve

################################### MODEL SELECTION & OPTIMIZATION ########################################
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

######################################### DECISION TREES PLOTS ############################################
from sklearn.tree import export_graphviz
import graphviz
import pydotplus

In [4]:
# NOTE(review): absolute, machine-specific path. The notebook only runs where this
# exact folder exists; consider a path relative to the project root.
DATA_PATH = r"C:\Users\pedro\OneDrive\Desktop\DSProject Folder\src\Financial Data 14-18\Int DFs\df_mod.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Sanity check of the loaded frame's dimensions as (rows, columns).
df.shape

(19718, 26)

In [6]:
# Preview the first 10 rows to eyeball the columns and their value ranges.
df.head(10)

Unnamed: 0,Effect of forex changes on cash,SG&A Expenses Growth,Receivables Turnover,5Y Revenue Growth (per Share),3Y Shareholders Equity Growth (per Share),Issuance (buybacks) of shares,eBITperRevenue,Net cash flow / Change in cash,priceSalesRatio,SG&A to Revenue,...,Operating Cash Flow,3Y Net Income Growth (per Share),returnOnCapitalEmployed,Tangible Book Value per Share,Investing Cash flow,Gross Margin,PFCF ratio,5Y Operating CF Growth (per Share),Class,Following Year Price Variation [%]
0,-15648690.0,1.7313,6.48354,0.066,0.7973,1767840.0,0.050221,446316900.0,0.095727,0.0922,...,526745600.0,0.25206,0.0,4.4939,-686693600.0,0.2487,1.3589,0.11996,0,-25.512193
1,0.0,0.0234,90.7937,0.1038,0.0789,-413000000.0,0.027578,163000000.0,0.0,0.1545,...,3573000000.0,0.1892,0.0859,25.724,-4771000000.0,0.2057,14.6302,0.0937,1,33.118297
2,0.0,-0.006,27.1769,-0.029,0.0,33217000.0,0.026436,16954000.0,0.0,0.257,...,702046000.0,0.0,0.1062,134.785,-364924000.0,0.2869,17.2736,0.1164,1,2.752291
3,-29200000.0,-0.022,12.225,0.0567,0.0217,-1637200000.0,0.168072,125900000.0,1.553911,0.194,...,2541000000.0,0.0177,0.1041,15.429,-561800000.0,0.3557,17.6902,0.0828,1,12.897715
4,-376000000.0,0.0161,20.391,0.0961,0.0,-3833000000.0,0.145332,-472000000.0,1.299472,0.0874,...,7739000000.0,-0.0084,0.3752,15.327,-996000000.0,0.2413,19.215,0.0377,1,13.980937
5,-223000000.0,-0.0256,6.7449,-0.0494,-0.0607,-1700000000.0,0.094177,-991000000.0,1.644111,0.247,...,3562000000.0,-0.1354,0.0376,13.719,-1642000000.0,0.3679,31.6718,-0.0925,1,23.809818
6,0.0,0.085,205.205,0.0177,-0.0519,-939000000.0,0.349971,146000000.0,3.319691,0.1035,...,4663000000.0,0.1588,0.1948,8.673,177000000.0,0.4143,21.6402,0.0726,1,23.865489
7,37889600.0,-0.0076,11.9336,0.1534,0.09406,31318060.0,0.408287,-757078500.0,2.252915,0.2883,...,7052460000.0,0.05698,0.2515,0.9801,-1790462000.0,0.6635,6.747,0.19388,0,-22.605312
8,694398.0,0.346,0.0,0.4521,0.1217,0.0,0.214357,84849900.0,0.0,0.3379,...,101558400.0,0.2846,0.0,0.888,-17402870.0,0.5172,6.8263,0.2707,1,65.373665
9,-132000000.0,-0.01442,15.78346,0.03482,0.07494,0.0,0.134957,-161000000.0,12.993151,0.1565,...,2140000000.0,0.03054,0.0433,20.10842,-349000000.0,0.3,22.0915,0.04848,1,2.065963


Apply a train/test split so that a portion of the data stays unseen and can be used to test the final model.

In [8]:
# 'Class' is the classification target.
# 'Following Year Price Variation [%]' is also removed from the features
# (presumably the regression target, which would leak the label; confirm).
y = df['Class']
X = df.drop(['Following Year Price Variation [%]', 'Class'], axis=1)

In [9]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Confirm the sizes of the train/test feature matrices and label vectors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15774, 24), (3944, 24), (15774,), (3944,))

In [13]:
def apply_model(X, y, model):
    """Cross-validate `model` inside a MinMax-scaling pipeline and summarise its F1 scores.

    Runs a 5-fold stratified, shuffled cross-validation (seed 99). For every
    fold the pipeline (scaler + model) is fitted on the training part and
    scored with F1 on both the training part and the held-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series
        Class labels aligned with `X`; also selected with `.iloc`.
    model : estimator
        Classifier placed as the final pipeline step (KNN, Decision Tree, MLP, ...).

    Returns
    -------
    tuple
        `(avg_train, std_train, avg_val, std_val)`: means rounded to 3
        decimals, standard deviations rounded to 2 decimals.

    Notes
    -----
    NOTE(review): the pipeline wraps `model` itself rather than a copy, so the
    caller's estimator appears to be left fitted on the last fold's training
    part after this function returns. Confirm before reusing it.
    """
    # Scaling is a pipeline step, so it is re-fitted on each fold's training part only.
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('clf', model)
    ])
    # Stratified folds because this is a classification problem.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=99)

    train_scores, val_scores = [], []
    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        pipeline.fit(X_fit, y_fit)

        # F1 on the data the fold was trained on, then on the held-out data.
        train_scores.append(f1_score(y_fit, pipeline.predict(X_fit)))
        val_scores.append(f1_score(y_holdout, pipeline.predict(X_holdout)))

    return (
        round(np.mean(train_scores), 3),
        round(np.std(train_scores), 2),
        round(np.mean(val_scores), 3),
        round(np.std(val_scores), 2),
    )

In [14]:
def model_assessment(X, y, **models):
    """Cross-validate several models and tabulate their train/validation F1 scores.

    Parameters
    ----------
    X, y :
        Features and labels, forwarded unchanged to `apply_model`.
    **models :
        Mapping of display name -> estimator; each estimator is evaluated
        with `apply_model`.

    Returns
    -------
    pandas.DataFrame
        One row per model name with the string columns "Train" and
        "Validation", each formatted as "mean +/- std".
    """

    def _summarise(estimator):
        # Format one model's cross-validation statistics as display strings.
        mean_tr, sd_tr, mean_va, sd_va = apply_model(X, y, estimator)
        return {
            "Train": f"{mean_tr:.3f} +/- {sd_tr:.2f}",
            "Validation": f"{mean_va:.3f} +/- {sd_va:.2f}"
        }

    summary = {label: _summarise(estimator) for label, estimator in models.items()}
    # Transpose so that models are rows and the two score columns are columns.
    return pd.DataFrame(summary).T

In [16]:
################################################# MODELS ##################################################
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

Create the dictionary of models to evaluate

In [17]:
# Baseline candidates. The keys are the display names used as row labels in the
# results table; seeds are fixed wherever the estimator is given one.
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    ),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        random_state=42,
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=5,
    ),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(50,50),
        max_iter=500,
        random_state=42,
    ),
}

In [18]:
# Cross-validate every baseline model on the training split only; the test split is not touched here.
results_df = model_assessment(X_train, y_train, **models)
print(results_df)


                              Train      Validation
Random Forest        1.000 +/- 0.00  0.679 +/- 0.00
Gradient Boosting    0.729 +/- 0.00  0.691 +/- 0.00
Logistic Regression  0.695 +/- 0.00  0.692 +/- 0.00
KNN                  0.750 +/- 0.00  0.609 +/- 0.00
Neural Network       0.687 +/- 0.02  0.683 +/- 0.01


Random Forest seems to be highly overfitted, with a perfect score in the training phase and a considerable decrease in the validation phase.  
The best models seem to be Gradient Boosting and Logistic Regression: they are not overfitted and have the best validation scores.  

These 2 will move on to Fine Tuning/Model Optimization


In [20]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint

Let's use RandomizedSearchCV, because GridSearchCV tests every possible combination of hyperparameters and is therefore very computationally expensive.

In [22]:
# Untuned base estimators that the randomized searches start from.
lr = LogisticRegression(random_state=42, max_iter=500)
gb = GradientBoostingClassifier(random_state=42)

Gradient Boosting (not sensitive to scaling, since it is tree-based)

In [23]:
# Search space for Gradient Boosting.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist_gb = {
    'n_estimators': randint(50, 300),      # integers 50..299
    'learning_rate': uniform(0.01, 0.2),   # floats in [0.01, 0.21]
    'max_depth': randint(3, 10)            # integers 3..9
}

# Randomized search: 50 sampled configurations, each scored with 5-fold CV on F1.
random_search_gb = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_dist_gb,
    n_iter=50,     # number of random combinations to try
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=42
)

# The search runs on the raw (unscaled) training features: the estimator is the
# bare classifier, not a scaling pipeline.
random_search_gb.fit(X_train, y_train)
print(random_search_gb.best_params_)
print(random_search_gb.best_score_)

{'learning_rate': np.float64(0.011413261043943482), 'max_depth': 3, 'n_estimators': 98}
0.7064103616384327


Logistic Regression

In [26]:
# Logistic Regression is tuned inside a pipeline so that the MinMax scaling is
# re-fitted within each CV training fold.
pipeline_tun = Pipeline([
    ('scaler', MinMaxScaler()),  #Scale the data
    ('classifier', lr)
])

# Keys use the '<step name>__<parameter>' convention to reach the 'classifier' step.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. C in [0.01, 10.01].
param_dist_lr = {
    'classifier__C': uniform(0.01, 10),           # INVERSE regularization strength: smaller C = stronger regularization
    'classifier__penalty': ['l1', 'l2'],          # type of regularization
    'classifier__solver': ['liblinear', 'saga']   # solvers that support l1/l2
}

random_search_lr = RandomizedSearchCV(
    estimator=pipeline_tun,
    param_distributions=param_dist_lr,
    n_iter=20,     # number of random combinations to try
    cv=5, #For each combination of hyperparameters, perform 5-fold cross-validation. Trains on 4 folds and validates on the remaining fold.
    scoring='f1', #RS will try to maximize the F1-score
    n_jobs=-1, #Controls parallel processing, -1 means using all processors, 1= single processor etc
    random_state=42
)

random_search_lr.fit(X_train, y_train)
print(random_search_lr.best_params_)
print(random_search_lr.best_score_)

{'classifier__C': np.float64(0.21584494295802448), 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
0.6960541802339144


In [27]:
# Best Gradient Boosting hyperparameters, copied by hand from the randomized search output.
gb_best_params = {
    'learning_rate': 0.011413261043943482,
    'max_depth': 3,
    'n_estimators': 98,
}
final_model_gb = GradientBoostingClassifier(random_state=42, **gb_best_params)

In [29]:
# Best Logistic Regression hyperparameters, copied by hand from the randomized search output.
lr_best_params = {
    'C': 0.21584494295802448,
    'penalty': 'l2',
    'solver': 'saga',
}
# max_iter was not tuned; it is raised above the default of 100 because
# Logistic Regression often needs more iterations to converge.
final_model_lr = LogisticRegression(max_iter=500, random_state=42, **lr_best_params)

In [30]:
# Tuned finalists; the keys become the row labels of the assessment table.
models_final = {"Gradient Boosting": final_model_gb,
                "Logistic Regression": final_model_lr}

In [31]:
# Re-run the same cross-validated assessment on the tuned models, still using only the training split.
fresults_df= model_assessment(X_train, y_train, **models_final)
print(fresults_df)

                              Train      Validation
Gradient Boosting    0.713 +/- 0.00  0.707 +/- 0.00
Logistic Regression  0.696 +/- 0.00  0.695 +/- 0.00


GB : Very small gap, model generalizes well, there is no overfitting.  

LR : Nearly identical. Excellent stability, just slightly less predictive power than Gradient Boosting.

Save/Export Models

In [32]:
import joblib

In [None]:
# Refit both final models on the FULL training split before exporting them.
#
# Until now these estimators have only been fitted as a side effect of the
# cross-validation in apply_model, which wraps the very same objects in a
# Pipeline and fits them once per fold. Left as-is, the pickles would hold
# models trained on the last fold's training part only, on features scaled by
# a MinMaxScaler fitted on that fold. The exported scaler is fitted on all of
# X_train, so it would not match the exported models.
#
# The scaler here is fitted exactly like the exported one (MinMaxScaler on
# X_train), so saved scaler and saved models stay consistent.
export_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = export_scaler.transform(X_train)

final_model_gb.fit(X_train_scaled, y_train)
final_model_lr.fit(X_train_scaled, y_train)

joblib.dump(final_model_gb, "final_model_gb.pkl")
joblib.dump(final_model_lr, "final_model_lr.pkl")

['final_model_lr.pkl']

In [36]:
# Fit the scaler on the training split only, so no information from the test
# split reaches the scaling parameters, then persist it next to the models.
scaler = MinMaxScaler()
scaler.fit(X_train)
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [35]:
# Persist the untouched hold-out split as a (features, labels) tuple for later evaluation.
test_bundle = (X_test, y_test)
joblib.dump(test_bundle, "test_data.pkl")

['test_data.pkl']