# Data Cleaning

In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version
import warnings;   warnings.filterwarnings("ignore")

# Status tags printed in front of every check result. The escape sequences are
# ANSI colour codes (42 = green background, 41 = red background, 0 = reset).
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

try:
    import importlib
except ImportError:
    # `sys` is not imported by the lines above, so import it here; without it
    # this handler would raise NameError instead of printing the diagnostic.
    import sys
    print(FAIL, "Python version 3.10 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and report whether its version matches ``min_ver``.

    Prints an OK line when the installed version compares equal to ``min_ver``
    (both parsed with ``Version``, so this is an exact match, not "at least"),
    and a FAIL line when the versions differ or the package cannot be imported.

    Parameters
    ----------
    pkg : str
        Name passed to ``importlib.import_module``.
    min_ver : str
        Version string the installed package is compared against.
    fail_msg : str, optional
        Extra text appended to the "not installed" message.

    Returns
    -------
    module or None
        The imported module, or ``None`` when the import raised ImportError.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
        # PIL is read through ``VERSION``; fall back to ``__version__`` when
        # the module does not define that attribute.
        if pkg in {'PIL'} and hasattr(mod, 'VERSION'):
            ver = mod.VERSION
        else:
            ver = mod.__version__
        # Report the package via the ``pkg`` argument so the function does not
        # depend on a variable that happens to exist in the calling scope.
        if Version(ver) == Version(min_ver):
            print(OK, "%s version %s is installed."
                  % (pkg, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (pkg, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
pyversion = Version(python_version())

# Parsed versions are totally ordered, so anything that is not >= 3.10 is
# necessarily < 3.10; a plain `else` covers every remaining case.
if pyversion >= Version("3.10"):
    print(OK, "Python version is %s" % pyversion)
else:
    print(FAIL, "Python version 3.10 is required,"
                " but %s is installed." % pyversion)


print()
# Package name -> exact version string that import_version() checks against.
requirements = {'numpy': "1.22.4", 'matplotlib': "3.5.2", 'sklearn': "1.1.1",
                'pandas': "1.4.2", 'xgboost': "1.5.1", 'shap': "0.40.0"}

# now the dependencies
# NOTE: `lib` stays a module-level name after this loop; check import_version
# before renaming it.
for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.10.5

[42m[ OK ][0m numpy version 1.22.4 is installed.
[42m[ OK ][0m matplotlib version 3.5.2 is installed.
[42m[ OK ][0m sklearn version 1.1.1 is installed.
[42m[ OK ][0m pandas version 1.4.2 is installed.
[42m[ OK ][0m xgboost version 1.5.1 is installed.
[42m[ OK ][0m shap version 0.40.0 is installed.


In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
from itertools import accumulate
import six


# Load the dataset used by every later cell. The path is relative, so it is
# resolved against the notebook's current working directory.
final_dataset = pd.read_csv('../results/Tables/final_dataset.csv')

# Lagging

In [12]:
# Build the 7-lag feature table: every lagged statistic contributes its value
# from 7, 6, ..., 1 rows earlier followed by the unshifted current-row value.

# source columns
Referee = final_dataset['Referee']
Result = final_dataset['Result']
B365H = final_dataset['B365H']
B365D = final_dataset['B365D']
B365A = final_dataset['B365A']
HICT = final_dataset['HICT']
AICT = final_dataset['AICT']
DifICT = final_dataset['DifICT']

# target
y7 = final_dataset['Result']

# column stem -> series, in the order the groups appear in X7
_lag_sources_7 = {
    'B365H': B365H,
    'B365D': B365D,
    'B365A': B365A,
    'HICT': HICT,
    'AICT': AICT,
    'DifICT': DifICT,
}

# shift distances, oldest first; the unshifted series is appended separately so
# that it keeps its original dtype
_lag_steps_7 = (7, 6, 5, 4, 3, 2, 1)

# label suffixes, aligned with _lag_steps_7 plus the unshifted column
_lag_suffixes_7 = ['lag 7 matches', 'lag 6 matches', 'lag 5 matches', 'lag 4 matches',
                   'lag 3 matches', 'lag 2 matches', 'lag 1 match', 'most recent match']

_lag_columns_7 = [
    column
    for series in _lag_sources_7.values()
    for column in [series.shift(step) for step in _lag_steps_7] + [series]
]

X7 = pd.concat([Referee, Result] + _lag_columns_7, axis=1)

X7.columns = ['Referee', 'Result'] + [
    f'{stem} {suffix}'
    for stem in _lag_sources_7
    for suffix in _lag_suffixes_7
]

# For a quick sanity check, inspect X7.tail(10) and y7.tail(10).

# Encoding

In [17]:
# Decide which encoder handles which feature.

# Suffixes shared by every lagged statistic, oldest lag first.
_lag_suffixes_7 = ['lag 7 matches', 'lag 6 matches', 'lag 5 matches', 'lag 4 matches',
                   'lag 3 matches', 'lag 2 matches', 'lag 1 match', 'most recent match']

# categorical -> one-hot
onehot_ftrs_7 = ['Referee']

# HICT / AICT columns -> min-max scaling
minmax_ftrs_7 = [
    f'{stem} {suffix}'
    for stem in ('HICT', 'AICT')
    for suffix in _lag_suffixes_7
]

# betting odds and DifICT columns -> standardisation
std_ftrs_7 = [
    f'{stem} {suffix}'
    for stem in ('B365H', 'B365D', 'B365A', 'DifICT')
    for suffix in _lag_suffixes_7
]

# Bundle the three encoders into one transformer.
_onehot_step_7 = ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftrs_7)
_minmax_step_7 = ('minmax', MinMaxScaler(), minmax_ftrs_7)
_std_step_7 = ('std', StandardScaler(), std_ftrs_7)
preprocessor_7 = ColumnTransformer(transformers=[_onehot_step_7, _minmax_step_7, _std_step_7])

# Drop the first 7 rows: their lag columns are incomplete because shifting by
# up to 7 leaves missing values there.
# 'Result' stays in the frame but is not listed in any transformer above, so
# with ColumnTransformer's default `remainder` it is not forwarded to the model.
X7_lagged = X7.iloc[7:].reset_index(drop=True)
y7_lagged = y7.iloc[7:].reset_index(drop=True)
X7_lagged

Unnamed: 0,Referee,Result,B365H lag 7 matches,B365H lag 6 matches,B365H lag 5 matches,B365H lag 4 matches,B365H lag 3 matches,B365H lag 2 matches,B365H lag 1 match,B365H most recent match,...,AICT lag 1 match,AICT most recent match,DifICT lag 7 matches,DifICT lag 6 matches,DifICT lag 5 matches,DifICT lag 4 matches,DifICT lag 3 matches,DifICT lag 2 matches,DifICT lag 1 match,DifICT most recent match
0,A Marriner,0.0,4.00,1.90,3.10,1.25,3.10,1.53,1.66,9.00,...,7800.0,8400.0,-600.0,300.0,0.0,700.0,-300.0,800.0,200.0,-1000.0
1,M Atkinson,0.0,1.90,3.10,1.25,3.10,1.53,1.66,9.00,3.20,...,8400.0,7900.0,300.0,0.0,700.0,-300.0,800.0,200.0,-1000.0,-300.0
2,A Taylor,2.0,3.10,1.25,3.10,1.53,1.66,9.00,3.20,5.50,...,7900.0,8500.0,0.0,700.0,-300.0,800.0,200.0,-1000.0,-300.0,-300.0
3,M Atkinson,1.0,1.25,3.10,1.53,1.66,9.00,3.20,5.50,2.55,...,8500.0,7300.0,700.0,-300.0,800.0,200.0,-1000.0,-300.0,-300.0,300.0
4,G Scott,2.0,3.10,1.53,1.66,9.00,3.20,5.50,2.55,1.08,...,7300.0,7400.0,-300.0,800.0,200.0,-1000.0,-300.0,-300.0,300.0,1100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,M Oliver,2.0,1.33,1.75,1.40,1.83,1.11,3.60,2.80,2.50,...,8303.1,8698.8,404.8,314.8,441.9,415.0,1194.0,-576.1,-526.5,-375.0
446,P Tierney,2.0,1.75,1.40,1.83,1.11,3.60,2.80,2.50,2.05,...,8698.8,7765.2,314.8,441.9,415.0,1194.0,-576.1,-526.5,-375.0,29.9
447,C Kavanagh,2.0,1.40,1.83,1.11,3.60,2.80,2.50,2.05,1.72,...,7765.2,7793.7,441.9,415.0,1194.0,-576.1,-526.5,-375.0,29.9,382.5
448,D Coote,0.0,1.83,1.11,3.60,2.80,2.50,2.05,1.72,3.60,...,7793.7,8402.0,415.0,1194.0,-576.1,-526.5,-375.0,29.9,382.5,-400.8


# Models

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import statistics as stat
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
from catboost import CatBoostClassifier
import lightgbm as LGB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
import pickle

In [18]:
#ML Pipeline for Lightgbm
def _print_holdout_report(y_true, y_pred, accuracy, best_params):
    """Print one run's parameters and test metrics; return its confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)

    print('---------------------------------------------------------------------------------------------------')

    print('\nBest Parameters\n')
    print(best_params)

    print('\nConfusion Matrix\n')
    print(matrix)

    print('Accuracy: {:.2f}\n'.format(accuracy))

    # The Micro and Macro groups end with a blank line, the Weighted one does not.
    for title, average, tail in (('Micro', 'micro', '\n'),
                                 ('Macro', 'macro', '\n'),
                                 ('Weighted', 'weighted', '')):
        print('{} Precision: {:.2f}'.format(title, precision_score(y_true, y_pred, average=average)))
        print('{} Recall: {:.2f}'.format(title, recall_score(y_true, y_pred, average=average)))
        print('{} F1-score: {:.2f}{}'.format(title, f1_score(y_true, y_pred, average=average), tail))

    print('\nClassification Report\n')
    print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))
    return matrix


def MLpipe_RS_lgb(X, y, model, n_splits, preprocessor, scoring, refit, lags, train_size=250):
    '''Tune a preprocessing + model pipeline and score it on a fixed hold-out.

    The first ``train_size`` rows are used for tuning with a time-series
    cross-validation grid search; all remaining rows form the test set. The
    search is repeated once per random state in a fixed list, and every fitted
    search is evaluated on the same test rows.

    NOTE: a later cell in this notebook defines another function with this same
    name; whichever definition was executed last is the one in effect.

    Parameters:
        X: feature DataFrame, rows in chronological order.
        y: target Series aligned with ``X``.
        model: estimator placed in the pipeline under the step name ``'ml'``
            (the grid uses LightGBM parameter names).
        n_splits: number of splits for ``TimeSeriesSplit``.
        preprocessor: transformer placed in the pipeline before the model.
        scoring: scoring argument forwarded to ``GridSearchCV``.
        refit: refit argument forwarded to ``GridSearchCV``.
        lags: number of lags in the dataset; only reported and returned.
        train_size: number of leading rows used for tuning (default 250).

    Returns:
        (best_models, best_scores, cm, lags) where ``best_models`` holds the
        fitted ``GridSearchCV`` objects, ``best_scores`` the test accuracies,
        and ``cm`` the test confusion matrices, one entry per random state.
    '''
    X_other, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_other, y_test = y.iloc[:train_size], y.iloc[train_size:]
    print(y_test.value_counts())

    tscv = TimeSeriesSplit(n_splits)

    # GridSearchCV clones the estimator it is given, so one pipeline object can
    # be shared by every search below.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('ml', model)])

    best_models = []
    best_scores = []
    cm = []
    random_states = [1, 10, 42, 60, 90]

    for seed in random_states:
        # Only the random state differs between searches.
        param_grid = {
            'ml__n_estimators': [5],
            'ml__num_leaves': [3, 6, 8, 31],
            'ml__max_depth': [40],
            'ml__colsample_bytree': [0.99],
            'ml__min_child_samples': [5, 20, 25, 50],
            'ml__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
            'ml__random_state': [seed]}

        # Passing the splitter itself yields the same folds as tscv.split(X_other)
        # without handing GridSearchCV a one-shot generator.
        gsearch = GridSearchCV(pipe, cv=tscv, param_grid=param_grid, scoring=scoring, refit=refit)
        gsearch.fit(X_other, y_other)
        best_models.append(gsearch)

        y_pred = gsearch.predict(X_test)
        test_score = accuracy_score(y_test, y_pred)
        best_scores.append(test_score)
        cm.append(_print_holdout_report(y_test, y_pred, test_score, gsearch.best_params_))

    # round() works for both plain floats and numpy scalars, unlike .round().
    score = round(stat.mean(best_scores), 3)
    std = round(stat.stdev(best_scores), 3)

    print('---------------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------------')
    # Accuracies are fractions in [0, 1]; the % format scales them to a percentage.
    print(f'Averaged Accuracy score: {score:.2%}, ' f'std: {std:.2%}, ' f'# of Lags: {lags:.0f}')
    print('---------------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------------')

    return best_models, best_scores, cm, lags




In [23]:
#ML Pipeline for Lightgbm
def _print_fold_report(y_true, y_pred, accuracy, best_params):
    """Print one run's parameters and test metrics; return its confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)

    print('---------------------------------------------------------------------------------------------------')

    print('\nBest Parameters\n')
    print(best_params)

    print('\nConfusion Matrix\n')
    print(matrix)

    print('Accuracy: {:.2f}\n'.format(accuracy))

    # The Micro and Macro groups end with a blank line, the Weighted one does not.
    for title, average, tail in (('Micro', 'micro', '\n'),
                                 ('Macro', 'macro', '\n'),
                                 ('Weighted', 'weighted', '')):
        print('{} Precision: {:.2f}'.format(title, precision_score(y_true, y_pred, average=average)))
        print('{} Recall: {:.2f}'.format(title, recall_score(y_true, y_pred, average=average)))
        print('{} F1-score: {:.2f}{}'.format(title, f1_score(y_true, y_pred, average=average), tail))

    print('\nClassification Report\n')
    print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))
    return matrix


def MLpipe_RS_lgb(X, y, model, n_splits, preprocessor, scoring, refit, lags):
    '''Tune and score a preprocessing + model pipeline on each time-series split.

    ``X``/``y`` are split with ``TimeSeriesSplit(n_splits)``. For every outer
    split, a grid search (using the same splitter as inner cross-validation on
    the training rows) is run once per random state in a fixed list, and each
    fitted search is evaluated on that split's test rows.

    NOTE: an earlier cell in this notebook defines a function with this same
    name; executing this cell replaces that definition.

    Parameters:
        X: feature DataFrame, rows in chronological order.
        y: target Series aligned with ``X``.
        model: estimator placed in the pipeline under the step name ``'ml'``
            (the grid uses LightGBM parameter names).
        n_splits: number of splits for ``TimeSeriesSplit`` (outer and inner).
        preprocessor: transformer placed in the pipeline before the model.
        scoring: scoring argument forwarded to ``GridSearchCV``.
        refit: refit argument forwarded to ``GridSearchCV``.
        lags: number of lags in the dataset; only reported and returned.

    Returns:
        (best_models, best_scores, cm, lags) where ``best_models`` holds the
        fitted ``GridSearchCV`` objects, ``best_scores`` the test accuracies,
        and ``cm`` the test confusion matrices, ordered by outer split and
        then by random state.
    '''
    tscv = TimeSeriesSplit(n_splits)

    # GridSearchCV clones the estimator it is given, so one pipeline object can
    # be shared by every search below.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('ml', model)])

    best_models = []
    best_scores = []
    cm = []
    random_states = [1, 10, 42, 60, 90]

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        for seed in random_states:
            # Only the random state differs between searches.
            param_grid = {
                'ml__n_estimators': [5],
                'ml__num_leaves': [3, 6, 8, 31],
                'ml__max_depth': [40],
                'ml__colsample_bytree': [0.99],
                'ml__min_child_samples': [5, 20, 25, 50],
                'ml__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
                'ml__random_state': [seed]}

            # Passing the splitter itself yields the same folds as
            # tscv.split(X_train) without using a one-shot generator.
            gsearch = GridSearchCV(pipe, cv=tscv, param_grid=param_grid, scoring=scoring, refit=refit)
            gsearch.fit(X_train, y_train)
            best_models.append(gsearch)

            y_pred = gsearch.predict(X_test)
            test_score = accuracy_score(y_test, y_pred)
            best_scores.append(test_score)
            cm.append(_print_fold_report(y_test, y_pred, test_score, gsearch.best_params_))

    # round() works for both plain floats and numpy scalars, unlike .round().
    score = round(stat.mean(best_scores), 3)
    std = round(stat.stdev(best_scores), 3)

    print('---------------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------------')
    # Accuracies are fractions in [0, 1]; the % format scales them to a percentage.
    print(f'Averaged Accuracy score: {score:.2%}, ' f'std: {std:.2%}, ' f'# of Lags: {lags:.0f}')
    print('---------------------------------------------------------------------------------------------------')
    print('---------------------------------------------------------------------------------------------------')

    return best_models, best_scores, cm, lags




In [24]:
# LightGBM classifier with default settings; the grid search inside
# MLpipe_RS_lgb sets the hyper-parameters it tunes.
LGGclass = LGB.LGBMClassifier()

# Run the tuning pipeline on the 7-lag dataset with 2 time-series splits,
# selecting and refitting on micro-averaged F1.
lgb7, score_lgb7, cm_lgb7, lags_lgb7 = MLpipe_RS_lgb(
    X=X7_lagged,
    y=y7_lagged,
    model=LGGclass,
    n_splits=2,
    preprocessor=preprocessor_7,
    scoring='f1_micro',
    refit='f1_micro',
    lags=7,
)

---------------------------------------------------------------------------------------------------

Best Parameters

{'ml__colsample_bytree': 0.99, 'ml__max_depth': 40, 'ml__min_child_samples': 20, 'ml__min_child_weight': 1e-05, 'ml__n_estimators': 5, 'ml__num_leaves': 3, 'ml__random_state': 1}

Confusion Matrix

[[34  0 25]
 [10  2 18]
 [ 6  0 55]]
Accuracy: 0.61

Micro Precision: 0.61
Micro Recall: 0.61
Micro F1-score: 0.61

Macro Precision: 0.75
Macro Recall: 0.51
Macro F1-score: 0.48

Weighted Precision: 0.70
Weighted Recall: 0.61
Weighted F1-score: 0.55

Classification Report

              precision    recall  f1-score   support

     Class 0       0.68      0.58      0.62        59
     Class 1       1.00      0.07      0.12        30
     Class 2       0.56      0.90      0.69        61

    accuracy                           0.61       150
   macro avg       0.75      0.51      0.48       150
weighted avg       0.70      0.61      0.55       150

-----------------------------

---------------------------------------------------------------------------------------------------

Best Parameters

{'ml__colsample_bytree': 0.99, 'ml__max_depth': 40, 'ml__min_child_samples': 20, 'ml__min_child_weight': 1e-05, 'ml__n_estimators': 5, 'ml__num_leaves': 3, 'ml__random_state': 90}

Confusion Matrix

[[27  0 12]
 [14  1 25]
 [17  0 54]]
Accuracy: 0.55

Micro Precision: 0.55
Micro Recall: 0.55
Micro F1-score: 0.55

Macro Precision: 0.69
Macro Recall: 0.49
Macro F1-score: 0.42

Weighted Precision: 0.67
Weighted Recall: 0.55
Weighted F1-score: 0.47

Classification Report

              precision    recall  f1-score   support

     Class 0       0.47      0.69      0.56        39
     Class 1       1.00      0.03      0.05        40
     Class 2       0.59      0.76      0.67        71

    accuracy                           0.55       150
   macro avg       0.69      0.49      0.42       150
weighted avg       0.67      0.55      0.47       150

----------------------------

In [25]:
# Test-set accuracy of every fitted grid search, in the order they were run.
print(score_lgb7)

[0.6066666666666667, 0.6066666666666667, 0.6066666666666667, 0.6066666666666667, 0.6066666666666667, 0.5466666666666666, 0.5466666666666666, 0.5466666666666666, 0.5466666666666666, 0.5466666666666666]


## Best Model

In [26]:
# Recreates the best model's prediction using the best parameters found above.
tscv = TimeSeriesSplit(2)
best_modellgb7 = []
best_scorelgb7 = []
cmlgb7 = []
feature_nameslgb7 = []

# Single-candidate grid: GridSearchCV is used only to refit this configuration.
param_grid = {'ml__colsample_bytree': [0.99],
              'ml__max_depth': [40],
              'ml__min_child_samples': [20],
              'ml__min_child_weight': [1e-05],
              'ml__n_estimators': [5],
              'ml__num_leaves': [3],
              'ml__random_state': [60]}

X = X7_lagged
y = y7_lagged

# Only the first time-series split is evaluated, so take it directly instead of
# looping and breaking out before the second one.
train_index, test_index = next(tscv.split(X))

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
pipe = Pipeline(steps=[('preprocessor', preprocessor_7), ('ml', LGGclass)])
gsearch = GridSearchCV(pipe, cv=tscv, param_grid=param_grid, scoring='f1_micro', refit='f1_micro')
gsearch.fit(X_train, y_train)

best_model = gsearch.best_estimator_
best_score = gsearch.best_score_
best_param = gsearch.best_params_
# Every pipeline step except the final estimator, i.e. the preprocessor.
feature_nameslgb7 = best_model[:-1].get_feature_names_out()

y_pred = best_model.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
test_cm = confusion_matrix(y_test, y_pred)
best_scorelgb7.append(test_score)
cmlgb7.append(test_cm)
best_modellgb7.append(best_model)

print('\nConfusion Matrix\n')
print(test_cm)

print('\nAccuracy: {:.2f}\n'.format(test_score))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

print('\nClassification Report\n')
# Same class labels as the reports printed by MLpipe_RS_lgb, so the two
# outputs can be compared row by row.
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))


Confusion Matrix

[[34  0 25]
 [10  2 18]
 [ 6  0 55]]

Accuracy: 0.61

Micro Precision: 0.61
Micro Recall: 0.61
Micro F1-score: 0.61

Macro Precision: 0.75
Macro Recall: 0.51
Macro F1-score: 0.48

Weighted Precision: 0.70
Weighted Recall: 0.61
Weighted F1-score: 0.55

Classification Report

              precision    recall  f1-score   support

     Class 1       0.68      0.58      0.62        59
     Class 2       1.00      0.07      0.12        30
     Class 3       0.56      0.90      0.69        61

    accuracy                           0.61       150
   macro avg       0.75      0.51      0.48       150
weighted avg       0.70      0.61      0.55       150



In [35]:
# best_modellgb7[0] is the refitted pipeline; index 1 selects its second step
# (the 'ml' estimator), so only the classifier is saved, not the preprocessor.
# The `with` block guarantees the file is flushed and closed after writing.
with open('../results/best_model.pkl', 'wb') as model_file:
    pickle.dump(best_modellgb7[0][1], model_file)