In [1]:
import os
import numpy as np 
import pandas as pd 
import joblib

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

In [2]:
from src.components.data.api.game_functions import get_game_list
from src.components.data.api.player_functions import get_player_details, get_player_info, get_player_hist, get_player_id, get_player_name
from src.components.data.api.round_functions import get_round_info

from src.components.data.fetch_data import fetch_data

In [3]:
fetch_data(get_game_list, id_list = ["id","team_h","team_a","season_start_year"])
fetch_data(get_player_details, id_list = ["season_start_year","element","fixture"])
fetch_data(get_player_hist, id_list = ["season_name","element_code"], season_specific=False)
fetch_data(get_player_info, id_list = ["season_start_year", "id"])
fetch_data(get_player_id, id_list = ["id"], season_specific=False)
fetch_data(get_player_name, id_list = ["id"], season_specific=False)
fetch_data(get_round_info, id_list = ["id", "season_start_year"])

data/get_game_list.csv is fetched.
data/get_player_details.csv is fetched.
data/get_player_hist.csv is fetched.
data/get_player_info.csv is fetched.
data/get_player_id.csv is fetched.
data/get_player_name.csv is fetched.
data/get_round_info.csv is fetched.


In [4]:
from src.components.data.transform_data import create_data

create_data()

Create_data - Games base
Create_data - Team form
Create_data - Table
Create_data - Player
Create_data - Merging and interactions
Create_data - DONE! 



In [2]:
from src.components.ml.data_ingest_transform_train import DataIngest, DataTranformTrain

In [3]:
data_ingest = DataIngest()
data_ingest.create_train_and_test()  

create_train_and_test - Creating train, test, val and score
train: 863 rows 

test: 288 rows 

val: 288 rows 



('artifacts\\train.csv',
 'artifacts\\test.csv',
 'artifacts\\validation.csv',
 'artifacts\\score.csv')

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb 

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(),
}

params = {
    "Logistic Regression":{
        'model__C': [0.001, 0.01, 0.1, 1, 10], 
        'model__penalty': ['l1', 'l2'],  
        'model__max_iter': [100, 1000, 10000],  
        'model__solver': ['liblinear', 'saga']  
    },
    "Decision Tree": {
        'model__criterion': ['entropy', 'gini'], 
        'model__max_depth': [None, 2, 3, 4, 5, 6], 
        'model__min_samples_leaf': [1, 2, 5, 10, 20],  
        'model__min_samples_split': [2, 5, 10],  
    },
    "Random Forest":{
        'model__bootstrap': [True],
        'model__max_features': ['sqrt', 'log2', None],
        #'model__max_features': [10, 20, 50],
        'model__max_depth': [2, 3, 4, 6],
        'model__min_samples_leaf': [1, 2, 4, 5, 10, 20, 50],
        'model__n_estimators': [10, 50, 100, 500, 1000],
    },
    "Gradient Boosting":{
        "model__loss":["log_loss", "exponential"],
        'model__learning_rate': [0.001, 0.005, 0.01, 0.015, 0.03, 0.06],
        'model__min_samples_leaf': [1, 2, 5, 10, 20, 50],
        'model__max_depth': [2, 3, 4, 6],
        'model__n_estimators': [10, 50, 100],
    },
    "XGBoost":{
        'model__max_depth': [2, 3, 4, 6],
        'model__learning_rate': [0.001, 0.005, 0.01, 0.015, 0.03, 0.06],
        'model__n_estimators': [10, 50, 100, 500],
        'model__min_child_weight': [3, 5, 10, 50],
        'model__gamma': [0, 0.1, 1, 2],
        'model__reg_lambda': [0, 0.1, 1, 10]
    },   
}

In [None]:
# USED FOR TESTING FAST

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# import xgboost as xgb 
# 
# models = {
#     "Logistic Regression": LogisticRegression(),
#     #"Decision Tree": DecisionTreeClassifier(),
# }
# 
# params = {
#     "Logistic Regression":{
#         'model__C': [0.001, 0.01, 0.1, 1, 10], 
#         'model__penalty': ['l1', 'l2'],  
#         'model__max_iter': [100, 1000, 10000],  
#         'model__solver': ['liblinear', 'saga']  
#     },
#     #"Decision Tree": {
#     #    'model__criterion': ['entropy', 'gini'], 
#     #    'model__max_depth': [None, 2, 3, 4, 5, 6], 
#     #    'model__min_samples_leaf': [1, 2, 5, 10, 20],  
#     #    'model__min_samples_split': [2, 5, 10],  
#     #},
# }

In [5]:
algo_1 = DataTranformTrain(label = 'label_1')
algo_1.grid_search(models=models, params=params, save_to_mlflow=False)

label_1
- Importing train, test and val
- Calculating significant features
- Hyperparameter-tuning and training best model for each algo: 

Logistic Regression
- Hyperparameter-tuning
- Select best hyperparameters
- Train best model
- Calculating metrics 

                Metric  Metric value
0        AUC-ROC Train      0.734399
1          AUC-ROC Val      0.752132
2        AUC-PRC Train      0.690630
3          AUC-PRC Val      0.735304
4       Accuracy Train      0.674971
5         Accuracy Val      0.663194
6   Precision Train: 0      0.682609
7     Precision Val: 0      0.640394
8   Precision Train: 1      0.659722
9     Precision Val: 1      0.717647
10     Recall Train: 0      0.800204
11       Recall Val: 0      0.844156
12     Recall Train: 1      0.510067
13       Recall Val: 1      0.455224
14   F1-score Train: 0      0.736743
15     F1-score Val: 0      0.728291
16   F1-score Train: 1      0.575322
17     F1-score Val: 1      0.557078


- Collect feature importance 

Decisio

In [6]:
algo_X = DataTranformTrain(label = 'label_X')
algo_X.grid_search(models=models, params=params, save_to_mlflow=False)

label_X
- Importing train, test and val
- Calculating significant features
- Hyperparameter-tuning and training best model for each algo: 

Logistic Regression
- Hyperparameter-tuning
- Select best hyperparameters
- Train best model
- Calculating metrics 

                Metric  Metric value
0        AUC-ROC Train      0.646606
1          AUC-ROC Val      0.484322
2        AUC-PRC Train      0.281872
3          AUC-PRC Val      0.212616
4       Accuracy Train      0.754345
5         Accuracy Val      0.826389
6   Precision Train: 0      0.758480
7     Precision Val: 0      0.825784
8   Precision Train: 1      0.312500
9     Precision Val: 1      1.000000
10     Recall Train: 0      0.991590
11       Recall Val: 0      1.000000
12     Recall Train: 1      0.011962
13       Recall Val: 1      0.019608
14   F1-score Train: 0      0.859510
15     F1-score Val: 0      0.904580
16   F1-score Train: 1      0.023041
17     F1-score Val: 1      0.038462


- Collect feature importance 

Decisio

In [7]:
algo_2 = DataTranformTrain(label = 'label_2')
algo_2.grid_search(models=models, params=params, save_to_mlflow=False)

label_2
- Importing train, test and val
- Calculating significant features
- Hyperparameter-tuning and training best model for each algo: 

Logistic Regression
- Hyperparameter-tuning
- Select best hyperparameters
- Train best model
- Calculating metrics 

                Metric  Metric value
0        AUC-ROC Train      0.744366
1          AUC-ROC Val      0.707216
2        AUC-PRC Train      0.594265
3          AUC-PRC Val      0.558652
4       Accuracy Train      0.721321
5         Accuracy Val      0.684028
6   Precision Train: 0      0.732923
7     Precision Val: 0      0.697479
8   Precision Train: 1      0.656489
9     Precision Val: 1      0.620000
10     Recall Train: 0      0.922614
11       Recall Val: 0      0.897297
12     Recall Train: 1      0.305506
13       Recall Val: 1      0.300971
14   F1-score Train: 0      0.816901
15     F1-score Val: 0      0.784870
16   F1-score Train: 1      0.416970
17     F1-score Val: 1      0.405229


- Collect feature importance 

Decisio

In [8]:
all_algo_metrics_1 = pd.read_excel("artifacts/ml_results/label_1/all_algo_metrics.xlsx")
all_algo_metrics_X = pd.read_excel("artifacts/ml_results/label_X/all_algo_metrics.xlsx")
all_algo_metrics_2 = pd.read_excel("artifacts/ml_results/label_2/all_algo_metrics.xlsx")

In [18]:
concat_matrics = pd.concat([all_algo_metrics_1, all_algo_metrics_X, all_algo_metrics_2])
calculate_total_metrics = concat_matrics.groupby("Algorithm", as_index=False)["AUC-ROC Val"].sum()
rearrange_metrics = calculate_total_metrics.sort_values("AUC-ROC Val", ascending=False).reset_index(drop=True)
best_total_algorithm = rearrange_metrics["Algorithm"][0]
print(rearrange_metrics.to_string(index=False))

          Algorithm  AUC-ROC Val
      Random Forest     1.945849
Logistic Regression     1.943670
  Gradient Boosting     1.917307
            XGBoost     1.887906
      Decision Tree     1.771869


In [10]:
best_total_algorithm_1 = joblib.load('artifacts/ml_results/label_1/{0}.pkl'.format(best_total_algorithm))
best_total_algorithm_X = joblib.load('artifacts/ml_results/label_X/{0}.pkl'.format(best_total_algorithm))
best_total_algorithm_2 = joblib.load('artifacts/ml_results/label_2/{0}.pkl'.format(best_total_algorithm))

In [11]:
from src.components.ml.score import predict_result

In [21]:
import importlib
from src.components.ml import score  # Import the entire module

importlib.reload(score)  # Reload the entire module

# Now you can access the reloaded function
from src.components.ml.score import predict_result

In [22]:
predictions = predict_result(best_total_algorithm_1, best_total_algorithm_X, best_total_algorithm_2, predict_data='score')
predictions.to_csv('artifacts/result_predictions.csv',index=False,header=True)

In [13]:
# Calculated accuracy on validation set based on all three scores
predictions_val = predict_result(best_total_algorithm_1, best_total_algorithm_X, best_total_algorithm_2, predict_data='validation')[["proba_1_fix", "proba_X_fix", "proba_2_fix"]]

val = pd.read_csv("artifacts/validation.csv")[["label_1", "label_X", "label_2"]]

c = 0
for i in range(0, val.shape[0]):
    r = np.where(predictions_val.iloc[i, 0] == max(predictions_val.iloc[i, ]), "1", np.where(predictions_val.iloc[i, 2] == max(predictions_val.iloc[i, ]), "2", "X"))
    c = c + np.where(((r == "1") & (val["label_1"][i] == 1)) | ((r == "X") & (val["label_X"][i] == 1)) | ((r == "2") & (val["label_2"][i] == 1)), 1, 0)

print("Accuracy: ", c / val.shape[0])

Accuracy:  0.5590277777777778
