In [32]:
# barplot
# https://12jeong.github.io/python-barplot/

# plot color
# https://matplotlib.org/stable/gallery/color/named_colors.html

# Confidence intervals (신뢰 구간)
# https://angie-gil.medium.com/평점-순으로-정렬하지-않는-법-이론편-9ced2f26bc4b

# Import Dataset and Libraries

In [64]:
from typing import Any
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Render matplotlib text with the 'Malgun Gothic' font family.
# NOTE(review): presumably chosen so Korean labels render; the font must be installed locally — confirm.
plt.rcParams['font.family'] = 'Malgun Gothic'

In [34]:
# Location of the preprocessed dataset, relative to the working directory.
preprocess_file = "./preprocessing.csv"

# Columns discarded right after loading.
# Columns intentionally kept (not listed here): round, game_count, game_goal,
# cur_game_count, win_odds, lose_odds, cc, part_people, rank, prefix_rank,
# odds_result, RESULT, track_E_encoded, odds_result_SU, odds_result_UD.
drop_columns = ['index', 'date', 'track_E', 'significant']

# Load the CSV and remove the unwanted columns in a single chained expression.
df_pp = pd.read_csv(preprocess_file).drop(columns=drop_columns)

In [35]:
# Preview the first five rows of the loaded frame.
df_pp.head(5)

Unnamed: 0,round,game_count,game_goal,cur_game_count,win_odds,lose_odds,cc,part_people,rank,prefix_rank,odds_result,RESULT,track_E_encoded,odds_result_SU,odds_result_UD
0,1,3,15,1,4.71,1.27,150,10,10,10,SU,False,41,1,0
1,1,3,15,2,4.71,1.27,150,12,7,17,SU,False,20,1,0
2,2,3,15,1,7.04,1.17,150,11,2,2,UD,True,41,0,1
3,2,3,15,2,7.04,1.17,150,11,5,7,UD,True,0,0,1
4,2,3,15,3,7.04,1.17,150,12,5,12,UD,True,18,0,1


# Modeling Data Subset

In [36]:
# Targets: 'rank' as-is and 'RESULT' cast to bool.
y_rank = df_pp['rank']
y_result = df_pp['RESULT'].astype(bool)

# Features: every remaining column of df_pp.
# NOTE(review): the preview below shows X still holds the string column 'odds_result'
# next to odds_result_SU / odds_result_UD — confirm it is encoded or dropped before fitting.
X = df_pp.drop(columns=['rank', 'RESULT'])

In [37]:
# Preview the first five rows of the feature frame.
X.head(5)

Unnamed: 0,round,game_count,game_goal,cur_game_count,win_odds,lose_odds,cc,part_people,prefix_rank,odds_result,track_E_encoded,odds_result_SU,odds_result_UD
0,1,3,15,1,4.71,1.27,150,10,10,SU,41,1,0
1,1,3,15,2,4.71,1.27,150,12,17,SU,20,1,0
2,2,3,15,1,7.04,1.17,150,11,2,UD,41,0,1
3,2,3,15,2,7.04,1.17,150,11,7,UD,0,0,1
4,2,3,15,3,7.04,1.17,150,12,12,UD,18,0,1


# Modeling

In [41]:
# Shell escape: prints the version of the `python` executable the shell resolves.
# NOTE(review): this may differ from the interpreter the kernel itself runs on — confirm.
!python --version

Python 3.11.5


In [57]:
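# One-time environment setup. `%pip` installs into the kernel's own environment;
# consider pinning versions (e.g. `pkg==x.y.z`) for reproducibility.
# %pip install --upgrade pip
# %pip install scikit-learn
# %pip install xgboost
# %pip install lightgbm
# %pip install catboost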

In [83]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.base import BaseEstimator

# Classification Model Import
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Regression Model Import
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [47]:
# Candidate classifiers, keyed by a short identifier.
clf_model_dict = {
    # L2 is LogisticRegression's default penalty, so this entry is configured
    # the same way as "clf_logistic_regression" below.
    "clf_ridge": LogisticRegression(penalty='l2'),
    # The default 'lbfgs' solver does not support an L1 penalty and raises at
    # fit time; 'liblinear' does support it.
    "clf_lasso": LogisticRegression(penalty='l1', solver='liblinear'),
    "clf_logistic_regression": LogisticRegression(),
    "clf_logistic_regression_": LogisticRegression(solver='liblinear'),
    "clf_adaboost": AdaBoostClassifier(),
    "clf_gradient_boosting": GradientBoostingClassifier(),
    "clf_random_forest": RandomForestClassifier(),
    "clf_xgb": XGBClassifier(),
    # NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases — confirm it is still needed.
    "clf_xgb_": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "clf_lgbm": LGBMClassifier(),
    # verbose=0 is passed to CatBoost, presumably to silence its training log.
    "clf_catboost": CatBoostClassifier(verbose=0),
}

# Candidate regressors, keyed by a short identifier (insertion order preserved).
reg_model_dict = dict(
    reg_ridge=Ridge(),
    reg_lasso=Lasso(),
    reg_linear_regression=LinearRegression(),
    reg_adaboost=AdaBoostRegressor(),
    reg_gradient_boosting=GradientBoostingRegressor(),
    reg_random_forest=RandomForestRegressor(),
    reg_xgb=XGBRegressor(),
    reg_lgbm=LGBMRegressor(),
    reg_catboost=CatBoostRegressor(verbose=0),
)

# Training Model

In [90]:
from sklearn.model_selection import train_test_split


def fit_model(
    model: BaseEstimator,
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.2,
    random_state: int = 42
) -> tuple[BaseEstimator, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split the data into train/test parts and fit ``model`` on the train part.

    Args:
        model: Estimator to fit; it is fitted in place and returned.
        X: Feature frame.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(model, X_train, X_test, y_train, y_test)``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_features, test_features, train_target, test_target = train_test_split(
        X, y, **split_options
    )

    # Only the training part is used for fitting; the held-out part is returned
    # so the caller can evaluate on it.
    model.fit(train_features, train_target)

    return model, train_features, test_features, train_target, test_target

# K-Fold Cross Validation

In [89]:
def perform_cross_validation():
    """Placeholder for K-fold cross validation; currently does nothing.

    Returns:
        None.
    """
    return None

# Evaluate Model

## Classification

In [87]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)


def eval_clf_model(
    model: BaseEstimator, X: pd.DataFrame, y: pd.Series
) -> tuple[float, float, float, float, float]:
    """Score a fitted classifier on ``X`` / ``y``.

    The model must expose ``predict`` and ``predict_proba``. The second column of
    ``predict_proba`` is used as the score for ROC-AUC, so a two-class problem is
    assumed.

    Args:
        model: Fitted classifier.
        X: Features to predict on.
        y: True labels aligned with ``X``.

    Returns:
        ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba is taken as the positive-class probability.
    y_proba = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, y_pred)      # accuracy
    precision = precision_score(y, y_pred)    # precision
    recall = recall_score(y, y_pred)          # recall
    f1 = f1_score(y, y_pred)                  # F1 score
    roc_auc = roc_auc_score(y, y_proba)       # ROC-AUC score

    return accuracy, precision, recall, f1, roc_auc

## Regression

In [88]:
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error
)


def eval_reg_model(
    model: BaseEstimator, X: pd.DataFrame, y: pd.Series
) -> tuple[float, float, float, float]:
    """Score a fitted regressor on ``X`` / ``y``.

    Args:
        model: Fitted regressor exposing ``predict``.
        X: Features to predict on.
        y: True target values aligned with ``X``.

    Returns:
        ``(r2, mae, mse, rmse)``.
    """
    y_pred = model.predict(X)

    r2 = r2_score(y, y_pred)                # R² score
    mae = mean_absolute_error(y, y_pred)    # mean absolute error (MAE)
    mse = mean_squared_error(y, y_pred)     # mean squared error (MSE)
    # RMSE is derived from the MSE instead of passing `squared=False`, a parameter
    # that scikit-learn deprecated and later removed from mean_squared_error.
    rmse = mse ** 0.5                       # root mean squared error (RMSE)

    return r2, mae, mse, rmse

# Test model

## Rank prediction (regression)

In [59]:
# columns_name = [
#     'game_count',
#     'game_goal',
#     'cur_game_count',
#     'cc',
#     'part_people',
#     'track_E_encoded',
# ]

# new_data = [
#     1,    # 'game_count'
#     12,   # 'game_goal'
#     2,    # 'cur_game_count'
#     150,  # 'cc'
#     12,   # 'part_people'
#     71,  # 'track_E_encoded'
# ]

# df = pd.DataFrame([new_data], columns=columns_name)
# proba = rank_model.predict(df)
# f'{proba[0]:.0f}등'

## Result prediction (classification)

In [60]:
# columns_name = [
#     'game_count',
#     'game_goal',
#     'cur_game_count',
#     'cc',
#     'part_people',
#     'track_E_encoded',
# ]

# new_data = [
#     1,    # 'game_count'
#     12,   # 'game_goal'
#     2,    # 'cur_game_count'
#     150,  # 'cc'
#     12,   # 'part_people'
#     71,  # 'track_E_encoded'
# ]

# df = pd.DataFrame([new_data], columns=columns_name)
# proba = result_model.predict_proba(df)
# f, t = proba.flatten()

# print(f"True  probability: {t}")
# print(f"False probability: {f}")