In [15]:
import pandas as pd
import numpy as np

In [16]:
train_data = pd.read_csv("../data/vct_2023/processed_data/AvsB_train_data_players_stats_eco_SK_rounds.csv")
test_data = pd.read_csv("../data/vct_2023/processed_data/AvsB_test_data_players_stats_eco_SK_rounds.csv")

In [17]:
game_id_columns = ["Tournament", "Stage", "Match Type", "Match Name", "Team_A", "Team_B"]
map_composition = ["Map", "Composition_A", "Composition_B"]
outcome_columns = ["Team_A_score_diff", "Team_B_score_diff", "Team_A_win", "Team_B_win"]

In [18]:
train_input = train_data.drop(game_id_columns+map_composition+outcome_columns, axis=1)
train_target = train_data["Team_A_win"]

test_input = test_data.drop(game_id_columns+map_composition+outcome_columns, axis=1)
test_target = test_data["Team_A_win"]

In [19]:
# Sanity check
assert len(train_input.keys()) == len(test_input.keys()), "The number of train and test input are different."
assert set(train_input.keys()) == set(test_input.keys()), "Train and test have different input feature."
assert len(train_input) == len(train_target), "Train input and target sizes are different."
assert len(test_input) == len(test_target), "Test input and target sizes are different."

In [20]:
# Check if input has non numeric features.
from pandas.api.types import is_numeric_dtype

for key in train_input.keys():
    if not is_numeric_dtype(train_input[key]):
        print(key, "is not numeric.")

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier






In [85]:
players_stats = ["Rating_A", "Rating_B","Average Combat Score_A", "Average Combat Score_B",
                "Kill, Assist, Trade, Survive %_A", "Kill, Assist, Trade, Survive %_B",
                "Average Damage Per Round_A", "Average Damage Per Round_B",
                "Kills Per Round_A", "Kills Per Round_B",
                "Assists Per Round_A", "Assists Per Round_B",
                "First Kills Per Round_A", "First Kills Per Round_B",
                "First Deaths Per Round_A", "First Deaths Per Round_B",
                "Headshot %_A", "Headshot %_B",
                "Clutch Success %_A", "Clutch Success %_B"]

for i in range(len(players_stats)//2):
    '''
    We take only one pair of players' stats at a time and train models.
    We will do train test split on the train set to address data leakage issue.
    '''
    features = players_stats[2*i:2*(i+1)]

    pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())])

    dict_classifiers = {
        "Logreg": pipeline,
        "KNN": KNeighborsClassifier(),
        "DTree": DecisionTreeClassifier(),
        "RForest": RandomForestClassifier(),
        "XGB": XGBClassifier()
        }
    
    X = train_input[features]
    y = train_target
    X_test = test_input[features]
    y_test = test_target

    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=100)


    
    best_acc = 0
    second_best_acc = 0
    best_features = None
    second_best_features = None
    best_clf = None
    second_best_clf = None

    best_val_acc = 0
    best_val_feature = None
    best_val_clf = None

    for clf_name, clf in dict_classifiers.items():
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        test_pred = clf.predict(X_test)
        val_acc = accuracy_score(y_val, val_pred)
        test_acc = accuracy_score(y_test, test_pred)

        if test_acc >= best_acc:
            temp_acc = best_acc
            best_acc = test_acc
            second_best_acc = temp_acc

            temp_features = best_features
            best_features = features
            second_best_features = temp_features

            temp_clf = best_clf
            best_clf = clf
            second_best_clf = temp_clf
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_val_feature = features
            best_val_clf = clf

    # Uncomment the lines below to see
    # accuracies and prediction means of validation set and test set in each step
    #
    #     print("----------------------------------------")
    #     print(f"Classifier: {clf_name}")
    #     print(f"trained on {features}")
    #     print(" - - - - - - - - - - - - - - - - -")
    #     print(f"Validation accuracy: {val_acc}")
    #     print(f"y_val mean: {np.mean(y_val)}")
    #     print(f"Validation prediction mean: {np.mean(val_pred)}")
    #     print(" - - - - - - - - - - - - - - - - -")
    #     print(f"Test accuracy: {test_acc}")
    #     print(f"y_test mean: {np.mean(y_test)}")
    #     print(f"Test prediction mean: {np.mean(test_pred)}")
        
    # print("===============================================")

     

In [80]:
best_features, second_best_features

(['Clutch Success %_A', 'Clutch Success %_B'],
 ['Clutch Success %_A', 'Clutch Success %_B'])

In [81]:
best_clf, best_acc, second_best_clf, second_best_acc

(DecisionTreeClassifier(),
 0.5833333333333334,
 KNeighborsClassifier(),
 0.5833333333333334)

In [86]:
best_val_acc

0.679144385026738

In [84]:
# Similar to the above cell. Difference is we continuously adding more features.

players_stats = ["Rating_A", "Rating_B","Average Combat Score_A", "Average Combat Score_B",
                "Kill, Assist, Trade, Survive %_A", "Kill, Assist, Trade, Survive %_B",
                "Average Damage Per Round_A", "Average Damage Per Round_B",
                "Kills Per Round_A", "Kills Per Round_B",
                "Assists Per Round_A", "Assists Per Round_B",
                "First Kills Per Round_A", "First Kills Per Round_B",
                "First Deaths Per Round_A", "First Deaths Per Round_B",
                "Headshot %_A", "Headshot %_B",
                "Clutch Success %_A", "Clutch Success %_B"]

for i in range(len(players_stats)//2):
    '''
    We take only one pair of players' stats at a time and train models.
    We will do train test split on the train set to address data leakage issue.
    '''
    features = players_stats[0:2*(i+1)]

    pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())])

    dict_classifiers = {
        "Logreg": pipeline,
        "KNN": KNeighborsClassifier(),
        "DTree": DecisionTreeClassifier(),
        "RForest": RandomForestClassifier(),
        "XGB": XGBClassifier()
        }
    
    X = train_input[features]
    y = train_target
    X_test = test_input[features]
    y_test = test_target

    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=300)


    best_acc = 0
    second_best_acc = 0
    best_features = None
    second_best_features = None
    best_clf = None
    second_best_clf = None

    for clf_name, clf in dict_classifiers.items():
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        test_pred = clf.predict(X_test)
        val_acc = accuracy_score(y_val, val_pred)
        test_acc = accuracy_score(y_test, test_pred)


        if test_acc >= best_acc:
            temp_acc = best_acc
            best_acc = test_acc
            second_best_acc = temp_acc

            temp_features = best_features
            best_features = features
            second_best_features = temp_features

            temp_clf = best_clf
            best_clf = clf
            second_best_clf = temp_clf


    #     print("----------------------------------------")
    #     print(f"Classifier: {clf_name}")
    #     print(f"trained on {features}")
    #     print(" - - - - - - - - - - - - - - - - -")
    #     print(f"Validation accuracy: {val_acc}")
    #     print(f"y_val mean: {np.mean(y_val)}")
    #     print(f"Validation prediction mean: {np.mean(val_pred)}")
    #     print(" - - - - - - - - - - - - - - - - -")
    #     print(f"Test accuracy: {test_acc}")
    #     print(f"y_test mean: {np.mean(y_test)}")
    #     print(f"Test prediction mean: {np.mean(test_pred)}")
        
    # print("===============================================")

In [83]:
best_clf, best_acc, second_best_clf, second_best_acc

(XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 0.6428571428571429,
 KNeighborsClassifier(),
 0.5595238095238095)

In [71]:
best_features

['Rating_A',
 'Rating_B',
 'Average Combat Score_A',
 'Average Combat Score_B',
 'Kill, Assist, Trade, Survive %_A',
 'Kill, Assist, Trade, Survive %_B',
 'Average Damage Per Round_A',
 'Average Damage Per Round_B',
 'Kills Per Round_A',
 'Kills Per Round_B',
 'Assists Per Round_A',
 'Assists Per Round_B',
 'First Kills Per Round_A',
 'First Kills Per Round_B',
 'First Deaths Per Round_A',
 'First Deaths Per Round_B',
 'Headshot %_A',
 'Headshot %_B',
 'Clutch Success %_A',
 'Clutch Success %_B']