In [1]:
import pandas as pd
import numpy as np
from utils import *
from data_processing import *

# load general data
general_pacific = pd.read_csv('champions-tour-2024-pacific-kickoff_data\general_data_champions-tour-2024-pacific-kickoff.csv')
general_americas = pd.read_csv('champions-tour-2024-americas-kickoff_data\general_data_champions-tour-2024-americas-kickoff.csv')
general_emea = pd.read_csv('champions-tour-2024-emea-kickoff_data\general_data_champions-tour-2024-emea-kickoff.csv')

# performance data
performance_pacific = pd.read_csv('champions-tour-2024-pacific-kickoff_data\performance_data_champions-tour-2024-pacific-kickoff.csv')
performance_americas = pd.read_csv('champions-tour-2024-americas-kickoff_data\performance_data_champions-tour-2024-americas-kickoff.csv')
performance_emea = pd.read_csv('champions-tour-2024-emea-kickoff_data\performance_data_champions-tour-2024-emea-kickoff.csv')

# economic data
economy_pacific = pd.read_csv('champions-tour-2024-pacific-kickoff_data\economy_data_champions-tour-2024-pacific-kickoff.csv')
economy_americas = pd.read_csv('champions-tour-2024-americas-kickoff_data\economy_data_champions-tour-2024-americas-kickoff.csv')
economy_emea = pd.read_csv('champions-tour-2024-emea-kickoff_data\economy_data_champions-tour-2024-emea-kickoff.csv')

# picks and bans
pick_ban_pacific = pd.read_csv('champions-tour-2024-pacific-kickoff_data\pick_ban_data_champions-tour-2024-pacific-kickoff.csv')
pick_ban_americas = pd.read_csv('champions-tour-2024-americas-kickoff_data\pick_ban_data_champions-tour-2024-americas-kickoff.csv')
pick_ban_emea = pd.read_csv('champions-tour-2024-emea-kickoff_data\pick_ban_data_champions-tour-2024-emea-kickoff.csv')



In [4]:
set(general_pacific['Id'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [None]:
def general_dataset_creation(type, data_emea, data_americas, data_pacific):
    """
    This function orchestrate the generation of the dataset by calling the appropriate dataset creation function.

    Parameter:
        type : str | represents the type of data to generate cases : 'general', 'performance', 'economic', 'picks_bans'
        data_region : dataframe of the different scraped data per region
    """

    if type == 'general':
        print(1)
    elif type == 'performance':
        print(2)
    elif type == 'economic':
        print(3)
    elif type == 'picks_bans':
        print(3)
    else:
        print('Is not a type of data | type of data to generate : general, performance, economic, picks_bans')
        
    return None

In [5]:
def general_feature_creation_for_matches(general, list_feature = ['R', 'ACS', 'K', 'D','ADR', 'HS%', 'FK']):
    """
        Function that creates a dataframe of the average/std features for a region with the general data discretized by matches. Individual feature only. 

        Parameter:
            general : dataframe from the scraper general_data_scraper
            list_feature : list of feature to compute
    """

    gathered_feature_name = []
    gathered_dictionnaries = []
    match_ids = set(general['Id'])
    
    for feature_name in list_feature:

        if feature_name == 'HS%':

            default = np.mean(general[feature_name].apply(lambda x : float(x.strip().split('\n')[0][:-1])).values)
            # Action
            avrg_action_per_match = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 0, default))) for id_match in match_ids}
            std_action_per_match = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 0, default))) for id_match in match_ids}
            # Action attack
            avrg_action_per_match_atk = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 1, default))) for id_match in match_ids}
            std_action_per_match_atk = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 1, default))) for id_match in match_ids}
            # Action defense
            avrg_action_per_match_dfs = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 2, default))) for id_match in match_ids}
            std_action_per_match_dfs = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_hs_value(x, 2, default))) for id_match in match_ids}
        else:
            default = np.mean(general[feature_name].apply(lambda x : float(x.strip().split('\n')[0])).values)
            # Action
            avrg_action_per_match = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 0, default))) for id_match in match_ids}
            std_action_per_match = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 0, default))) for id_match in match_ids}
            # Action attack
            avrg_action_per_match_atk = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 1, default))) for id_match in match_ids}
            std_action_per_match_atk = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 1, default))) for id_match in match_ids}
            # Action defense
            avrg_action_per_match_dfs = {id_match : np.mean(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 2, default))) for id_match in match_ids}
            std_action_per_match_dfs = {id_match : np.std(general[general['Id'] == id_match][feature_name].apply(lambda x : parse_value(x, 2, default))) for id_match in match_ids}

        gathered_dictionnaries.append(avrg_action_per_match)
        gathered_feature_name.append(f'avrg_{feature_name.lower()}_per_match')
        gathered_dictionnaries.append(std_action_per_match)
        gathered_feature_name.append(f'std_{feature_name.lower()}_per_match')
        gathered_dictionnaries.append(avrg_action_per_match_atk)
        gathered_feature_name.append(f'avrg_{feature_name.lower()}_per_match_atk')
        gathered_dictionnaries.append(std_action_per_match_atk)
        gathered_feature_name.append(f'std_{feature_name.lower()}_per_match_atk')
        gathered_dictionnaries.append(avrg_action_per_match_dfs)
        gathered_feature_name.append(f'avrg_{feature_name.lower()}_per_match_dfs')
        gathered_dictionnaries.append(std_action_per_match_dfs)
        gathered_feature_name.append(f'std_{feature_name.lower()}_per_match_dfs')
    
    # Create an empty DataFrame
    df = pd.DataFrame(columns=gathered_feature_name)

    # Iterate over the list of dictionaries
    for id_match in gathered_dictionnaries[0].keys():
        # Create a new row for each team
        row_values = [d[id_match] for d in gathered_dictionnaries]
        df.loc[id_match] = row_values
    
    return df

In [6]:
t = general_feature_creation_for_matches(general_americas)

In [7]:
t

Unnamed: 0,avrg_r_per_match,std_r_per_match,avrg_r_per_match_atk,std_r_per_match_atk,avrg_r_per_match_dfs,std_r_per_match_dfs,avrg_acs_per_match,std_acs_per_match,avrg_acs_per_match_atk,std_acs_per_match_atk,...,avrg_hs%_per_match_atk,std_hs%_per_match_atk,avrg_hs%_per_match_dfs,std_hs%_per_match_dfs,avrg_fk_per_match,std_fk_per_match,avrg_fk_per_match_atk,std_fk_per_match_atk,avrg_fk_per_match_dfs,std_fk_per_match_dfs
1,1.0095,0.276812,0.842,0.343054,1.1755,0.350364,190.55,53.895709,169.85,68.924071,...,27.25,14.352265,26.5,9.651425,2.2,2.039608,0.9,1.135782,1.3,1.187434
2,1.007,0.192634,0.887667,0.327279,1.124667,0.296903,197.066667,42.471114,183.8,55.464343,...,28.466667,13.370947,26.633333,11.825068,2.266667,2.189876,1.1,1.274101,1.166667,1.462494
3,1.034,0.195791,0.698,0.195182,1.371,0.370363,193.0,56.138222,158.05,50.006475,...,26.85,13.484343,25.3,9.633795,2.3,1.705872,0.6,0.734847,1.7,1.452584
4,0.998333,0.348196,0.968,0.44964,1.03,0.431192,201.233333,63.884105,199.766667,80.359062,...,25.533333,10.828153,26.7,12.458331,2.066667,2.112397,0.8,1.194432,1.266667,1.263153
5,1.007,0.27894,1.069667,0.37663,0.945333,0.360062,198.866667,59.384472,209.8,77.315975,...,30.3,11.018621,28.466667,11.566427,2.266667,1.982142,1.033333,1.328742,1.233333,1.202313
6,1.006,0.174281,0.9225,0.321572,1.09,0.298831,191.5,53.98472,183.9,65.025303,...,33.65,15.865923,26.35,13.195738,2.3,1.977372,0.95,1.116915,1.35,1.314344
7,1.019,0.274218,1.007,0.355773,1.029667,0.359856,196.9,41.040915,195.966667,70.496564,...,26.133333,10.082107,25.666667,14.276632,1.933333,1.730767,1.033333,1.303414,0.9,0.830662
8,1.0125,0.242958,0.9255,0.39893,1.0925,0.330906,202.15,43.606508,191.95,92.122459,...,30.0,18.327575,22.75,9.082263,2.3,1.951922,0.9,0.943398,1.4,1.462874
9,1.002,0.287998,1.057,0.381699,0.944,0.299595,196.633333,45.483684,205.0,67.18085,...,27.8,11.637296,30.1,11.799294,2.233333,1.563827,0.933333,1.062492,1.3,1.1
10,1.0035,0.16144,0.9255,0.298353,1.082,0.285633,199.6,42.086102,196.7,74.72891,...,34.05,11.133171,29.7,9.402659,2.6,1.56205,1.0,0.894427,1.6,1.356466


<h1>Individual Data</h1>
<h2>General Data</h2>

In [2]:
df_pacific_general = general_feature_creation_for_teams(general_pacific)
df_americas_general = general_feature_creation_for_teams(general_americas)
df_emea_general = general_feature_creation_for_teams(general_emea)

df_general = create_dataframe(df_emea_general,df_americas_general,df_pacific_general)
# Assuming the last column of the data is the target variable (e.g., region labels)
X_general, y_general = df_general, df_general.index.get_level_values(0)   # Features & Target variable (region labels)

In [11]:
df_concatenated = pd.concat([performance_pacific, performance_americas, performance_emea], keys=['EMEA', 'Americas', 'Pacific'])

In [12]:
set(df_concatenated['Map Name'])

{'Ascent', 'Bind', 'Breeze', 'Icebox', 'Lotus', 'Split', 'Sunset'}

<h3>Feature Selection & Visualization General Data</h3>

In [4]:
k_best_feature_general = selectKbest(X_general, y_general, k=30)

Feature 'avrg_r_per_team_atk': 0.8780476856287613
Feature 'avrg_r_per_team_dfs': 0.7456114044344311
Feature 'std_r_per_team_dfs': 0.5228922046417612
Feature 'std_acs_per_team': 0.6966238572366699
Feature 'avrg_acs_per_team_atk': 1.213373905280456
Feature 'avrg_acs_per_team_dfs': 0.38529324099891327
Feature 'std_acs_per_team_dfs': 0.21486734869434856
Feature 'std_k_per_team': 0.9599129200616082
Feature 'std_k_per_team_dfs': 0.45412052530405567
Feature 'avrg_d_per_team': 0.23530670653317298
Feature 'std_d_per_team': 1.2112560942412043
Feature 'avrg_d_per_team_atk': 0.5743017022834715
Feature 'std_d_per_team_atk': 3.0094731245506288
Feature 'avrg_d_per_team_dfs': 0.6879019758982621
Feature 'std_d_per_team_dfs': 1.0208927460375459
Feature 'avrg_adr_per_team': 0.2119809474377535
Feature 'std_adr_per_team': 1.1174124403987953
Feature 'avrg_adr_per_team_atk': 0.799572168253867
Feature 'avrg_adr_per_team_dfs': 0.21107928685058933
Feature 'std_adr_per_team_dfs': 0.7137304324168358
Feature 'avrg

: 

In [10]:
plot_t_sne(X_general,y_general)

In [13]:
plot_t_sne(X_general[k_best_feature_general],y_general)

In [None]:
k_best_feature_rfecv = RFECV_feature_selection(X_general,y_general,"linear")

In [4]:
visualize_mean_feature_for_each_region(df_general,k_best_feature_general)

Plots saved as PNG files.


In [None]:
plot_q_q(X_general)

<h2>Performance data</h2>

In [4]:
performance_feature_emea = performance_feature_creation_for_teams(performance_emea, economy_emea)
performance_feature_americas = performance_feature_creation_for_teams(performance_americas, economy_americas)
performance_feature_pacific = performance_feature_creation_for_teams(performance_pacific, economy_pacific)

df_performance = create_dataframe(performance_feature_emea,performance_feature_americas,performance_feature_pacific)
columns_to_drop = ['5k_mean', '5k_std','1v5_mean', '1v5_std']
df_performance_dropped = df_performance.drop(columns=columns_to_drop)
X_performance, y_performance = df_performance_dropped.iloc[:, :-1], df_performance_dropped.index.get_level_values(0)

In [6]:
k_best_feature_performance = selectKbest(X_performance, y_performance, k=5)

Feature '2k_std': 0.7833969591481711
Feature '4k_mean': 1.1873820438640228
Feature '1v3_std': 0.6534339325305459
Feature 'econ_std': 0.7659081599989729
Feature 'pl_mean': 0.9437653919871462


In [17]:
k_best_feature_rfecv_performance = RFECV_feature_selection(X_performance,y_performance,"linear")

Selected Features:
Index(['2k_mean', '2k_std', '3k_std', '4k_mean', '1v1_mean', '1v1_std',
       '1v2_mean', '1v2_std', '1v3_mean', '1v3_std', 'pl_mean', 'pl_std',
       'de_mean'],
      dtype='object')
Optimal number of features: 13


In [19]:
plot_t_sne(X_performance,y_performance)

In [20]:
plot_t_sne(X_performance[k_best_feature_performance],y_performance)

In [8]:
visualize_mean_feature_for_each_region(df_performance,k_best_feature_performance)

Plots saved as PNG files.


In [7]:
plot_q_q(X_performance[k_best_feature_performance])

<h2>Economic Data</h2>

In [7]:
df_emea_economy = economy_feature_creation_for_teams(economy_emea)
df_americas_economy = economy_feature_creation_for_teams(economy_americas)
df_pacific_economy = economy_feature_creation_for_teams(economy_pacific)

df_economy = create_dataframe(df_emea_economy,df_americas_economy,df_pacific_economy)
# Assuming the last column of the data is the target variable (e.g., region labels)
X_economy, y_economy = df_economy.iloc[:, :-1], df_economy.index.get_level_values(0)   # Features & Target variable (region labels)

In [8]:
k_best_feature_economy = selectKbest(X_economy, y_economy, k=5, selec_type='chi2')

Feature 'ratio_$_won': 0.29450331125827806
Feature 'ratio_Eco': 0.46550512151034384
Feature 'ratio_$': 0.569992553983619
Feature 'ratio_$$$': 0.29272933723779304
Feature 'Bank': 0.28270270849221024


In [45]:
k_best_feature_rfecv_economy= RFECV_feature_selection(X_economy,y_economy,"linear")

Selected Features:
Index(['ratio_Eco'], dtype='object')
Optimal number of features: 1


In [30]:
plot_t_sne(X_economy,y_economy)

In [31]:
plot_t_sne(X_economy[k_best_feature_economy],y_economy)

In [11]:
visualize_mean_feature_for_each_region(df_economy,k_best_feature_economy)

Plots saved as PNG files.


In [None]:
plot_q_q(X_economy)

<h2>Picks and Bans</h2>

In [26]:
df_emea_pick_bans = picks_and_bans_feature_creation_for_teams(pick_ban_emea)
df_americas_pick_bans = picks_and_bans_feature_creation_for_teams(pick_ban_americas)
df_pacific_pick_bans = picks_and_bans_feature_creation_for_teams(pick_ban_pacific)

df_picks_bans = create_dataframe(df_emea_pick_bans,df_americas_pick_bans,df_pacific_pick_bans)
# Assuming the last column of the data is the target variable (e.g., region labels)
X_picks_bans, y_picks_bans = df_picks_bans, df_picks_bans.index.get_level_values(0)   # Features & Target variable (region labels)

  bans_dummies = pd.get_dummies(pick_ban['Bans'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  picks_dummies = pd.get_dummies(pick_ban['Picks'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  decider_dummies = pd.get_dummies(pick_ban['Decider'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  bans_dummies = pd.get_dummies(pick_ban['Bans'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  picks_dummies = pd.get_dummies(pick_ban['Picks'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  decider_dummies = pd.get_dummies(pick_ban['Decider'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  bans_dummies = pd.get_dummies(pick_ban['Bans'].apply(lambda x : ast.literal_eval(x)).apply(pd.Series).stack()).sum(level=0)
  picks_dummies = pd.get_dummies(pick_ban['Picks'].apply(lambda x : ast.literal_eval(x)).apply(pd.Seri

In [27]:
k_best_feature_picks_bans = selectKbest(X_picks_bans, y_picks_bans, k=5, selec_type='chi2')

Feature 'bans_breeze_mean': 0.9974740036595711
Feature 'picks_lotus_mean': 1.9092904001556068
Feature 'picks_sunset_mean': 1.9605536107869832
Feature 'decider_bind_mean': 1.3287228090959953
Feature 'decider_sunset_mean': 1.089366257493577


In [37]:
plot_t_sne(X_picks_bans,y_picks_bans)

In [38]:
plot_t_sne(X_picks_bans[k_best_feature_picks_bans],y_picks_bans)

In [14]:
visualize_mean_feature_for_each_region(df_picks_bans,k_best_feature_picks_bans)

Plots saved as PNG files.


In [None]:
plot_q_q(X_picks_bans)

<h1>Aggregate the data</h1>

In [11]:
df_general_refined = df_general[k_best_feature_general]
df_performance_refined = df_performance[k_best_feature_performance]
df_economy_refined = df_economy[k_best_feature_economy]
df_pick_bans_refined = df_picks_bans[k_best_feature_picks_bans]

df_general_refined.index.names = ['Region', 'Team Name']
df_performance_refined.index.names = ['Region', 'Team Name']
df_economy_refined.index.names = ['Region', 'Team Name']
df_pick_bans_refined.index.names = ['Region', 'Team Name']

merged_df = pd.concat([df_general_refined,df_performance_refined, df_economy_refined, df_pick_bans_refined], axis=1)

<h1>Resample & Boostrap my data</h1>

In [7]:
dataset = pd.read_csv('data/dataset.csv')

In [8]:
dataset

Unnamed: 0,Region,Team Name,std_d_per_team_atk,std_fk_per_team,avrg_fk_per_team_atk,std_fk_per_team_atk,avrg_fk_per_team_dfs,2k_std,4k_mean,1v3_std,...,ratio_$_won,ratio_Eco,ratio_$,ratio_$$$,Bank,bans_breeze_mean,picks_lotus_mean,picks_sunset_mean,decider_bind_mean,decider_lotus_mean
0,EMEA,BBL,0.377024,0.312115,0.452632,0.125528,0.123077,0.600763,0.214091,0.0,...,0.2,0.634615,0.886076,0.268966,0.0,0.5,0.0,0.0,0.0,0.0
1,EMEA,FUT,0.747556,0.680208,0.664474,0.468462,0.466346,0.810584,0.25291,0.721798,...,1.0,0.288462,0.316456,0.993103,0.742463,1.0,0.333333,0.0,0.0,0.0
2,EMEA,FNC,0.516684,0.322551,0.434211,0.394123,0.836538,0.170427,0.490626,0.79794,...,1.0,0.365385,0.189873,0.593103,0.53606,0.0,1.0,0.0,1.0,0.0
3,EMEA,VIT,0.517308,0.545145,0.827751,0.610139,0.108392,0.785995,0.77891,0.349593,...,0.16,0.326923,0.253165,0.0,0.735745,0.2,0.2,0.333333,0.4,0.3
4,EMEA,TH,0.876463,0.554764,0.563158,0.36383,0.634615,0.585511,0.765103,0.67518,...,0.36,0.423077,0.291139,0.565517,0.675738,0.0,0.75,0.416667,0.25,0.0
5,EMEA,KC,0.728905,0.480793,0.608187,0.59277,0.5,0.571292,0.566846,0.217722,...,0.4,0.461538,0.278481,1.0,0.524424,0.857143,0.0,0.714286,0.285714,0.428571
6,EMEA,NAVI,1.0,0.464804,0.0,0.0,0.423077,0.362891,0.896936,0.48095,...,0.0,0.115385,0.139241,0.151724,0.771793,0.0,0.0,0.0,0.0,0.0
7,EMEA,TL,0.043887,0.876051,1.0,0.88998,0.538462,0.71681,0.680186,0.5375,...,0.28,0.403846,0.734177,0.931034,0.750914,0.666667,0.0,0.0,0.0,0.0
8,EMEA,M8,0.400426,0.882285,0.747368,0.866048,0.176923,0.905667,0.5764,0.0,...,0.5,0.711538,0.101266,0.068966,0.34662,0.5,0.5,0.0,0.0,0.0
9,EMEA,GX,0.12778,0.0,0.342105,0.314203,0.141026,0.267792,0.929456,0.56628,...,0.0,1.0,0.443038,0.924138,0.269611,0.0,0.0,0.833333,0.0,0.75


In [31]:
import numpy as np

# Assuming your original dataset is stored in X and y
# X is the feature matrix and y is the target variable
X,y = dataset.iloc[:, 2:], dataset['Region']

# Number of bootstrap datasets to create for each class
num_bootstrap_datasets = 10

# Size of each bootstrap dataset (same as original class size)
bootstrap_dataset_size = 4

# List to store augmented data
augmented_X = []
augmented_y = []

# Create bootstrap datasets for each class
for class_label in np.unique(y):
    # Indices of samples belonging to the current class
    class_indices = np.where(y == class_label)[0]
    
    # Create bootstrap datasets for the current class
    for _ in range(num_bootstrap_datasets):
        # Randomly sample indices with replacement
        bootstrap_indices = np.random.choice(class_indices, size=bootstrap_dataset_size, replace=True)
        
        # Create bootstrap dataset for the current class
        bootstrap_X = X.iloc[bootstrap_indices]
        bootstrap_y = y.iloc[bootstrap_indices]
        
        # Append bootstrap dataset to augmented data
        augmented_X.append(bootstrap_X)
        augmented_y.append(bootstrap_y)

# Concatenate the list of DataFrames to create the augmented dataset
augmented_X = pd.concat(augmented_X)
augmented_y = pd.concat(augmented_y)

# Optionally, reset the index if desired
augmented_X.reset_index(drop=True, inplace=True)
augmented_y.reset_index(drop=True, inplace=True)


In [32]:
augmented_X

Unnamed: 0,std_d_per_team_atk,std_fk_per_team,avrg_fk_per_team_atk,std_fk_per_team_atk,avrg_fk_per_team_dfs,2k_std,4k_mean,1v3_std,econ_std,pl_mean,ratio_$_won,ratio_Eco,ratio_$,ratio_$$$,Bank,bans_breeze_mean,picks_lotus_mean,picks_sunset_mean,decider_bind_mean,decider_lotus_mean
0,0.220676,0.718543,0.076023,0.484104,0.230769,0.298607,0.982212,0.000000,0.433443,0.604651,0.20,0.653846,0.303797,0.462069,0.525375,0.000000,0.000000,0.0,0.333333,0.00
1,0.450207,0.603425,0.421053,0.352824,0.538462,0.455874,0.321592,0.000000,0.447354,0.511628,0.76,0.192308,0.265823,0.965517,0.482657,0.000000,0.000000,0.0,0.000000,0.50
2,0.350074,0.614440,0.327935,0.447602,0.665680,0.478097,0.586203,0.227983,0.392797,0.618962,0.58,0.057692,0.341772,0.703448,0.694711,0.200000,0.000000,1.0,0.000000,0.00
3,0.116044,0.557937,0.250000,0.249141,0.567308,0.555713,0.039647,0.000000,0.247230,0.162791,0.00,0.307692,0.468354,0.737931,0.365852,0.000000,0.000000,0.0,0.000000,0.75
4,0.000000,0.746271,0.495614,0.608893,0.477564,0.036380,0.343930,0.266991,0.175789,0.720930,0.72,0.134615,0.126582,0.903448,0.449720,0.000000,0.000000,0.0,0.000000,0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.804494,0.589699,0.219298,0.388725,0.275641,0.521218,0.625896,0.575563,0.844921,0.000000,0.80,0.903846,0.139241,0.593103,0.835590,0.000000,0.000000,0.0,0.000000,0.50
116,0.731894,0.855843,0.593301,0.672264,0.451049,0.611177,0.333548,0.305299,0.411533,0.536998,0.36,0.576923,0.240506,0.765517,0.887551,0.000000,0.000000,0.0,0.000000,0.60
117,0.261560,0.508575,0.710526,0.648488,0.096154,0.678734,0.277987,1.000000,0.100164,0.093023,0.00,0.653846,0.354430,0.041379,0.303855,0.000000,1.000000,0.0,0.500000,0.00
118,0.804494,0.589699,0.219298,0.388725,0.275641,0.521218,0.625896,0.575563,0.844921,0.000000,0.80,0.903846,0.139241,0.593103,0.835590,0.000000,0.000000,0.0,0.000000,0.50


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5714285714285714


In [41]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Model evaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Americas       0.50      0.33      0.40         3
        EMEA       0.00      0.00      0.00         2
     Pacific       0.50      0.50      0.50         2

    accuracy                           0.29         7
   macro avg       0.33      0.28      0.30         7
weighted avg       0.36      0.29      0.31         7



In [24]:
from sklearn.model_selection import cross_val_score

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, augmented_X, augmented_y, cv=5)  # 5-fold cross-validation

# Calculate the mean accuracy and standard deviation
mean_accuracy = cv_scores.mean()
std_accuracy = cv_scores.std()

print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

Mean Accuracy: 0.8666666666666668
Standard Deviation of Accuracy: 0.08498365855987973


<h1>Centralization of the Analysis</h1>

<h2>Selected Feature for the Individuality</h2>
<p>K1 best from general</p>
<p>K2 best from performance</p>

<h2>Selected Feature for the Overall Strategy</h2>
<p>K3 best from economy</p>
<p>K4 best from picks and bans</p>

<h2>Selected Feature for the State of Form</h2>
<p>Not done yet</p>