### Libraries

In [211]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier, \
    BaggingClassifier, StackingClassifier
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

### Import Data

In [212]:
train_away_player_statistics_df = pd.read_csv('Train_Data/train_away_player_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv('Train_Data/train_away_team_statistics_df.csv', index_col=0)
train_home_player_statistics_df = pd.read_csv('Train_Data/train_home_player_statistics_df.csv', index_col=0)
train_home_team_statistics_df = pd.read_csv('Train_Data/train_home_team_statistics_df.csv', index_col=0)

Y_train = pd.read_csv('Y_train_1rknArQ.csv', index_col=0)

test_away_player_statistics_df = pd.read_csv('Test_Data/test_away_player_statistics_df.csv', index_col=0)
test_away_team_statistics_df = pd.read_csv('Test_Data/test_away_team_statistics_df.csv', index_col=0)
test_home_player_statistics_df = pd.read_csv('Test_Data/test_home_player_statistics_df.csv', index_col=0)
test_home_team_statistics_df = pd.read_csv('Test_Data/test_home_team_statistics_df.csv', index_col=0)

### Train Data processing

##### Remove some useless features

In [213]:
train_away_player_statistics_drop_df = train_away_player_statistics_df.drop(labels=['PLAYER_NAME', 'PLAYER_CAPTAIN_season_sum', 'PLAYER_LONG_BALLS_season_sum', 'PLAYER_LONG_BALLS_WON_season_sum', 'PLAYER_SHOTS_OFF_TARGET_season_sum', 'PLAYER_CAPTAIN_season_average', 'PLAYER_PUNCHES_season_average', 'PLAYER_LONG_BALLS_season_average', 'PLAYER_LONG_BALLS_WON_season_average', 'PLAYER_SHOTS_OFF_TARGET_season_average', 'PLAYER_CAPTAIN_season_std', 'PLAYER_PUNCHES_season_std', 'PLAYER_LONG_BALLS_season_std', 'PLAYER_LONG_BALLS_WON_season_std', 'PLAYER_SHOTS_OFF_TARGET_season_std', 'PLAYER_CAPTAIN_5_last_match_sum', 'PLAYER_PUNCHES_5_last_match_sum', 'PLAYER_LONG_BALLS_5_last_match_sum', 'PLAYER_LONG_BALLS_WON_5_last_match_sum', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_sum', 'PLAYER_CAPTAIN_5_last_match_average', 'PLAYER_PUNCHES_5_last_match_average', 'PLAYER_LONG_BALLS_5_last_match_average', 'PLAYER_LONG_BALLS_WON_5_last_match_average', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_average', 'PLAYER_CAPTAIN_5_last_match_std', 'PLAYER_PUNCHES_5_last_match_std', 'PLAYER_LONG_BALLS_5_last_match_std', 'PLAYER_LONG_BALLS_WON_5_last_match_std', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_std'], axis=1)

train_home_player_statistics_drop_df= train_home_player_statistics_df.drop(labels=['PLAYER_NAME', 'PLAYER_CAPTAIN_season_sum', 'PLAYER_LONG_BALLS_season_sum', 'PLAYER_LONG_BALLS_WON_season_sum', 'PLAYER_SHOTS_OFF_TARGET_season_sum', 'PLAYER_CAPTAIN_season_average', 'PLAYER_PUNCHES_season_average', 'PLAYER_LONG_BALLS_season_average', 'PLAYER_LONG_BALLS_WON_season_average', 'PLAYER_SHOTS_OFF_TARGET_season_average', 'PLAYER_CAPTAIN_season_std', 'PLAYER_PUNCHES_season_std', 'PLAYER_LONG_BALLS_season_std', 'PLAYER_LONG_BALLS_WON_season_std', 'PLAYER_SHOTS_OFF_TARGET_season_std', 'PLAYER_CAPTAIN_5_last_match_sum', 'PLAYER_PUNCHES_5_last_match_sum', 'PLAYER_LONG_BALLS_5_last_match_sum', 'PLAYER_LONG_BALLS_WON_5_last_match_sum', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_sum', 'PLAYER_CAPTAIN_5_last_match_average', 'PLAYER_PUNCHES_5_last_match_average', 'PLAYER_LONG_BALLS_5_last_match_average', 'PLAYER_LONG_BALLS_WON_5_last_match_average', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_average', 'PLAYER_CAPTAIN_5_last_match_std', 'PLAYER_PUNCHES_5_last_match_std', 'PLAYER_LONG_BALLS_5_last_match_std', 'PLAYER_LONG_BALLS_WON_5_last_match_std', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_std'], axis=1)

In [214]:
train_home_team_statistics_drop_df = train_home_team_statistics_df.iloc[:,2:]
train_away_team_statistics_drop_df = train_away_team_statistics_df.iloc[:,2:]

train_home_team_statistics_drop_df.columns = 'HOME_' + train_home_team_statistics_drop_df.columns
train_away_team_statistics_drop_df.columns = 'AWAY_' + train_away_team_statistics_drop_df.columns

##### Encoding the position, league and team_name of each player

In [215]:
train_home_player_statistics_drop_df.loc[train_home_player_statistics_drop_df['POSITION'].isna(), 'POSITION'] = 'NAN'

enc_home_POSITION = LabelEncoder()
enc_home_POSITION.fit(train_home_player_statistics_drop_df['POSITION'])
encoded_home_POSITION = enc_home_POSITION.transform(train_home_player_statistics_drop_df['POSITION'])


enc_home_LEAGUE = LabelEncoder()
enc_home_LEAGUE.fit(train_home_player_statistics_drop_df['LEAGUE'])
encoded_home_LEAGUE = enc_home_LEAGUE.transform(train_home_player_statistics_drop_df['LEAGUE'])

enc_home_TEAM_NAME = LabelEncoder()
enc_home_TEAM_NAME.fit(train_home_player_statistics_drop_df['TEAM_NAME'])
encoded_home_TEAM_NAME = enc_home_TEAM_NAME.transform(train_home_player_statistics_drop_df['TEAM_NAME'])

In [216]:
train_home_player_statistics_drop_df['LEAGUE'] = encoded_home_LEAGUE
train_home_player_statistics_drop_df['TEAM_NAME'] = encoded_home_TEAM_NAME
train_home_player_statistics_drop_df['POSITION'] = encoded_home_POSITION

In [217]:
train_away_player_statistics_drop_df.loc[train_away_player_statistics_drop_df['POSITION'].isna(), 'POSITION'] = 'NAN'

enc_away_POSITION = LabelEncoder()
enc_away_POSITION.fit(train_away_player_statistics_drop_df['POSITION'])
encoded_away_POSITION = enc_away_POSITION.transform(train_away_player_statistics_drop_df['POSITION'])


enc_away_LEAGUE = LabelEncoder()
enc_away_LEAGUE.fit(train_away_player_statistics_drop_df['LEAGUE'])
encoded_away_LEAGUE = enc_away_LEAGUE.transform(train_away_player_statistics_drop_df['LEAGUE'])

enc_away_TEAM_NAME = LabelEncoder()
enc_away_TEAM_NAME.fit(train_away_player_statistics_drop_df['TEAM_NAME'])
encoded_away_TEAM_NAME = enc_away_TEAM_NAME.transform(train_away_player_statistics_drop_df['TEAM_NAME'])

In [218]:
train_away_player_statistics_drop_df['LEAGUE'] = encoded_away_LEAGUE
train_away_player_statistics_drop_df['TEAM_NAME'] = encoded_away_TEAM_NAME
train_away_player_statistics_drop_df['POSITION'] = encoded_away_POSITION

In [219]:
train_home_player_statistics_drop_df.loc[train_home_player_statistics_drop_df['POSITION']==0, 'POSITION'] = np.nan

train_away_player_statistics_drop_df.loc[train_away_player_statistics_drop_df['POSITION']==0, 'POSITION'] = np.nan

imputer = KNNImputer(n_neighbors=20)

train_home_player_statistics_drop_impute_df = imputer.fit_transform(train_home_player_statistics_drop_df)
train_away_player_statistics_drop_impute_df = imputer.fit_transform(train_away_player_statistics_drop_df)

In [220]:
train_home_player_statistics_drop2_df = train_home_player_statistics_drop_df.drop(columns=['LEAGUE', 'TEAM_NAME'])

train_away_player_statistics_drop2_df = train_away_player_statistics_drop_df.drop(columns=['LEAGUE', 'TEAM_NAME'])

##### Create a dataframe fill just with NaN

In [230]:
NaN_df_columns = train_home_player_statistics_drop2_df.drop(columns=['POSITION'])
NaN_df = pd.DataFrame(columns=NaN_df_columns.columns)
for k in range(len(train_home_player_statistics_drop2_df.columns)):
    NaN_df.loc[0] = np.nan

##### Mean of the statistics of the players of each team by position (goalkeeper, defender, miedfielder, attacker)

In [231]:
L = train_home_player_statistics_drop2_df.index.drop_duplicates()

for k in range(len(L)):
    df = train_home_player_statistics_drop2_df[train_home_player_statistics_drop2_df.index == L[k]]

    for i in range(1,5):
        df_position = df[df['POSITION'] == i]
        df_position = df_position.drop('POSITION', axis = 1)

        df_position_mean = df_position.mean(axis=0, skipna = False)
    
        new_df = pd.DataFrame(columns = df_position_mean.index)

        for index in df_position_mean.index:
            new_df.loc[L[0], index] = df_position_mean[index]
        #new_df = new_df.reset_index(drop=True)
        df_position_mean = new_df
        columns = df_position_mean.columns


        if df_position_mean.empty:
            NaN_df.index = [i]
            df_position_mean = NaN_df
        df_position_mean.columns = 'HOME_' + 'POSITION_' + str(i) + '_' + df_position_mean.columns
        
        if i == 1:
            df_team_mean = df_position_mean
        else:
            df_team_mean = pd.concat([df_team_mean, df_position_mean], axis= 1)
    
    if k == 0:
        final_df_home_player = df_team_mean
    else:
        final_df_home_player = pd.concat([final_df_home_player, df_team_mean], axis = 0)

final_df_home_player.index = L
final_df_home_player.head()

Unnamed: 0_level_0,HOME_POSITION_1_PLAYER_ACCURATE_CROSSES_season_sum,HOME_POSITION_1_PLAYER_ACCURATE_PASSES_season_sum,HOME_POSITION_1_PLAYER_AERIALS_WON_season_sum,HOME_POSITION_1_PLAYER_ASSISTS_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_POSITION_1_PLAYER_BLOCKED_SHOTS_season_sum,HOME_POSITION_1_PLAYER_CLEARANCES_season_sum,HOME_POSITION_1_PLAYER_CLEARANCE_OFFLINE_season_sum,HOME_POSITION_1_PLAYER_DISPOSSESSED_season_sum,...,HOME_POSITION_4_PLAYER_SAVES_INSIDE_BOX_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_BLOCKED_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_ON_TARGET_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_TOTAL_5_last_match_std,HOME_POSITION_4_PLAYER_STARTING_LINEUP_5_last_match_std,HOME_POSITION_4_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,HOME_POSITION_4_PLAYER_TACKLES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_CROSSES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_DUELS_5_last_match_std,HOME_POSITION_4_PLAYER_YELLOWCARDS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.25,12.0,21.25,10.5,6.0,21.5,3.0,6.5,0.0,23.75,...,0.0,22.833333,13.0,27.333333,33.833333,41.666667,40.0,24.333333,48.166667,25.666667
1,4.0,12.333333,13.0,40.0,20.666667,32.666667,2.666667,9.666667,0.0,35.0,...,0.0,20.375,19.25,19.375,7.875,13.75,22.875,12.875,27.5,25.375
2,5.0,5.25,6.75,1.75,4.5,10.0,0.0,1.25,0.0,9.75,...,0.0,11.0,15.4,17.2,57.3,16.9,33.2,11.0,42.1,51.1
3,2.5,13.25,13.25,11.25,7.5,32.25,1.5,3.0,0.0,17.75,...,0.0,20.5,18.5,25.875,35.0,16.375,29.5,34.5,25.125,19.25
4,18.75,20.25,14.5,23.0,29.5,24.75,2.0,7.0,0.0,33.25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,3.0,19.666667,7.333333,5.333333,0.0,4.666667,0.0,2.666667,0.0,32.0,...,0.0,19.285714,12.428571,22.428571,31.0,10.428571,20.857143,15.571429,25.142857,49.0
12299,0.333333,10.333333,36.666667,5.333333,3.666667,28.0,6.0,7.333333,0.0,22.666667,...,,,,,,,,,,
12300,3.333333,6.5,5.166667,5.0,11.166667,14.0,2.833333,1.666667,0.0,22.333333,...,0.0,24.25,16.0,23.75,38.5,19.25,53.0,17.5,21.75,15.75
12301,,,,31.333333,,,,,,,...,,,,,54.833333,,,,,61.833333


In [232]:
L = train_away_player_statistics_drop2_df.index.drop_duplicates()

for k in range(len(L)):
    df = train_away_player_statistics_drop2_df[train_away_player_statistics_drop2_df.index == L[k]]

    for i in range(1,5):
        df_position = df[df['POSITION'] == i]
        df_position = df_position.drop('POSITION', axis = 1)


        df_position_mean = df_position.mean(axis=0, skipna = False)
    
        new_df = pd.DataFrame(columns = df_position_mean.index)

        for index in df_position_mean.index:
            new_df.loc[L[0], index] = df_position_mean[index]
        #new_df = new_df.reset_index(drop=True)
        df_position_mean = new_df
        columns = df_position_mean.columns


        if df_position_mean.empty:
            NaN_df.index = [i]
            df_position_mean = NaN_df
        df_position_mean.columns = 'HOME_' + 'POSITION_' + str(i) + '_' + df_position_mean.columns
        
        if i == 1:
            df_team_mean = df_position_mean
        else:
            df_team_mean = pd.concat([df_team_mean, df_position_mean], axis= 1)
    
    if k == 0:
        final_df_away_player = df_team_mean
    else:
        final_df_away_player = pd.concat([final_df_away_player, df_team_mean], axis = 0)

final_df_away_player.index = L
final_df_away_player.head()

Unnamed: 0_level_0,HOME_POSITION_1_PLAYER_ACCURATE_CROSSES_season_sum,HOME_POSITION_1_PLAYER_ACCURATE_PASSES_season_sum,HOME_POSITION_1_PLAYER_AERIALS_WON_season_sum,HOME_POSITION_1_PLAYER_ASSISTS_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_POSITION_1_PLAYER_BLOCKED_SHOTS_season_sum,HOME_POSITION_1_PLAYER_CLEARANCES_season_sum,HOME_POSITION_1_PLAYER_CLEARANCE_OFFLINE_season_sum,HOME_POSITION_1_PLAYER_DISPOSSESSED_season_sum,...,HOME_POSITION_4_PLAYER_SAVES_INSIDE_BOX_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_BLOCKED_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_ON_TARGET_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_TOTAL_5_last_match_std,HOME_POSITION_4_PLAYER_STARTING_LINEUP_5_last_match_std,HOME_POSITION_4_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,HOME_POSITION_4_PLAYER_TACKLES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_CROSSES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_DUELS_5_last_match_std,HOME_POSITION_4_PLAYER_YELLOWCARDS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.0,24.333333,12.166667,9.333333,4.0,6.166667,5.0,9.5,0.0,22.666667,...,0.0,20.2,12.0,18.8,28.0,17.8,26.2,9.0,33.4,34.2
1,5.0,14.0,11.0,6.666667,16.333333,38.666667,2.666667,0.666667,0.0,44.333333,...,0.0,14.333333,25.0,24.666667,56.0,18.333333,35.666667,23.333333,40.333333,43.166667
2,34.4,22.4,17.4,19.8,37.4,26.8,1.8,7.2,0.0,36.6,...,,,,,,,,,,
3,3.5,8.75,22.75,11.25,17.5,11.25,2.5,9.75,6.25,14.0,...,,,,,,,,,,
4,3.0,7.5,10.0,21.5,23.5,49.5,5.5,6.0,0.0,6.5,...,0.0,23.25,33.125,27.125,36.75,9.875,26.0,16.0,24.5,28.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,1.0,11.666667,19.333333,5.333333,20.666667,18.666667,0.0,4.333333,0.0,31.666667,...,0.0,22.571429,16.285714,23.142857,20.0,15.142857,17.0,22.0,24.142857,29.0
12299,3.333333,15.666667,19.333333,5.333333,14.666667,47.333333,4.0,5.0,0.0,33.666667,...,0.0,22.375,13.5,15.875,31.5,9.0,32.25,12.25,31.75,17.5
12300,5.25,13.75,19.0,10.0,16.75,28.0,5.5,4.25,0.0,42.25,...,,,,,,,,,,
12301,,,,17.6,,,,,,,...,,,,,,,,,,


##### Join the information about the players and the team together

In [233]:
#Attention inner
X_train_home = final_df_home_player.join(train_home_team_statistics_drop_df, on=['ID'], how='outer')

X_train_away = final_df_away_player.join(train_away_team_statistics_drop_df, on=['ID'], how='outer')

##### Join the information about the home and away team

In [234]:
X_train =  pd.concat([X_train_home,X_train_away],join='outer',axis=1)
Y_train = Y_train.loc[X_train.index]

X_train = X_train.replace({np.inf:np.nan,-np.inf:np.nan})

  X_train = X_train.replace({np.inf:np.nan,-np.inf:np.nan})


In [235]:
index_train = X_train.index

In [236]:
columns_name = X_train.columns

In [237]:
X_train.to_csv('X_train_without_imputer_V2.csv', index = True)

### Test Data processing

##### Remove some useless features

In [238]:
test_away_player_statistics_drop_df = test_away_player_statistics_df.drop(labels=['PLAYER_CAPTAIN_season_sum', 'PLAYER_LONG_BALLS_season_sum', 'PLAYER_LONG_BALLS_WON_season_sum', 'PLAYER_SHOTS_OFF_TARGET_season_sum', 'PLAYER_CAPTAIN_season_average', 'PLAYER_PUNCHES_season_average', 'PLAYER_LONG_BALLS_season_average', 'PLAYER_LONG_BALLS_WON_season_average', 'PLAYER_SHOTS_OFF_TARGET_season_average', 'PLAYER_CAPTAIN_season_std', 'PLAYER_PUNCHES_season_std', 'PLAYER_LONG_BALLS_season_std', 'PLAYER_LONG_BALLS_WON_season_std', 'PLAYER_SHOTS_OFF_TARGET_season_std', 'PLAYER_CAPTAIN_5_last_match_sum', 'PLAYER_PUNCHES_5_last_match_sum', 'PLAYER_LONG_BALLS_5_last_match_sum', 'PLAYER_LONG_BALLS_WON_5_last_match_sum', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_sum', 'PLAYER_CAPTAIN_5_last_match_average', 'PLAYER_PUNCHES_5_last_match_average', 'PLAYER_LONG_BALLS_5_last_match_average', 'PLAYER_LONG_BALLS_WON_5_last_match_average', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_average', 'PLAYER_CAPTAIN_5_last_match_std', 'PLAYER_PUNCHES_5_last_match_std', 'PLAYER_LONG_BALLS_5_last_match_std', 'PLAYER_LONG_BALLS_WON_5_last_match_std', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_std'], axis=1)

test_home_player_statistics_drop_df= test_home_player_statistics_df.drop(labels=['PLAYER_CAPTAIN_season_sum', 'PLAYER_LONG_BALLS_season_sum', 'PLAYER_LONG_BALLS_WON_season_sum', 'PLAYER_SHOTS_OFF_TARGET_season_sum', 'PLAYER_CAPTAIN_season_average', 'PLAYER_PUNCHES_season_average', 'PLAYER_LONG_BALLS_season_average', 'PLAYER_LONG_BALLS_WON_season_average', 'PLAYER_SHOTS_OFF_TARGET_season_average', 'PLAYER_CAPTAIN_season_std', 'PLAYER_PUNCHES_season_std', 'PLAYER_LONG_BALLS_season_std', 'PLAYER_LONG_BALLS_WON_season_std', 'PLAYER_SHOTS_OFF_TARGET_season_std', 'PLAYER_CAPTAIN_5_last_match_sum', 'PLAYER_PUNCHES_5_last_match_sum', 'PLAYER_LONG_BALLS_5_last_match_sum', 'PLAYER_LONG_BALLS_WON_5_last_match_sum', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_sum', 'PLAYER_CAPTAIN_5_last_match_average', 'PLAYER_PUNCHES_5_last_match_average', 'PLAYER_LONG_BALLS_5_last_match_average', 'PLAYER_LONG_BALLS_WON_5_last_match_average', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_average', 'PLAYER_CAPTAIN_5_last_match_std', 'PLAYER_PUNCHES_5_last_match_std', 'PLAYER_LONG_BALLS_5_last_match_std', 'PLAYER_LONG_BALLS_WON_5_last_match_std', 'PLAYER_SHOTS_OFF_TARGET_5_last_match_std'], axis=1)

In [239]:
test_home_team_statistics_drop_df = test_home_team_statistics_df
test_away_team_statistics_drop_df = test_away_team_statistics_df

test_home_team_statistics_drop_df.columns = 'HOME_' + test_home_team_statistics_drop_df.columns
test_away_team_statistics_drop_df.columns = 'AWAY_' + test_away_team_statistics_drop_df.columns

##### Encoding the position, league and team_name of each player

In [240]:
test_home_player_statistics_drop_df.loc[test_home_player_statistics_drop_df['POSITION'].isna(), 'POSITION'] = 'NAN'

enc_home_POSITION = LabelEncoder()
enc_home_POSITION.fit(test_home_player_statistics_drop_df['POSITION'])
encoded_home_POSITION = enc_home_POSITION.transform(test_home_player_statistics_drop_df['POSITION'])

In [241]:
test_home_player_statistics_drop_df['POSITION'] = encoded_home_POSITION

In [242]:
test_away_player_statistics_drop_df.loc[test_away_player_statistics_drop_df['POSITION'].isna(), 'POSITION'] = 'NAN'

enc_away_POSITION = LabelEncoder()
enc_away_POSITION.fit(test_away_player_statistics_drop_df['POSITION'])
encoded_away_POSITION = enc_away_POSITION.transform(test_away_player_statistics_drop_df['POSITION'])

In [243]:
test_away_player_statistics_drop_df['POSITION'] = encoded_away_POSITION

In [244]:
test_home_player_statistics_drop_df.loc[test_home_player_statistics_drop_df['POSITION']==0, 'POSITION'] = np.nan

test_away_player_statistics_drop_df.loc[test_away_player_statistics_drop_df['POSITION']==0, 'POSITION'] = np.nan

imputer = KNNImputer(n_neighbors=20)

test_home_player_statistics_drop_impute_df = imputer.fit_transform(test_home_player_statistics_drop_df)
test_away_player_statistics_drop_impute_df = imputer.fit_transform(test_away_player_statistics_drop_df)

In [245]:
test_home_player_statistics_drop2_df = test_home_player_statistics_drop_df
test_away_player_statistics_drop2_df = test_away_player_statistics_drop_df

##### Create a dataframe fill just with NaN

In [247]:
NaN_df_columns = test_home_player_statistics_drop2_df.drop(columns=['POSITION'])
NaN_df = pd.DataFrame(columns=NaN_df_columns.columns)
for k in range(len(test_home_player_statistics_drop2_df.columns)):
    NaN_df.loc[0] = np.nan

##### Mean of the statistics of the players of each team by position (goalkeeper, defender, miedfielder, attacker)

In [248]:
L = test_home_player_statistics_drop2_df.index.drop_duplicates()

for k in range(len(L)):
    df = test_home_player_statistics_drop2_df[test_home_player_statistics_drop2_df.index == L[k]]

    for i in range(1,5):
        df_position = df[df['POSITION'] == i]
        df_position = df_position.drop('POSITION', axis = 1)

        df_position_mean = df_position.mean(axis=0, skipna = False)
    
        new_df = pd.DataFrame(columns = df_position_mean.index)

        for index in df_position_mean.index:
            new_df.loc[L[0], index] = df_position_mean[index]
        #new_df = new_df.reset_index(drop=True)
        df_position_mean = new_df
        columns = df_position_mean.columns


        if df_position_mean.empty:
            NaN_df.index = [i]
            df_position_mean = NaN_df
        df_position_mean.columns = 'HOME_' + 'POSITION_' + str(i) + '_' + df_position_mean.columns
        
        if i == 1:
            df_team_mean = df_position_mean
        else:
            df_team_mean = pd.concat([df_team_mean, df_position_mean], axis= 1)
    
    if k == 0:
        final_df_home_player = df_team_mean
    else:
        final_df_home_player = pd.concat([final_df_home_player, df_team_mean], axis = 0)

final_df_home_player.index = L
final_df_home_player

Unnamed: 0_level_0,HOME_POSITION_1_PLAYER_ACCURATE_CROSSES_season_sum,HOME_POSITION_1_PLAYER_ACCURATE_PASSES_season_sum,HOME_POSITION_1_PLAYER_AERIALS_WON_season_sum,HOME_POSITION_1_PLAYER_ASSISTS_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_POSITION_1_PLAYER_BLOCKED_SHOTS_season_sum,HOME_POSITION_1_PLAYER_CLEARANCES_season_sum,HOME_POSITION_1_PLAYER_CLEARANCE_OFFLINE_season_sum,HOME_POSITION_1_PLAYER_DISPOSSESSED_season_sum,...,HOME_POSITION_4_PLAYER_SAVES_INSIDE_BOX_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_BLOCKED_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_ON_TARGET_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_TOTAL_5_last_match_std,HOME_POSITION_4_PLAYER_STARTING_LINEUP_5_last_match_std,HOME_POSITION_4_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,HOME_POSITION_4_PLAYER_TACKLES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_CROSSES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_DUELS_5_last_match_std,HOME_POSITION_4_PLAYER_YELLOWCARDS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12303,,,,,,,,,,,...,0.0,14.090909,10.909091,17.0,11.454545,14.363636,18.454545,17.818182,27.636364,14.636364
12304,5.666667,13.333333,21.666667,20.333333,21.333333,23.0,8.666667,4.333333,0.0,50.666667,...,,,,,,,,,,
12305,0.25,9.5,12.5,22.5,16.0,16.5,4.0,7.75,0.0,20.25,...,,,,,,,,,,
12306,15.0,13.666667,19.333333,23.666667,23.0,21.666667,1.333333,6.0,0.0,37.333333,...,0.0,24.0,14.285714,23.714286,56.0,21.714286,17.571429,12.142857,36.571429,21.428571
12307,10.25,8.25,18.25,9.25,21.75,21.75,1.0,2.75,0.0,16.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,0.0,2.0,6.5,0.0,0.0,0.0,0.0,2.5,0.0,2.0,...,,,,,,,,,,
37667,2.0,9.0,17.333333,33.333333,23.333333,30.0,1.666667,4.0,0.0,29.0,...,0.0,20.1,27.4,25.3,45.7,,24.0,15.6,25.7,12.6
37668,9.666667,11.0,7.666667,28.333333,12.333333,17.666667,4.333333,3.333333,0.0,26.0,...,0.0,22.285714,22.428571,31.857143,47.0,22.0,31.142857,11.714286,21.714286,16.571429
37669,0.0,6.5,8.5,0.0,5.0,0.0,0.0,0.5,0.0,18.0,...,,,,,,,,,,


In [249]:
L = test_away_player_statistics_drop2_df.index.drop_duplicates()

for k in range(len(L)):
    df = test_away_player_statistics_drop2_df[test_away_player_statistics_drop2_df.index == L[k]]

    for i in range(1,5):
        df_position = df[df['POSITION'] == i]
        df_position = df_position.drop('POSITION', axis = 1)


        df_position_mean = df_position.mean(axis=0, skipna = False)
    
        new_df = pd.DataFrame(columns = df_position_mean.index)

        for index in df_position_mean.index:
            new_df.loc[L[0], index] = df_position_mean[index]
        #new_df = new_df.reset_index(drop=True)
        df_position_mean = new_df
        columns = df_position_mean.columns


        if df_position_mean.empty:
            NaN_df.index = [i]
            df_position_mean = NaN_df
        df_position_mean.columns = 'HOME_' + 'POSITION_' + str(i) + '_' + df_position_mean.columns
        
        if i == 1:
            df_team_mean = df_position_mean
        else:
            df_team_mean = pd.concat([df_team_mean, df_position_mean], axis= 1)
    
    if k == 0:
        final_df_away_player = df_team_mean
    else:
        final_df_away_player = pd.concat([final_df_away_player, df_team_mean], axis = 0)

final_df_away_player.index = L
final_df_away_player

Unnamed: 0_level_0,HOME_POSITION_1_PLAYER_ACCURATE_CROSSES_season_sum,HOME_POSITION_1_PLAYER_ACCURATE_PASSES_season_sum,HOME_POSITION_1_PLAYER_AERIALS_WON_season_sum,HOME_POSITION_1_PLAYER_ASSISTS_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_POSITION_1_PLAYER_BLOCKED_SHOTS_season_sum,HOME_POSITION_1_PLAYER_CLEARANCES_season_sum,HOME_POSITION_1_PLAYER_CLEARANCE_OFFLINE_season_sum,HOME_POSITION_1_PLAYER_DISPOSSESSED_season_sum,...,HOME_POSITION_4_PLAYER_SAVES_INSIDE_BOX_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_BLOCKED_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_ON_TARGET_5_last_match_std,HOME_POSITION_4_PLAYER_SHOTS_TOTAL_5_last_match_std,HOME_POSITION_4_PLAYER_STARTING_LINEUP_5_last_match_std,HOME_POSITION_4_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,HOME_POSITION_4_PLAYER_TACKLES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_CROSSES_5_last_match_std,HOME_POSITION_4_PLAYER_TOTAL_DUELS_5_last_match_std,HOME_POSITION_4_PLAYER_YELLOWCARDS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12303,3.0,9.25,19.75,0.0,3.5,5.5,2.5,1.0,0.0,12.5,...,,,,,,,,,,
12304,3.5,19.5,16.5,27.5,38.0,14.5,2.5,10.0,0.0,35.5,...,0.0,15.4,22.0,23.2,43.4,17.3,30.3,14.6,39.5,21.1
12305,0.333333,6.333333,4.666667,7.333333,4.666667,6.333333,0.0,1.333333,0.0,18.333333,...,0.0,24.285714,8.714286,23.0,55.0,17.571429,31.285714,9.857143,30.0,7.142857
12306,9.5,7.5,4.5,3.5,5.0,6.5,1.0,0.5,0.0,13.0,...,0.0,14.125,18.375,27.0,35.0,13.375,24.5,16.75,27.375,34.0
12307,3.0,8.75,37.25,3.0,15.5,28.0,5.25,5.5,0.0,22.5,...,0.0,19.428571,20.428571,24.285714,22.0,23.0,25.428571,22.714286,33.714286,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,4.666667,17.833333,9.166667,16.666667,11.0,16.666667,5.5,3.166667,0.0,23.333333,...,0.0,8.333333,29.5,34.666667,22.166667,0.0,15.0,26.5,29.0,21.0
37667,0.5,6.75,22.0,5.0,2.5,15.0,3.75,4.25,0.0,15.5,...,0.0,28.888889,21.333333,32.555556,33.666667,,20.333333,17.888889,18.111111,38.111111
37668,4.5,8.75,23.0,21.0,13.0,17.0,1.5,4.5,0.0,19.75,...,,,,,,,,,,
37669,3.25,4.5,12.5,15.5,15.0,5.5,1.0,1.5,0.0,19.75,...,,,,,,,,,,


##### Join the information about the players and the team together

In [261]:
#Attention inner
X_test_home = final_df_home_player.join(test_home_team_statistics_drop_df, on=['ID'], how='outer')

X_test_away = final_df_away_player.join(test_away_team_statistics_drop_df, on=['ID'], how='outer')

In [262]:
X_test_home

Unnamed: 0_level_0,HOME_POSITION_1_PLAYER_ACCURATE_CROSSES_season_sum,HOME_POSITION_1_PLAYER_ACCURATE_PASSES_season_sum,HOME_POSITION_1_PLAYER_AERIALS_WON_season_sum,HOME_POSITION_1_PLAYER_ASSISTS_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_CREATED_season_sum,HOME_POSITION_1_PLAYER_BIG_CHANCES_MISSED_season_sum,HOME_POSITION_1_PLAYER_BLOCKED_SHOTS_season_sum,HOME_POSITION_1_PLAYER_CLEARANCES_season_sum,HOME_POSITION_1_PLAYER_CLEARANCE_OFFLINE_season_sum,HOME_POSITION_1_PLAYER_DISPOSSESSED_season_sum,...,HOME_TEAM_YELLOWCARDS_5_last_match_std,HOME_TEAM_REDCARDS_5_last_match_std,HOME_TEAM_OFFSIDES_5_last_match_std,HOME_TEAM_ATTACKS_5_last_match_std,HOME_TEAM_PENALTIES_5_last_match_std,HOME_TEAM_SUBSTITUTIONS_5_last_match_std,HOME_TEAM_BALL_SAFE_5_last_match_std,HOME_TEAM_DANGEROUS_ATTACKS_5_last_match_std,HOME_TEAM_INJURIES_5_last_match_std,HOME_TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12303,,,,,,,,,,,...,2.0,9.0,0.0,10.0,0.0,4.0,4.0,6.0,2.0,3.0
12304,5.666667,13.333333,21.666667,20.333333,21.333333,23.0,8.666667,4.333333,0.0,50.666667,...,9.0,10.0,3.0,1.0,8.0,3.0,8.0,1.0,0.0,2.0
12305,0.25,9.5,12.5,22.5,16.0,16.5,4.0,7.75,0.0,20.25,...,6.0,8.0,2.0,9.0,0.0,6.0,0.0,9.0,,0.0
12306,15.0,13.666667,19.333333,23.666667,23.0,21.666667,1.333333,6.0,0.0,37.333333,...,4.0,5.0,2.0,3.0,8.0,0.0,2.0,2.0,2.0,3.0
12307,10.25,8.25,18.25,9.25,21.75,21.75,1.0,2.75,0.0,16.0,...,5.0,0.0,0.0,7.0,5.0,0.0,1.0,6.0,7.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,0.0,2.0,6.5,0.0,0.0,0.0,0.0,2.5,0.0,2.0,...,2.0,6.0,2.0,1.0,5.0,6.0,6.0,5.0,2.0,0.0
37667,2.0,9.0,17.333333,33.333333,23.333333,30.0,1.666667,4.0,0.0,29.0,...,3.0,0.0,2.0,8.0,0.0,0.0,4.0,8.0,10.0,10.0
37668,9.666667,11.0,7.666667,28.333333,12.333333,17.666667,4.333333,3.333333,0.0,26.0,...,7.0,0.0,8.0,4.0,0.0,7.0,,2.0,,3.0
37669,0.0,6.5,8.5,0.0,5.0,0.0,0.0,0.5,0.0,18.0,...,5.0,0.0,4.0,7.0,8.0,5.0,6.0,4.0,5.0,4.0


##### Join the information about the home and away team

In [263]:
X_test =  pd.concat([X_test_home,X_test_away],join='outer',axis=1)

X_test = X_test.replace({np.inf:np.nan,-np.inf:np.nan})

  X_test = X_test.replace({np.inf:np.nan,-np.inf:np.nan})


In [252]:
columns_name = X_train.columns

In [253]:
X_test.to_csv('X_test_without_imputer_no_position_1.csv', index=True)

In [254]:
index_test = X_test.index
index_test

Index([12303, 12304, 12305, 12306, 12307, 12308, 12309, 12310, 12311, 12312,
       ...
       37661, 37662, 37663, 37664, 37665, 37666, 37667, 37668, 37669, 37670],
      dtype='int64', name='ID', length=25368)

### Data normalisation 

In [255]:
X_train = pd.read_csv('X_train_without_imputer.csv', index_col = 0)
X_test = pd.read_csv('X_test_without_imputer.csv', index_col = 0)

In [256]:
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_standard = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)

In [264]:
imputer = KNNImputer(n_neighbors=50)

X_train_standard_without_NaN = imputer.fit_transform(X_train_standard)
X_test_standard_without_NaN = imputer.fit_transform(X_test_standard)

In [265]:
X_train_standard_without_NaN_df = pd.DataFrame(X_train_standard_without_NaN, index = index_train)
X_test_standard_without_NaN_df = pd.DataFrame(X_test_standard_without_NaN, index = index_test)

In [266]:
X_train_standard_without_NaN_df.to_csv('X_train_standard_without_NaN_without_impute_at_beginning_V2.csv', index = True)
X_test_standard_without_NaN_df.to_csv('X_test_standard_without_NaN_without_impute_at_beginning_V2.csv', index = True)

pca = PCA(n_components=1000, random_state=42)

X_train_standard_without_NaN_PCA = pca.fit_transform()
X_test_standard_without_NaN_PCA = pca.transform()

### Model with the 3 outcome 

In [313]:
Y_train_new = pd.DataFrame(index=Y_train.index)

In [314]:
Y_train_new.loc[Y_train['HOME_WINS']==1, 'RESULT'] = 0
Y_train_new.loc[Y_train['DRAW']==1, 'RESULT'] = 1
Y_train_new.loc[Y_train['AWAY_WINS'] ==1, 'RESULT'] = 2
Y_train_new

Unnamed: 0_level_0,RESULT
ID,Unnamed: 1_level_1
0,2.0
1,1.0
2,2.0
3,0.0
4,1.0
...,...
12298,2.0
12299,2.0
12300,2.0
12301,0.0


### train, valid test split


In [316]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train_standard_without_NaN, Y_train_new, test_size=0.2, random_state=42)

In [317]:
X_train = pd.DataFrame(X_train)
#X_train.columns = columns_name
#X_valid.columns = columns_name
X_test = pd.DataFrame(X_test)
#X_test.columns = columns_name

In [318]:
y_train = y_train.to_numpy()
y_train = y_train.ravel()
y_train

array([1., 1., 0., ..., 0., 0., 1.])

### Gradient Boosting with sklearn

In [320]:
GB = HistGradientBoostingClassifier(learning_rate=0.025, max_depth=8, early_stopping=True)
GB.fit(X_train, y_train)

y_pred2 = GB.predict(X_test)
print(classification_report(y_test, y_pred2, digits=5))

              precision    recall  f1-score   support

         0.0    0.50842   0.83657   0.63246      1083
         1.0    0.32000   0.02581   0.04776       620
         2.0    0.47854   0.39710   0.43403       758

    accuracy                        0.49695      2461
   macro avg    0.43565   0.41982   0.37142      2461
weighted avg    0.45175   0.49695   0.42404      2461


### Submission

In [321]:
predictions = GB.predict(X_test_standard_without_NaN)
predictions = pd.DataFrame(predictions)
predictions

Unnamed: 0,0
0,0.0
1,2.0
2,0.0
3,0.0
4,0.0
...,...
25363,0.0
25364,0.0
25365,0.0
25366,0.0


In [322]:
submission = pd.DataFrame(index=predictions.index)
submission['HOME_WINS'] = 0
submission['DRAW'] = 0
submission['AWAY_WINS'] = 0

submission = submission.join(predictions)
submission.columns = ["HOME_WINS", "DRAW", "AWAY_WINS", 'INDEX']
submission.loc[submission['INDEX']==0, 'HOME_WINS'] = 1
submission.loc[submission['INDEX']==1, 'DRAW'] = 1
submission.loc[submission['INDEX']==2, 'AWAY_WINS'] = 1

submission = submission.drop(columns=['INDEX'])

Y = pd.read_csv('Y_test_random_sEE2QeA.csv', index_col=0)

submission.index = Y.index
submission.to_csv('submission.csv', index=True)

### AdaBoost Classifier

In [206]:
ABC = AdaBoostClassifier()
ABC.fit(X_train, y_train)

y_pred3 = ABC.predict(X_test)
print(classification_report(y_test, y_pred3))



              precision    recall  f1-score   support

         0.0       0.52      0.73      0.61      1083
         1.0       0.20      0.09      0.12       620
         2.0       0.46      0.41      0.43       758

    accuracy                           0.47      2461
   macro avg       0.39      0.41      0.39      2461
weighted avg       0.42      0.47      0.43      2461


### Bagging Classifier

In [207]:
Bg = BaggingClassifier()
Bg.fit(X_train, y_train)

y_pred3 = Bg.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

         0.0       0.49      0.66      0.56      1083
         1.0       0.24      0.17      0.20       620
         2.0       0.43      0.32      0.37       758

    accuracy                           0.43      2461
   macro avg       0.39      0.38      0.37      2461
weighted avg       0.41      0.43      0.41      2461


### RandomForest Classifier

In [208]:
RF = RandomForestClassifier()
RF.fit(X_train, y_train)

y_pred3 = RF.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

         0.0       0.46      0.88      0.60      1083
         1.0       0.31      0.04      0.07       620
         2.0       0.44      0.19      0.26       758

    accuracy                           0.45      2461
   macro avg       0.40      0.37      0.31      2461
weighted avg       0.42      0.45      0.37      2461


### MLP

In [209]:
MLP = MLPClassifier(hidden_layer_sizes=(200,100,20,), activation='logistic', solver='adam')
MLP.fit(X_train, y_train)

y_pred = MLP.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.51      0.54      0.52      1083
         1.0       0.28      0.25      0.26       620
         2.0       0.39      0.39      0.39       758

    accuracy                           0.42      2461
   macro avg       0.39      0.39      0.39      2461
weighted avg       0.41      0.42      0.42      2461
