In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from numpy import random
from datetime import datetime
from math import nan
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
import statsmodels.api as sm
import scipy.stats as stats

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning for the rest of the session, including
# pandas chained-assignment warnings and library deprecation warnings raised by later
# cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Disable truncation of displayed frames: None means "no limit" for both rows and columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set_style("whitegrid")

In [4]:
# Load the player rating table: one row per player_api_id and date, with the rating groups
# passing, shooting, defence, goalkeeping, offence_misc and movement (see the preview below).
player_group_date = pd.read_csv('../dataset/player_group_date.csv')

In [5]:
# Preview the first rows of the player rating table.
player_group_date.head()

Unnamed: 0,player_api_id,date,passing,shooting,defence,goalkeeping,offence_misc,movement
0,2625,2007,59.67,54.125,61.25,16.25,56.0,62.67
1,291219,2007,38.33,63.75,24.5,10.25,60.0,60.67
2,27421,2007,61.67,49.75,70.0,18.5,69.0,57.0
3,45464,2007,58.0,31.5,65.25,13.0,63.0,59.67
4,182107,2007,38.67,28.5,43.25,17.5,38.0,60.33


In [6]:
# Load the two line-up frames (one per CSV); each is later passed to add_stats().
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# '../dataset/matches_h_XY.csv' — both files must be present (presumably produced by an
# earlier step outside this notebook) before the notebook can run end to end.
matches_h_XY = pd.read_csv('../dataset/matches_h_XY.csv')
matches_a_XY = pd.read_csv('../dataset/matches_a_XY.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/matches_h_XY.csv'

In [None]:
# Load the results table; it is left-merged onto the rating features by match_api_id below.
# Presumably this is where the 'result' target column comes from — verify against the CSV.
matches_r = pd.read_csv('../dataset/results.csv')

In [None]:
# Preview the first rows of the frame loaded from matches_h_XY.csv.
matches_h_XY.head()

In [None]:
def add_stats(df, player_stats=None):
    """Attach player ratings to a line-up frame and average them per match and position.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player appearance. Must contain ``player_num``, ``X`` and ``Y``
        (dropped here) plus ``date``, ``player_api_id``, ``match_api_id`` and ``Position``.
    player_stats : pandas.DataFrame, optional
        Rating table keyed by ``player_api_id`` and ``date``. Defaults to the
        notebook-level ``player_group_date``. Every column from the third one onward
        is treated as a rating column.

    Returns
    -------
    pandas.DataFrame
        One row per ``match_api_id``: a ``match_api_id`` column followed by one column
        per (rating, Position) pair, labelled with that tuple and holding the mean
        rating rounded to 2 decimals. Pairs absent from a match are NaN.
    """
    if player_stats is None:
        player_stats = player_group_date

    # Rating columns are identified positionally, as in the rating table's layout
    # (player_api_id, date, <ratings...>).
    rating_cols = list(player_stats.columns[2:])

    lineup = df.drop(columns=['player_num', 'X', 'Y']).sort_values(['date'], ascending=True)

    # merge_asof requires BOTH frames to be sorted on the `on` key. A stable sort leaves
    # an already-sorted rating table in its original row order.
    ratings = player_stats.sort_values('date', kind='stable')

    # For every appearance, take the rating row of the same player whose date is nearest.
    merged = pd.merge_asof(lineup, ratings, on='date', by='player_api_id', direction='nearest')
    merged = merged.drop(columns=['player_api_id', 'date'])

    position_means = merged.groupby(['match_api_id', 'Position']).mean().round(2).reset_index()

    wide = pd.pivot(position_means, index='match_api_id', columns='Position', values=rating_cols)
    # Collapse the (rating, Position) MultiIndex into a flat index of tuples.
    wide.columns = wide.columns.to_flat_index()
    return wide.reset_index()

In [None]:
# Per-match, per-position mean ratings for the frame loaded from matches_h_XY.csv.
matches_h_stats = add_stats(matches_h_XY)

In [None]:
# Per-match, per-position mean ratings for the frame loaded from matches_a_XY.csv.
matches_a_stats = add_stats(matches_a_XY)

In [None]:
# Preview the aggregated ratings; columns other than match_api_id are (rating, Position) tuples.
matches_h_stats.head()

In [None]:
# Position codes, in the order their rating box plots are drawn.
position_list = [
    'ST',
    'W',
    'MF',
    'CB',
    'SB',
    'GK',
]

In [None]:
# One figure per position: distribution of each rating group across the matches in
# matches_h_stats.
for position in position_list:
    # Rating columns are labelled (rating, Position) tuples. Match the position exactly
    # (not by substring of the stringified label) and relabel each column with its rating
    # name, so the tick labels always follow the real column order instead of a
    # hard-coded list.
    is_position_col = [isinstance(col, tuple) and col[1] == position
                       for col in matches_h_stats.columns]
    position_df = matches_h_stats.loc[:, is_position_col].copy()
    position_df.columns = [col[0] for col in position_df.columns]
    position_melt = pd.melt(position_df)

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x='variable', y='value', data=position_melt, palette='Set2',
                fliersize=0.5, linewidth=0.5, ax=ax)
    ax.set_xlabel('Group')
    ax.set_ylabel('Rating distribution')
    ax.set_title(str(position) + ' rating')
    plt.show()

In [None]:
# Inner-join the two rating frames on the match id. Column labels present in both frames
# receive pandas' default '_x' (left frame) and '_y' (right frame) suffixes.
matches_combined = matches_h_stats.merge(matches_a_stats, on='match_api_id')

In [None]:
column_names = matches_combined.columns


def _flatten_label(label):
    """Apply the label clean-up replacements, in order, to one column name.

    Strips quotes and parentheses, turns ', ' into '_', and rewrites the merge suffixes
    '_x' / '_y' as '_home' / '_away'. Presumably the incoming labels look like
    "('passing', 'GK')_x", which becomes "passing_GK_home" — verify against the merged frame.
    """
    for old, new in (("'", ''), (', ', '_'), ('_x', '_home'), ('_y', '_away'),
                     ('(', ''), (')', '')):
        label = label.replace(old, new)
    return label


new_column_names = [_flatten_label(name) for name in matches_combined.columns]

matches_combined.columns = new_column_names

In [None]:
# Left-join the results table so every row of matches_combined is kept; matches without a
# row in matches_r get NaN in the columns contributed by matches_r.
matches_combined_all = matches_combined.merge(matches_r, on='match_api_id', how='left')

In [None]:
# Preview the merged ratings + results frame.
matches_combined_all.head()

In [None]:
# Count missing values per column before incomplete rows are dropped.
matches_combined_all.isna().sum()

In [None]:
# Keep only complete rows: any row with at least one missing value is removed.
matches_combined_all = matches_combined_all.dropna(axis=0, how='any')

In [None]:
# List the column names available for building the advantage features.
matches_combined_all.columns

In [None]:
# Definition of each "advantage" feature as
#     name = (numerator columns, denominator columns, multiplier)
# The feature is sum(numerator) / sum(denominator) * multiplier. In every entry the
# multiplier equals len(denominator) / len(numerator), which turns the ratio of sums into
# a ratio of means.
advantage_pair = dict(
    # Shooting vs. defending / goalkeeping
    offence_box_adv=(
        ['shooting_ST_home', 'shooting_W_home'],
        ['defence_CB_away', 'defence_SB_away'],
        1,
    ),
    offence_GK_adv=(
        ['shooting_ST_home', 'shooting_W_home'],
        ['goalkeeping_GK_away'],
        0.5,
    ),
    defence_box_adv=(
        ['defence_CB_home', 'defence_SB_home'],
        ['shooting_ST_away', 'shooting_W_away'],
        1,
    ),
    defence_GK_adv=(
        ['goalkeeping_GK_home'],
        ['shooting_ST_away', 'shooting_W_away'],
        2,
    ),
    # Passing / movement vs. defending
    offence_pass_adv=(
        ['passing_W_home', 'passing_MF_home', 'passing_SB_home'],
        ['defence_CB_away', 'defence_SB_away'],
        2/3,
    ),
    offence_move_adv=(
        ['movement_W_home'],
        ['movement_SB_away'],
        1,
    ),
    defence_pass_adv=(
        ['defence_CB_home', 'defence_SB_home'],
        ['passing_W_away', 'passing_MF_away', 'passing_SB_away'],
        1.5,
    ),
    defence_move_adv=(
        ['movement_SB_home'],
        ['movement_W_away'],
        1,
    ),
    # Miscellaneous offence, home vs. away
    offence_misc_adv=(
        ['offence_misc_MF_home', 'offence_misc_ST_home', 'offence_misc_W_home'],
        ['offence_misc_MF_away', 'offence_misc_ST_away', 'offence_misc_W_away'],
        1,
    ),
)

In [None]:
# Start the feature table from the match id and the target column. Take an explicit copy:
# new columns are assigned onto matches_adv afterwards, and assigning onto a selection of
# matches_combined_all is a chained assignment that pandas warns about.
matches_adv = matches_combined_all[['match_api_id', 'result']].copy()

In [None]:
# Compute every advantage feature: row-wise sum of the numerator ratings divided by the
# row-wise sum of the denominator ratings, scaled by the entry's multiplier.
for adv_name, (num_cols, den_cols, scale) in advantage_pair.items():
    numerator = matches_combined_all[num_cols].sum(axis=1)
    denominator = matches_combined_all[den_cols].sum(axis=1)
    matches_adv[adv_name] = numerator.div(denominator).mul(scale)

In [None]:
# Count missing values per feature; a NaN can arise here when both sums of a ratio are 0.
matches_adv.isna().sum()

In [None]:
# Modelling frame: the target plus the advantage features, without the match identifier.
matches_adv_2 = matches_adv.drop(columns='match_api_id')

In [None]:
# Feature matrix: every column of the modelling frame except the target.
X = matches_adv_2.drop(columns='result')

In [None]:
# Target vector: the 'result' column of the modelling frame.
y = matches_adv_2['result']

In [None]:
# List the columns of the modelling frame.
matches_adv_2.columns

In [None]:
# Preview the first rows of the modelling frame.
matches_adv_2.head()

In [None]:
# Box plots of every advantage feature, split by match result, on one shared grid.
sns.set_palette("Paired")

# Result categories, in order of first appearance in the data.
categories = matches_adv_2['result'].unique()

# Select the features by name instead of by position, so the plot does not depend on
# 'result' being the first column of the frame.
feature_columns = [column for column in matches_adv_2.columns if column != 'result']

# Size the grid from the number of features (5 panels per row) rather than hard-coding
# 2 x 5; with up to 10 features this yields the same 12 x 8 inch, two-row figure.
grid_cols = 5
grid_rows = max(1, int(np.ceil(len(feature_columns) / grid_cols)))
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(12, 4 * grid_rows), squeeze=False)
panels = axes.ravel()

for ax, column in zip(panels, feature_columns):
    # One box per result category for this feature.
    boxplot_data = [matches_adv_2.loc[matches_adv_2['result'] == category, column]
                    for category in categories]
    ax.boxplot(boxplot_data, labels=categories, flierprops={'markersize': 1})
    ax.set_xlabel(column)

# Hide grid slots that have no feature to show.
for ax in panels[len(feature_columns):]:
    ax.set_visible(False)

# Set the main title for the entire plot
fig.suptitle('Subplots of Box Plots')

# Adjust the spacing between subplots, leaving room for the title
fig.tight_layout(rect=[0, 0, 0.85, 0.95])

# Display the plot
plt.show()

In [None]:
# Absolute z-score of every feature value (column mean, sample standard deviation).
z_scores = X.sub(X.mean()).div(X.std()).abs()
# A value is flagged as an outlier when it lies more than 4 standard deviations from the mean.
outliers = z_scores.gt(4)
# Rows of the modelling frame with at least one flagged feature.
outliers_rows = matches_adv_2.loc[outliers.any(axis=1)]

In [None]:
# (number of rows with at least one outlier, number of columns)
outliers_rows.shape

In [None]:
# Number of flagged values per feature column.
outlier_counts = outliers.sum()

fig, ax = plt.subplots(figsize=(8, 4))
outlier_counts.plot(kind='barh', ax=ax)
# Horizontal bars: the counts run along x and the column names along y, so the axis
# labels follow that orientation.
ax.set_xlabel('Number of Outliers')
ax.set_ylabel('Columns')
ax.set_title('Number of Outliers in Each Column')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the advantage features.
fig, ax = plt.subplots(figsize=(10, 8))
feature_corr = X.corr()
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(feature_corr, annot=True, cmap=diverging_cmap, ax=ax)

In [None]:
# Rebuild the feature matrix and the target from the modelling frame.
X = matches_adv_2.drop(columns='result')

y = matches_adv_2['result']

In [None]:
# Add an intercept column to the features.
X_sm = sm.add_constant(X)

# Numeric encoding of the outcome.
# NOTE(review): 'tie' is encoded as 0.5, so this is a fractional-response logit rather
# than a binary one — confirm that is the intended model (an ordered or multinomial model
# would treat the three outcomes as separate classes).
label_mapping = {'lose':0, 'tie':0.5, 'win':1}

y_sm = y.map(label_mapping)

# Series.map turns any label missing from label_mapping into NaN; fail with a clear
# message instead of handing NaN targets to the model.
unmapped_labels = y[y_sm.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "result labels missing from label_mapping: {}".format(list(unmapped_labels)))

log_reg = sm.Logit(y_sm, X_sm).fit()
print(log_reg.summary2())

In [None]:
# Split the modelling frame by match result.
Win = matches_adv_2.loc[matches_adv_2['result'].eq('win')]
Tie = matches_adv_2.loc[matches_adv_2['result'].eq('tie')]
Lose = matches_adv_2.loc[matches_adv_2['result'].eq('lose')]

In [None]:
# Overlaid Win vs. Lose distributions (histogram + KDE) for every feature, 4 panels per row.
num_cols = 4
num_rows = int(np.ceil(len(X.columns) / num_cols))

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(16, 24))
fig.subplots_adjust(hspace=0.5)

axes = axes.flatten()

# (frame, legend label, histogram colour, KDE line colour) for each outcome that is drawn.
outcome_styles = [
    (Win, 'Win', 'lightblue', 'darkblue'),
    (Lose, 'Lose', 'tan', 'goldenrod'),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; the documented
# replacement is sns.histplot(..., kde=True, stat='density'). Not switched here because
# the installed seaborn version is not visible from this notebook.
for ax, col in zip(axes, X.columns):
    for frame, label, hist_color, kde_color in outcome_styles:
        sns.distplot(frame[col], hist=True, kde=True,
                     kde_kws={'linewidth': 1, 'color': kde_color}, bins=20,
                     color=hist_color, label=label, ax=ax)
    ax.set_title(col)
    ax.legend()

# The grid can have more slots than features; hide the unused ones instead of
# rendering empty axes.
for ax in axes[len(X.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Persist the modelling frame (target + advantage features) as CSV, without the row index.
matches_adv_2.to_csv('../dataset/model_df2.csv', index = False)