# Win Premier League Games


## Import Data and Required Packages

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Import the CSV Data as Pandas DataFrame

In [23]:
# Read the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../artifacts/data.csv')
# Preview the first rows.
df.head()

Unnamed: 0,season_start_year,kickoff_date,GW,id,team_h,team_a,train_score,label_1,label_X,label_2,...,player_diff_max_f_rolling_mean_6_impact_game_threat,player_diff_max_f_rolling_mean_6_impact_game_transfers_balance,player_diff_max_f_rolling_mean_6_impact_game_hourly_rate_goals_scored,player_diff_max_f_rolling_mean_6_impact_game_hourly_rate_assists,player_diff_max_f_rolling_max_6_impact_game_influence,player_diff_max_f_rolling_max_6_impact_game_creativity,player_diff_max_f_rolling_max_6_impact_game_threat,player_diff_max_f_rolling_max_6_impact_game_transfers_balance,player_diff_max_f_rolling_max_6_impact_game_hourly_rate_goals_scored,player_diff_max_f_rolling_max_6_impact_game_hourly_rate_assists
0,23,2023-08-25,3,26,7,12,score,0,0,0,...,,,,,,,,,,
1,23,2023-08-26,3,22,3,18,score,0,0,0,...,,,,,,,,,,
2,23,2023-08-26,3,21,1,10,score,0,0,0,...,,,,,,,,,,
3,23,2023-08-26,3,23,4,8,score,0,0,0,...,,,,,,,,,,
4,23,2023-08-26,3,27,9,20,score,0,0,0,...,,,,,,,,,,


### Set label

In [24]:
# Column used as the target throughout the rest of the notebook.
label = "label_1"
# The other label columns; they are dropped from df further down.
remove_labels = ['label_X', 'label_2']

### Check Null and Dtypes


In [25]:
# Summarise index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1439 entries, 0 to 1438
Columns: 319 entries, season_start_year to player_diff_max_f_rolling_max_6_impact_game_hourly_rate_assists
dtypes: float64(288), int64(27), object(4)
memory usage: 3.5+ MB


### Adjust data

In [26]:
# Keep only the rows marked "train", drop the listed columns together with the
# unused labels, and store the season as text so that it is picked up as a
# categorical (object) column further down.
df = (
    df[df["train_score"].eq("train")]
    .drop(columns=["rounds_left", "GW", 'id', 'team_h', 'team_a', 'train_score'] + remove_labels)
    .astype({'season_start_year': str})
)
df.shape

(1079, 311)

### Check Missing values

In [27]:
# Count missing values per column.
df.isna().sum()

season_start_year                                                       0
kickoff_date                                                            0
label_1                                                                 0
home                                                                    0
away                                                                    0
                                                                       ..
player_diff_max_f_rolling_max_6_impact_game_creativity                  0
player_diff_max_f_rolling_max_6_impact_game_threat                      0
player_diff_max_f_rolling_max_6_impact_game_transfers_balance           0
player_diff_max_f_rolling_max_6_impact_game_hourly_rate_goals_scored    0
player_diff_max_f_rolling_max_6_impact_game_hourly_rate_assists         0
Length: 311, dtype: int64

### Check Duplicates

In [28]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

### Divide features into lists based on type

In [29]:
# Partition the columns by dtype in a single pass: object columns are treated
# as categorical, every other dtype as numerical.
numeric_features, categorical_features = [], []
for column_name in df.columns:
    if df[column_name].dtype == 'O':
        categorical_features.append(column_name)
    else:
        numeric_features.append(column_name)

# Report the size and content of both groups.
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 307 numerical features : ['label_1', 'kickoff_year', 'kickoff_month', 'win_share_latest_5_games_overall_home_team', 'draw_share_latest_5_games_overall_home_team', 'loss_share_latest_5_games_overall_home_team', 'avg_goals_scored_latest_5_games_overall_home_team', 'avg_goals_conceded_latest_5_games_overall_home_team', 'win_share_latest_5_games_overall_away_team', 'draw_share_latest_5_games_overall_away_team', 'loss_share_latest_5_games_overall_away_team', 'avg_goals_scored_latest_5_games_overall_away_team', 'avg_goals_conceded_latest_5_games_overall_away_team', 'win_share_latest_5_games_home_home_team', 'draw_share_latest_5_games_home_home_team', 'loss_share_latest_5_games_home_home_team', 'avg_goals_scored_latest_5_games_home_home_team', 'avg_goals_conceded_latest_5_games_home_home_team', 'win_share_latest_5_games_away_away_team', 'draw_share_latest_5_games_away_away_team', 'loss_share_latest_5_games_away_away_team', 'avg_goals_scored_latest_5_games_away_away_team', 'avg_goals_c

### Checking the number of unique values of each column

In [30]:
# Number of distinct values in each column.
df.nunique()

season_start_year                                                          3
kickoff_date                                                             355
label_1                                                                    2
home                                                                      24
away                                                                      24
                                                                        ... 
player_diff_max_f_rolling_max_6_impact_game_creativity                  1075
player_diff_max_f_rolling_max_6_impact_game_threat                      1049
player_diff_max_f_rolling_max_6_impact_game_transfers_balance           1070
player_diff_max_f_rolling_max_6_impact_game_hourly_rate_goals_scored     211
player_diff_max_f_rolling_max_6_impact_game_hourly_rate_assists          205
Length: 311, dtype: int64

In [31]:
# For each categorical column, print the number of non-null `season_start_year`
# entries per level, most frequent level first.
for column_name in categorical_features:
    level_counts = df.groupby([column_name])['season_start_year'].count()
    print(level_counts.sort_values(ascending=False))
    print("\n")

season_start_year
19    360
21    360
20    359
Name: season_start_year, dtype: int64


kickoff_date
2022-05-22    10
2021-05-23    10
2020-07-26    10
2019-12-26     9
2020-01-01     9
              ..
2021-05-10     1
2020-03-09     1
2021-05-07     1
2021-04-30     1
2019-08-23     1
Name: season_start_year, Length: 355, dtype: int64


home
ARS    54
LEI    54
WHU    54
TOT    54
SOU    54
NEW    54
MUN    54
MCI    54
AVL    54
LIV    54
EVE    54
CRY    54
CHE    54
BUR    54
BHA    54
WOL    54
LEE    36
NOR    36
SHU    36
WAT    36
BRE    18
BOU    18
WBA    18
FUL    17
Name: season_start_year, dtype: int64


away
MUN    55
ARS    54
EVE    54
WHU    54
TOT    54
NEW    54
MCI    54
AVL    54
LIV    54
CRY    54
CHE    54
BUR    54
BHA    54
WOL    54
LEI    53
SOU    53
LEE    36
NOR    36
SHU    36
WAT    36
FUL    18
BRE    18
BOU    18
WBA    18
Name: season_start_year, dtype: int64




## Exploring Data

### Check statistics of data set

In [None]:
# Summary statistics of the data set.
df.describe()

### Correlation

In [None]:
import seaborn as sns

# Pairwise correlations between all numerical columns.
corr = df[numeric_features].corr()

# Diverging palette centred on zero so that negative and positive correlations
# get different hues.
palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=palette, square=True)

# Tilt the x tick labels; the trailing semicolon hides the returned label list.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right');

In [None]:
# The 20 columns with the highest correlation to label_1 (label_1 itself included).
corr['label_1'].sort_values(ascending=False).head(20)

### Plot features against target

In [None]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

def plot_feature_against_target(df, feature, target):
    """Plot and print the mean of ``target`` and the group size per level of ``feature``.

    String features, and numeric features with at most 13 distinct values, are
    grouped by their raw values. Any other numeric feature is first cut into
    bins whose edges are the 0/5/10/20/.../80/90/95/100th percentiles of the
    feature. Features of any other dtype produce no chart and no printout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both ``feature`` and ``target``.
    feature : str
        Name of the column to group by.
    target : str
        Name of the column that is averaged within each group.

    Returns
    -------
    None
        A bar chart is drawn (``mean`` on the primary y-axis, ``n`` on the
        secondary y-axis) and the aggregated table is printed.
    """
    values = df[feature]
    few_levels = is_numeric_dtype(values) and len(values.unique()) <= 13

    # String names instead of np.mean / np.size: handing NumPy callables to
    # GroupBy.agg is deprecated in recent pandas, the names yield the same table.
    aggregations = {"mean": (target, "mean"), "n": (target, "size")}

    summary = None
    if is_string_dtype(values) or few_levels:
        summary = df.groupby(feature).agg(**aggregations)
        # Raw levels are printed ordered by their average target.
        printable = summary.sort_values('mean')

    elif is_numeric_dtype(values):
        edges = np.nanpercentile(values, [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100])
        # Repeated percentiles would give pd.cut duplicate edges; keep the
        # first occurrence of each value (order is preserved).
        edges = list(dict.fromkeys(edges.tolist()))
        # Widen the outer edges to whole numbers.
        edges[0] = np.floor(edges[0])
        edges[-1] = np.ceil(edges[-1])

        bin_column = feature + '_bins'
        binned = pd.DataFrame({feature: values, target: df[target]})
        binned[bin_column] = pd.cut(pd.to_numeric(values), edges, include_lowest=True)

        # observed=False pins the behaviour of keeping every bin in the table,
        # independent of the pandas version's default for categorical groupers.
        summary = binned.groupby(bin_column, observed=False).agg(**aggregations)
        printable = summary

    if summary is not None:
        summary.plot(figsize=(15, 7.5), kind='bar', secondary_y='n', rot=0, title=feature)
        print(printable)

    plt.show()

In [None]:
# Chart every feature against the target, categorical columns first.
features = [*categorical_features, *numeric_features]
for feature_name in features:
    plot_feature_against_target(df, feature_name, label)

# Test significance

In [32]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from scipy.stats import f_oneway, kruskal

def test_significant(df, feature, target):
    
    if is_string_dtype(df[feature]) or (is_numeric_dtype(df[feature]) and len(df[feature].unique()) <= 13):

        df_temp = pd.DataFrame({"feature": df[feature], "target": df[target]})

        # Perform one-way ANOVA
        model = ols('target ~ feature', data=df_temp).fit()
        anova_table = sm.stats.anova_lm(model)

        result = pd.DataFrame({"feature": [feature], "PR(>F)": [anova_table['PR(>F)'][0]]})
        
    elif is_numeric_dtype(df[feature]):

        bins = np.nanpercentile(df[feature], [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100])
        bins = [i for n, i in enumerate(bins) if i not in bins[:n]]
        bins[0] = np.floor(bins[0])
        bins[-1] = np.ceil(bins[-1])
                
        df_temp = pd.DataFrame({"feature": df[feature], "target": df[target]})

        # Perform one-way ANOVA
        model = ols('target ~ feature', data=df_temp).fit()
        anova_table = sm.stats.anova_lm(model)

        result = pd.DataFrame({"feature": [feature], "PR(>F)": [anova_table['PR(>F)'][0]]})
    
    return result


In [33]:
# Test every feature against the target, categorical columns first.
features = categorical_features + numeric_features

# Gather the single-row results in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies the growing frame on every pass.
p_value_frames = []
for p in features:
    p_value_frames.append(test_significant(df, p, label))

# pd.concat rejects an empty list, so fall back to an empty frame.
p_values = pd.concat(p_value_frames) if p_value_frames else pd.DataFrame()

In [39]:
# Lift pandas' display limits so that all rows and untruncated column text are shown.
# These options are global and stay in effect for the rest of the session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Rank the features from smallest to largest p-value with a fresh 0..n-1 index.
p_values.sort_values("PR(>F)").reset_index(drop=True)

Unnamed: 0,feature,PR(>F)
0,label_1,0.0
1,player_diff_max_all_rolling_max_6_impact_game_value,1.240143e-29
2,player_diff_max_all_rolling_mean_6_impact_game_value,6.552547000000001e-23
3,player_diff_max_g_rolling_max_6_impact_game_value,9.509876999999999e-20
4,player_home_max_all_rolling_max_6_impact_game_value,2.202461e-19
5,player_diff_max_all_rolling_max_6_impact_game_selected,1.346636e-16
6,player_diff_max_m_rolling_mean_6_impact_game_threat,2.35328e-16
7,player_diff_max_all_rolling_mean_6_impact_game_selected,6.488352e-16
8,player_home_max_all_rolling_mean_6_impact_game_value,7.553068e-16
9,home,2.538103e-15


Unnamed: 0,feature,PR(>F)
0,player_diff_max_f_rolling_mean_6_impact_game_t...,0.432665
