In [51]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency

In [52]:
# Read train.csv (resolved relative to the current working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [53]:
# Display the column labels of df to see which fields are available for the tests below.
df.columns

Index(['season', 'date', 'league_id', 'league', 'Team 1', 'Team2', 'SPI1',
       'SPI2', 'proj_score1', 'proj_score2', 'importance1', 'importance2',
       'score1', 'score2', 'xg1', 'xg2', 'nsxg1', 'nsxg2', 'adj_score1',
       'adj_score2', 'Outcome'],
      dtype='object')

In [54]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,...,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,Outcome
0,2019,01/03/19,1979,Chinese Super League,Shandong Luneng,Guizhou Renhe,48.22,37.83,1.75,0.84,...,22.1,1.0,0.0,1.39,0.26,2.05,0.54,1.05,0.0,1
1,2019,01/03/19,1979,Chinese Super League,Shanghai Greenland,Shanghai SIPG,39.81,60.08,1.22,1.89,...,63.4,0.0,4.0,0.57,2.76,0.8,1.5,0.0,3.26,0
2,2019,01/03/19,1979,Chinese Super League,Guangzhou Evergrande,Tianjin Quanujian,65.59,39.99,2.58,0.62,...,28.8,3.0,0.0,0.49,0.45,1.05,0.75,3.15,0.0,1
3,2019,01/03/19,1979,Chinese Super League,Wuhan Zall,Beijing Guoan,32.25,54.82,1.1,1.79,...,58.9,0.0,1.0,1.12,0.97,1.51,0.94,0.0,1.05,0
4,2019,01/03/19,1979,Chinese Super League,Chongqing Lifan,Guangzhou RF,38.24,40.45,1.57,1.24,...,21.3,2.0,2.0,2.77,3.17,1.05,2.08,2.1,2.1,1


<div class="alert alert-block alert-success">
<h2>Hypothesis Testing 1: Relationship between Team Strength (SPI) and Match Outcome</h2>


### Null Hypothesis (H0): 
There is no relationship between the difference in SPI (SPI1 - SPI2) and the match outcome (home team win or away team win).
### Alternative Hypothesis (H1): 
The difference in SPI (SPI1 - SPI2) significantly influences the match outcome.<br>
<br>
This hypothesis test will help us understand whether the difference in team strength (as indicated by SPI) between the home and away teams has a significant impact on the match outcome.
</div>

In [55]:
# Home-minus-away SPI gap, stored on df as a new column.
df['SPI_difference'] = df['SPI1'].sub(df['SPI2'])

# Partition the SPI gap by the Outcome flag (1 vs 0).
# NOTE(review): the previewed row with a 2-2 score carries Outcome == 1, so the
# "wins" group may also contain draws — confirm how Outcome is encoded.
outcome_flag = df['Outcome']
home_team_wins = df.loc[outcome_flag == 1, 'SPI_difference']
home_team_losses = df.loc[outcome_flag == 0, 'SPI_difference']

In [56]:
# Two-sample independent t-test on the SPI gap of the two outcome groups
# (called with SciPy's default settings).
spi_ttest = stats.ttest_ind(home_team_wins, home_team_losses)
t_statistic, p_value = spi_ttest.statistic, spi_ttest.pvalue
print("T-statistic:", t_statistic)
print("P-value:", p_value)

T-statistic: 90.9810893414003
P-value: 0.0


In [57]:
# Compare the p-value from the preceding test against the significance level.
alpha = 0.05
decision_message = (
    f"Reject the null hypothesis (H0) for alpha value:{alpha}"
    if p_value < alpha
    else f"Fail to reject the null hypothesis (H0) for alpha value:{alpha}"
)
print(decision_message)

Reject the null hypothesis (H0) for alpha value:0.05


<div class="alert alert-block alert-success"> 
<h2> Hypothesis Testing 2: Impact of Importance on Home Team Performance:</h2>

### Null Hypothesis (H0): 
The average performance of home teams in high-importance matches (importance1 > median importance) is not significantly different from the average performance of home teams in low-importance matches (importance1 <= median importance).
### Alternative Hypothesis (H1): 
The average performance of home teams in high-importance matches is significantly different from the average performance of home teams in low-importance matches.
<br><br>
This hypothesis test will help us understand whether the importance of a match (as indicated by importance1) has a significant impact on the performance of home teams.
</div>

In [58]:
# Median of importance1; used below as the cut-off between high- and low-importance matches.
median_importance = df.loc[:, 'importance1'].median()
median_importance

25.2

In [59]:
# Rows strictly above the median form the high-importance set; rows at or below it the low set.
home_importance = df['importance1']
high_importance_matches = df.loc[home_importance.gt(median_importance)]
low_importance_matches = df.loc[home_importance.le(median_importance)]

In [60]:
# Independent t-test comparing the Outcome flag between the high- and low-importance matches.
importance_ttest = stats.ttest_ind(
    high_importance_matches['Outcome'],
    low_importance_matches['Outcome'],
)
t_statistic, p_value = importance_ttest.statistic, importance_ttest.pvalue
print("T-statistic:", t_statistic)
print("P-value:", p_value)

T-statistic: -1.3542293246346768
P-value: 0.17578440395515987


In [61]:
# Decision rule at the chosen significance level, using the p-value computed in the previous cell.
alpha = 0.05
if not p_value < alpha:
    print(f"Fail to reject the null hypothesis (H0) for alpha value:{alpha}")
else:
    print(f"Reject the null hypothesis (H0) for alpha value:{alpha}")

Fail to reject the null hypothesis (H0) for alpha value:0.05


<div class="alert alert-block alert-success"> 
<h2> Hypothesis Testing (Anova) </h2>
  
### Null Hypothesis (H0): 
The home-team win rate is the same across the different team-strength groups.
### Alternative Hypothesis (H1):
The home-team win rate is not the same across the different team-strength groups (at least one group differs).
</div>

In [62]:
# Summary statistics of SPI1 (quartiles spelled out explicitly).
df['SPI1'].describe(percentiles=[0.25, 0.5, 0.75])

count    7443.000000
mean       40.447416
std        18.367631
min         4.710000
25%        28.010000
50%        38.260000
75%        52.160000
max        93.990000
Name: SPI1, dtype: float64

<div class="alert alert-info">
Based on the SPI1 summary above, the teams are divided into groups as follows:<br>
1. Strong teams - SPI1>64<br>
2. Moderate teams - (SPI1 <= 64) & (SPI1 > 34)  <br>
3. Weak teams - SPI1<=34<br>
</div>    

In [63]:
# Bucket matches by SPI1: weak <= 34 < moderate <= 64 < strong.
weak_upper, moderate_upper = 34, 64
home_spi = df['SPI1']
strong_teams = df.loc[home_spi.gt(moderate_upper)]
moderate_teams = df.loc[home_spi.le(moderate_upper) & home_spi.gt(weak_upper)]
weak_teams = df.loc[home_spi.le(weak_upper)]

In [64]:
# One-way ANOVA of the Outcome flag across the three SPI1 strength groups.
outcome_samples = [
    group['Outcome'] for group in (strong_teams, moderate_teams, weak_teams)
]
f_statistic, p_value = f_oneway(*outcome_samples)
print("F-statistic:", f_statistic)
print("P-value:", p_value)

# Decision at the chosen significance level.
alpha = 0.05
print(
    f"Reject the null hypothesis (H0) for alpha value:{alpha}"
    if p_value < alpha
    else f"Fail to reject the null hypothesis (H0) for alpha value:{alpha}"
)

F-statistic: 146.74808204839005
P-value: 3.1121984050042306e-63
Reject the null hypothesis (H0) for alpha value:0.05


<div class="alert alert-block alert-success"> 
<h2> Hypothesis Testing 3  (Chi-Square Test) : Home Advantage Across Different Leagues </h2>

### Null Hypothesis (H0):
There is no difference in the proportion of home team wins across different football leagues.

### Alternative Hypothesis (H1): 
The proportion of home team wins varies significantly across different football leagues.
<br><br>
This hypothesis test will help us investigate whether there is a consistent "home advantage" phenomenon across different football leagues or if the home team's success rate differs significantly depending on the league.
</div>

In [65]:
# Work on an independent (deep) copy so relabelling Outcome below does not touch df.
df_chi = df.copy(deep=True)

In [66]:
# Relabel the Outcome flag as 'Yes' (1) / 'No' (0) for a readable contingency table.
# Series.replace builds a new column holding the labels, instead of writing strings
# into the existing numeric column in place via .loc — that incompatible-dtype
# assignment is deprecated in recent pandas and is slated to become an error.
# Values other than 1 and 0 are left untouched, matching the previous behaviour.
df_chi['Outcome'] = df_chi['Outcome'].replace({1: 'Yes', 0: 'No'})
df_chi.head(3)

Unnamed: 0,season,date,league_id,league,Team 1,Team2,SPI1,SPI2,proj_score1,proj_score2,...,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,Outcome,SPI_difference
0,2019,01/03/19,1979,Chinese Super League,Shandong Luneng,Guizhou Renhe,48.22,37.83,1.75,0.84,...,1.0,0.0,1.39,0.26,2.05,0.54,1.05,0.0,Yes,10.39
1,2019,01/03/19,1979,Chinese Super League,Shanghai Greenland,Shanghai SIPG,39.81,60.08,1.22,1.89,...,0.0,4.0,0.57,2.76,0.8,1.5,0.0,3.26,No,-20.27
2,2019,01/03/19,1979,Chinese Super League,Guangzhou Evergrande,Tianjin Quanujian,65.59,39.99,2.58,0.62,...,3.0,0.0,0.49,0.45,1.05,0.75,3.15,0.0,Yes,25.6


In [67]:
# Cross-tabulate league (rows) against the relabelled Outcome (columns) to get the observed counts.
contingency_table = pd.crosstab(index=df_chi["league"], columns=df_chi["Outcome"])
contingency_table

Outcome,No,Yes
league,Unnamed: 1_level_1,Unnamed: 2_level_1
AAL,36,125
APD,115,210
ATMB,43,65
BJL,58,104
BPL,69,91
Brasileiro Série A,98,282
Chinese Super League,61,179
Danish SAS-Ligaen,40,62
Dutch Eredivisie,36,108
English League Championship,84,180


In [68]:
# Chi-square test of independence on the league x Outcome contingency table.
independence_test = stats.chi2_contingency(contingency_table)
# The last two returned items (degrees of freedom, expected counts) are not used here.
chi2_statistic, p_value, _, _ = independence_test
print("Chi-square statistic:", chi2_statistic)
print("P-value:", p_value)

Chi-square statistic: 141.0524283476132
P-value: 9.122342411425732e-14


In [70]:
# Decide by comparing the chi-square p-value with the significance level.
alpha = 0.05
is_significant = p_value < alpha
print(
    f"Reject the null hypothesis (H0) for alpha value:{alpha}"
    if is_significant
    else f"Fail to reject the null hypothesis (H0) for alpha value:{alpha}"
)

Reject the null hypothesis (H0) for alpha value:0.05
