In [1]:
#pip install researchpy
import pandas as pd
import numpy as np
from scipy import stats
import scipy.stats.distributions as dist
import researchpy as rp

In [12]:
# Load the match-level dataset and print a per-column summary (dtype, non-null count).
# NOTE(review): the 'Unnamed: 0' column reported by info() looks like a previously
# saved index — consider read_csv(..., index_col=0); confirm against the CSV.
df = pd.read_csv("FinalData.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           1440 non-null   int64 
 1   Season               1440 non-null   object
 2   Date                 1440 non-null   object
 3   Home                 1440 non-null   object
 4   Away                 1440 non-null   object
 5   Attendance           1440 non-null   int64 
 6   Venue                1440 non-null   object
 7   URL                  1440 non-null   object
 8   HomeGoal             1440 non-null   int64 
 9   AwayGoal             1440 non-null   int64 
 10  HomePossession       1440 non-null   object
 11  HomePassesCompleted  1440 non-null   int64 
 12  HomePassesAttempts   1440 non-null   int64 
 13  HomeShotsonTarget    1440 non-null   int64 
 14  HomeShots            1440 non-null   int64 
 15  HomeFouls            1440 non-null   int64 
 16  HomeTa

In [13]:
def homeresult(c):
    """Classify a match from the home side's point of view.

    Args:
        c: Row-like mapping exposing 'HomeGoal' and 'AwayGoal'.

    Returns:
        'W' when the home side scored more, 'L' when it scored fewer,
        and 'T' otherwise.
    """
    home_goals, away_goals = c['HomeGoal'], c['AwayGoal']
    if home_goals > away_goals:
        return 'W'
    return 'L' if home_goals < away_goals else 'T'
# Label every match with the home side's result (row-wise apply).
df["HomeResult"] = df.apply(homeresult, axis="columns")

In [14]:
def varGames(c):
    """Tag a match as played with or without VAR, based on its season label.

    Args:
        c: Row-like mapping exposing 'Season'.

    Returns:
        'VAR' for seasons '17-18', '18-19', '19-20' and '20-21';
        'NoVAR' for any other season.
    """
    var_seasons = ('17-18', '18-19', '19-20', '20-21')
    return 'VAR' if c['Season'] in var_seasons else 'NoVAR'
# Flag each match as 'VAR' or 'NoVAR' from its season (row-wise apply).
df['VarPresent'] = df.apply(varGames, axis="columns")

In [15]:
def attendance(c):
    """Reduce a match's attendance figure to a two-level label.

    Args:
        c: Row-like mapping exposing 'Attendance'.

    Returns:
        "NoFans" when the attendance equals 0, "Fans" otherwise.
    """
    return "NoFans" if c['Attendance'] == 0 else "Fans"

# Binary attendance label ("Fans" / "NoFans") for every match (row-wise apply).
df['BiAttendance'] = df.apply(attendance, axis="columns")

In [16]:
# Derived per-match metrics.
# NOTE(review): despite its name, 'GoalDifferential' is the SUM of both sides'
# goals (total goals in the match), not a difference — confirm intent or rename.
df['GoalDifferential'] = df['HomeGoal'].add(df['AwayGoal'])
# Share of the home side's attempted passes that were completed.
df['HomePassAccuracy'] = df['HomePassesCompleted'].div(df['HomePassesAttempts'])
# Home tackles and interceptions combined into one count.
df['HomeTandInt'] = df['HomeTackles'].add(df['HomeInterceptions'])
# Weighted discipline index: each foul counts 1, each yellow card 2, each red card 4.
df['AwayFoulsandYRcards'] = df['AwayFouls'].add(df['AwayYellow'].mul(2)).add(df['AwayRed'].mul(4))
df['HomeFoulsandYRcards'] = df['HomeFouls'].add(df['HomeYellow'].mul(2)).add(df['HomeRed'].mul(4))

In [17]:
# Number of matches recorded for each season label.
df['Season'].value_counts()

16-17    306
18-19    306
19-20    306
17-18    306
20-21    216
Name: Season, dtype: int64

In [18]:
# Split the matches by VAR availability and by whether the attendance was non-zero.
NoVarGames = df.loc[df['VarPresent'].eq('NoVAR')]
VarGames = df.loc[df['VarPresent'].eq('VAR')]
Fans = df.loc[df['Attendance'].ne(0)]
NoFans = df.loc[df['Attendance'].eq(0)]

In [19]:
# Season breakdown of the matches played with zero attendance.
# NOTE(review): this is the same filter as `NoFans` from the previous cell.
No_Fans = df.query("Attendance == 0")
No_Fans['Season'].value_counts()

20-21    184
19-20     83
Name: Season, dtype: int64

In [20]:
# Match counts by attendance label (rows) and VAR availability (columns).
Table = pd.crosstab(df['BiAttendance'], df['VarPresent'])
Table

VarPresent,NoVAR,VAR
BiAttendance,Unnamed: 1_level_1,Unnamed: 2_level_1
Fans,306,867
NoFans,0,267


In [64]:
# Within VAR-season matches: per home team, how many games had fans vs. none.
Table = pd.crosstab(VarGames['Home'], VarGames['BiAttendance'])
Table

BiAttendance,Fans,NoFans
Home,Unnamed: 1_level_1,Unnamed: 2_level_1
Arminia,1,12
Augsburg,47,16
Bayern Munich,47,16
Dortmund,49,13
Düsseldorf,29,5
Eint Frankfurt,48,15
Freiburg,47,17
Hamburger SV,17,0
Hannover 96,34,0
Hertha BSC,49,14


## Measuring the Impact of VAR in Games with Fans

In [21]:
# Home result counts (rows) by VAR availability (columns), matches with fans only.
Table = pd.crosstab(Fans['HomeResult'], Fans['VarPresent'])
Table

VarPresent,NoVAR,VAR
HomeResult,Unnamed: 1_level_1,Unnamed: 2_level_1
L,82,267
T,74,215
W,150,385


In [22]:
# Same table as column proportions: each VAR group's results sum to 1.
pd.crosstab(Fans['HomeResult'], Fans['VarPresent']).apply(lambda col: col.div(col.sum()), axis="index")

VarPresent,NoVAR,VAR
HomeResult,Unnamed: 1_level_1,Unnamed: 2_level_1
L,0.267974,0.307958
T,0.24183,0.247982
W,0.490196,0.44406


In [23]:
# Pooled home-win rate over all matches played with fans; it is the common
# proportion used for the variance of the two-proportion z-test below.
total_proportion_Won = (Fans.HomeResult == "W").mean()

# Group sizes must be read from VarPresent. BiAttendance only ever holds
# "Fans"/"NoFans", so filtering it on "NoVAR"/"VAR" always produced 0.
num_NoVAR = Fans[Fans.VarPresent == "NoVAR"].shape[0]
num_VAR = Fans[Fans.VarPresent == "VAR"].shape[0]

# Large-sample condition for the normal approximation: the smaller group times
# the rarer outcome must exceed 10 (this implies all four n*p and n*(1-p) checks).
assert min(num_NoVAR, num_VAR) * min(total_proportion_Won, 1 - total_proportion_Won) > 10, "Assumption not met"

In [24]:
# Home-win share and number of matches for each VAR group (matches with fans).
prop = Fans.groupby("VarPresent")["HomeResult"].agg(
    [lambda outcomes: (outcomes == "W").mean(), "size"]
)
prop.columns = ["prop_won", "counts"]
prop.head()

Unnamed: 0_level_0,prop_won,counts
VarPresent,Unnamed: 1_level_1,Unnamed: 2_level_1
NoVAR,0.490196,306
VAR,0.44406,867


In [25]:
# Standard error of the difference in proportions under the pooled null:
# sqrt(p * (1 - p) * (1/n_VAR + 1/n_NoVAR)).
variance = total_proportion_Won * (1 - total_proportion_Won)
standard_error = np.sqrt(
    variance * (1 / prop.loc["VAR", "counts"] + 1 / prop.loc["NoVAR", "counts"])
)
print(standard_error)

0.033118294393471207


In [26]:
# Observed difference in home-win share (VAR minus NoVAR).
best_estimate = prop.loc["VAR", "prop_won"] - prop.loc["NoVAR", "prop_won"]
print(best_estimate)

# Hypothesised difference under the null, and the resulting z statistic.
h_est = 0
test_stat = (best_estimate - h_est) / standard_error
print(test_stat)

-0.046136101499423265
-1.39306997369159


In [27]:
# Two-tailed p-value: twice the standard-normal lower-tail area at -|z|.
pvalue = dist.norm.cdf(-abs(test_stat)) * 2
print("Computed P-value is", pvalue)
# Not a significant difference.

Computed P-value is 0.1635986319051237


In [30]:
# Independent t-test on 'GoalDifferential': VAR vs. NoVAR matches, fans present.
rp.ttest(
    group1=Fans.loc[Fans['VarPresent'] == 'VAR', 'GoalDifferential'], group1_name="VAR",
    group2=Fans.loc[Fans['VarPresent'] == 'NoVAR', 'GoalDifferential'], group2_name="NoVAR",
)

# Not a significant difference.

(   Variable       N      Mean        SD        SE  95% Conf.  Interval
 0       VAR   867.0  3.047290  1.679609  0.057043   2.935332  3.159247
 1     NoVAR   306.0  2.866013  1.762549  0.100758   2.667744  3.064282
 2  combined  1173.0  3.000000  1.702738  0.049716   2.902457  3.097543,
             Independent t-test    results
 0  Difference (VAR - NoVAR) =      0.1813
 1        Degrees of freedom =   1171.0000
 2                         t =      1.6022
 3     Two side test p value =      0.1094
 4    Difference < 0 p value =      0.9453
 5    Difference > 0 p value =      0.0547
 6                 Cohen's d =      0.1065
 7                 Hedge's g =      0.1065
 8             Glass's delta =      0.1079
 9                         r =      0.0468)

In [32]:
# Independent t-test on home pass accuracy: VAR vs. NoVAR matches, fans present.
rp.ttest(
    group1=Fans.loc[Fans['VarPresent'] == 'VAR', 'HomePassAccuracy'], group1_name="VAR",
    group2=Fans.loc[Fans['VarPresent'] == 'NoVAR', 'HomePassAccuracy'], group2_name="NoVAR",
)

# Significant difference: use only VAR games for the fan analysis.

(   Variable       N      Mean        SD        SE  95% Conf.  Interval
 0       VAR   867.0  0.772074  0.068681  0.002333   0.767496  0.776652
 1     NoVAR   306.0  0.750712  0.087308  0.004991   0.740891  0.760534
 2  combined  1173.0  0.766501  0.074547  0.002177   0.762231  0.770772,
             Independent t-test    results
 0  Difference (VAR - NoVAR) =      0.0214
 1        Degrees of freedom =   1171.0000
 2                         t =      4.3422
 3     Two side test p value =      0.0000
 4    Difference < 0 p value =      1.0000
 5    Difference > 0 p value =      0.0000
 6                 Cohen's d =      0.2887
 7                 Hedge's g =      0.2885
 8             Glass's delta =      0.3110
 9                         r =      0.1259)

In [33]:
# Independent t-test on home tackles + interceptions: VAR vs. NoVAR, fans present.
rp.ttest(
    group1=Fans.loc[Fans['VarPresent'] == 'VAR', 'HomeTandInt'], group1_name="VAR",
    group2=Fans.loc[Fans['VarPresent'] == 'NoVAR', 'HomeTandInt'], group2_name="NoVAR",
)

# Significant difference: use only VAR games for the fan analysis.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0       VAR   867.0  25.816609  7.088996  0.240755  25.344078  26.289140
 1     NoVAR   306.0  38.699346  8.975438  0.513092  37.689699  39.708994
 2  combined  1173.0  29.177323  9.493446  0.277188  28.633482  29.721164,
             Independent t-test    results
 0  Difference (VAR - NoVAR) =    -12.8827
 1        Degrees of freedom =   1171.0000
 2                         t =    -25.4077
 3     Two side test p value =      0.0000
 4    Difference < 0 p value =      0.0000
 5    Difference > 0 p value =      1.0000
 6                 Cohen's d =     -1.6894
 7                 Hedge's g =     -1.6884
 8             Glass's delta =     -1.8173
 9                         r =      0.5961)

In [48]:
# Independent t-test on the home discipline index: VAR vs. NoVAR, fans present.
rp.ttest(
    group1=Fans.loc[Fans['VarPresent'] == 'VAR', 'HomeFoulsandYRcards'], group1_name="VAR",
    group2=Fans.loc[Fans['VarPresent'] == 'NoVAR', 'HomeFoulsandYRcards'], group2_name="NoVAR",
)

# Significant difference: use only VAR games for the fan analysis.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0       VAR   867.0  16.698962  5.786864  0.196532  16.313227  17.084697
 1     NoVAR   306.0  17.633987  6.012830  0.343731  16.957603  18.310370
 2  combined  1173.0  16.942882  5.858483  0.171055  16.607273  17.278490,
             Independent t-test    results
 0  Difference (VAR - NoVAR) =     -0.9350
 1        Degrees of freedom =   1171.0000
 2                         t =     -2.4052
 3     Two side test p value =      0.0163
 4    Difference < 0 p value =      0.0082
 5    Difference > 0 p value =      0.9918
 6                 Cohen's d =     -0.1599
 7                 Hedge's g =     -0.1598
 8             Glass's delta =     -0.1616
 9                         r =      0.0701)

In [49]:
# Independent t-test on the away discipline index: VAR vs. NoVAR, fans present.
rp.ttest(
    group1=Fans.loc[Fans['VarPresent'] == 'VAR', 'AwayFoulsandYRcards'], group1_name="VAR",
    group2=Fans.loc[Fans['VarPresent'] == 'NoVAR', 'AwayFoulsandYRcards'], group2_name="NoVAR",
)

# Significant difference: use only VAR games for the fan analysis.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0       VAR   867.0  18.295271  5.836139  0.198206  17.906251  18.684291
 1     NoVAR   306.0  19.375817  5.715031  0.326707  18.732933  20.018701
 2  combined  1173.0  18.577153  5.821745  0.169982  18.243649  18.910657,
             Independent t-test    results
 0  Difference (VAR - NoVAR) =     -1.0805
 1        Degrees of freedom =   1171.0000
 2                         t =     -2.7995
 3     Two side test p value =      0.0052
 4    Difference < 0 p value =      0.0026
 5    Difference > 0 p value =      0.9974
 6                 Cohen's d =     -0.1861
 7                 Hedge's g =     -0.1860
 8             Glass's delta =     -0.1851
 9                         r =      0.0815)

## Measuring the Impact of Fans given VAR Results

In [50]:
# Home result counts (rows) by attendance label (columns), all matches.
Table = pd.crosstab(df['HomeResult'], df['BiAttendance'])
Table

BiAttendance,Fans,NoFans
HomeResult,Unnamed: 1_level_1,Unnamed: 2_level_1
L,349,100
T,289,71
W,535,96


In [51]:
# Same table as column proportions: each attendance group's results sum to 1.
pd.crosstab(df['HomeResult'], df['BiAttendance']).apply(lambda col: col.div(col.sum()), axis="index")

BiAttendance,Fans,NoFans
HomeResult,Unnamed: 1_level_1,Unnamed: 2_level_1
L,0.297528,0.374532
T,0.246377,0.265918
W,0.456095,0.359551


In [52]:
# Pooled home-win rate and the size of each attendance group.
# NOTE(review): these use every match in df (pre-VAR seasons included), while the
# t-tests further down compare within VarGames — confirm the intended population.
total_proportion_won = df['HomeResult'].eq("W").mean()

num_NoFans = len(df[df['BiAttendance'] == "NoFans"])
num_Fans = len(df[df['BiAttendance'] == "Fans"])

In [53]:
# Large-sample condition for the normal approximation: in both groups the
# expected number of home wins and of non-wins must exceed 10.
assert all(
    group_size * share > 10
    for group_size in (num_NoFans, num_Fans)
    for share in (total_proportion_won, 1 - total_proportion_won)
), "Assumption not met"

In [54]:
# Home-win share and number of matches for each attendance group.
prop = df.groupby("BiAttendance")["HomeResult"].agg(
    [lambda outcomes: (outcomes == "W").mean(), "size"]
)
prop.columns = ["prop_won", "counts"]
prop.head()

Unnamed: 0_level_0,prop_won,counts
BiAttendance,Unnamed: 1_level_1,Unnamed: 2_level_1
Fans,0.456095,1173
NoFans,0.359551,267


In [55]:
# Standard error of the difference in proportions under the pooled null:
# sqrt(p * (1 - p) * (1/n_Fans + 1/n_NoFans)).
variance = total_proportion_won * (1 - total_proportion_won)
standard_error = np.sqrt(
    variance * (1 / prop.loc["Fans", "counts"] + 1 / prop.loc["NoFans", "counts"])
)
print(standard_error)

0.03364364623082332


In [56]:
# Observed difference in home-win share (Fans minus NoFans).
best_estimate = prop.loc["Fans", "prop_won"] - prop.loc["NoFans", "prop_won"]
print(best_estimate)

# Hypothesised difference under the null, and the resulting z statistic.
h_est = 0
test_stat = (best_estimate - h_est) / standard_error
print(test_stat)

0.09654491987317643
2.8696330715998557


In [57]:
# Two-tailed p-value: twice the standard-normal lower-tail area at -|z|.
pvalue = dist.norm.cdf(-abs(test_stat)) * 2
print("Computed P-value is", pvalue)

# Significant difference.

Computed P-value is 0.0041094837257014535


In [46]:
# Distribution of 'GoalDifferential' values (rows) by attendance label (columns).
Table = pd.crosstab(df['GoalDifferential'], df['BiAttendance'])
Table

BiAttendance,Fans,NoFans
GoalDifferential,Unnamed: 1_level_1,Unnamed: 2_level_1
0,67,14
1,144,36
2,279,58
3,275,57
4,203,52
5,102,24
6,68,16
7,20,8
8,13,2
9,2,0


In [58]:
# Independent t-test on 'GoalDifferential': matches with fans vs. without (all of df).
rp.ttest(
    group1=df.loc[df['BiAttendance'] == 'Fans', 'GoalDifferential'], group1_name="Fans",
    group2=df.loc[df['BiAttendance'] == 'NoFans', 'GoalDifferential'], group2_name="NoFans",
)

# No difference.

(   Variable       N      Mean        SD        SE  95% Conf.  Interval
 0      Fans  1173.0  3.000000  1.702738  0.049716   2.902457  3.097543
 1    NoFans   267.0  3.067416  1.730733  0.105919   2.858869  3.275962
 2  combined  1440.0  3.012500  1.707559  0.044998   2.924231  3.100769,
               Independent t-test    results
 0  Difference (Fans - NoFans) =     -0.0674
 1          Degrees of freedom =   1438.0000
 2                           t =     -0.5821
 3       Two side test p value =      0.5606
 4      Difference < 0 p value =      0.2803
 5      Difference > 0 p value =      0.7197
 6                   Cohen's d =     -0.0395
 7                   Hedge's g =     -0.0395
 8               Glass's delta =     -0.0396
 9                           r =      0.0153)

In [61]:
# Independent t-test on home pass accuracy: fans vs. no fans, VAR-season matches only.
rp.ttest(
    group1=VarGames.loc[VarGames['BiAttendance'] == 'Fans', 'HomePassAccuracy'], group1_name="Fans",
    group2=VarGames.loc[VarGames['BiAttendance'] == 'NoFans', 'HomePassAccuracy'], group2_name="NoFans",
)
# No significant difference.

(   Variable       N      Mean        SD        SE  95% Conf.  Interval
 0      Fans   867.0  0.772074  0.068681  0.002333   0.767496  0.776652
 1    NoFans   267.0  0.780696  0.062949  0.003852   0.773111  0.788281
 2  combined  1134.0  0.774104  0.067447  0.002003   0.770174  0.778034,
               Independent t-test    results
 0  Difference (Fans - NoFans) =     -0.0086
 1          Degrees of freedom =   1132.0000
 2                           t =     -1.8284
 3       Two side test p value =      0.0677
 4      Difference < 0 p value =      0.0339
 5      Difference > 0 p value =      0.9661
 6                   Cohen's d =     -0.1280
 7                   Hedge's g =     -0.1279
 8               Glass's delta =     -0.1255
 9                           r =      0.0543)

In [62]:
# Independent t-test on home tackles + interceptions: fans vs. no fans, VAR-season matches only.
rp.ttest(
    group1=VarGames.loc[VarGames['BiAttendance'] == 'Fans', 'HomeTandInt'], group1_name="Fans",
    group2=VarGames.loc[VarGames['BiAttendance'] == 'NoFans', 'HomeTandInt'], group2_name="NoFans",
)
# Significant difference.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0      Fans   867.0  25.816609  7.088996  0.240755  25.344078  26.289140
 1    NoFans   267.0  27.569288  8.000345  0.489613  26.605278  28.533299
 2  combined  1134.0  26.229277  7.347892  0.218201  25.801154  26.657400,
               Independent t-test    results
 0  Difference (Fans - NoFans) =     -1.7527
 1          Degrees of freedom =   1132.0000
 2                           t =     -3.4241
 3       Two side test p value =      0.0006
 4      Difference < 0 p value =      0.0003
 5      Difference > 0 p value =      0.9997
 6                   Cohen's d =     -0.2397
 7                   Hedge's g =     -0.2395
 8               Glass's delta =     -0.2472
 9                           r =      0.1012)

In [59]:
# Independent t-test on the home discipline index: fans vs. no fans, VAR-season matches only.
rp.ttest(
    group1=VarGames.loc[VarGames['BiAttendance'] == 'Fans', 'HomeFoulsandYRcards'], group1_name="Fans",
    group2=VarGames.loc[VarGames['BiAttendance'] == 'NoFans', 'HomeFoulsandYRcards'], group2_name="NoFans",
)

# Significant difference in calls against the home team.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0      Fans   867.0  16.698962  5.786864  0.196532  16.313227  17.084697
 1    NoFans   267.0  18.101124  5.652290  0.345915  17.420045  18.782202
 2  combined  1134.0  17.029101  5.783688  0.171751  16.692115  17.366086,
               Independent t-test    results
 0  Difference (Fans - NoFans) =     -1.4022
 1          Degrees of freedom =   1132.0000
 2                           t =     -3.4807
 3       Two side test p value =      0.0005
 4      Difference < 0 p value =      0.0003
 5      Difference > 0 p value =      0.9997
 6                   Cohen's d =     -0.2436
 7                   Hedge's g =     -0.2435
 8               Glass's delta =     -0.2423
 9                           r =      0.1029)

In [60]:
# Independent t-test on the away discipline index: fans vs. no fans, VAR-season matches only.
rp.ttest(
    group1=VarGames.loc[VarGames['BiAttendance'] == 'Fans', 'AwayFoulsandYRcards'], group1_name="Fans",
    group2=VarGames.loc[VarGames['BiAttendance'] == 'NoFans', 'AwayFoulsandYRcards'], group2_name="NoFans",
)

# No significant difference in calls against the away team.

(   Variable       N       Mean        SD        SE  95% Conf.   Interval
 0      Fans   867.0  18.295271  5.836139  0.198206  17.906251  18.684291
 1    NoFans   267.0  18.093633  5.650424  0.345800  17.412779  18.774487
 2  combined  1134.0  18.247795  5.791110  0.171971  17.910378  18.585213,
               Independent t-test    results
 0  Difference (Fans - NoFans) =      0.2016
 1          Degrees of freedom =   1132.0000
 2                           t =      0.4973
 3       Two side test p value =      0.6191
 4      Difference < 0 p value =      0.6905
 5      Difference > 0 p value =      0.3095
 6                   Cohen's d =      0.0348
 7                   Hedge's g =      0.0348
 8               Glass's delta =      0.0345
 9                           r =      0.0148)