In [1]:
# Import necessary tools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# First, grab tables from other notebooks
%store -r awayTable
%store -r homeTable
%store -r awayMatchup
%store -r homeMatchup
%store -r features

In [3]:
features.pop(0)
features.pop(1)

'surface'

In [4]:
features.append('pf')

In [5]:
# Look at correlation matrix for home teams and away teams
corr = homeMatchup[features].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,stDvoa,offMatchup,offDvoaMatchup,ovrMatchup,runMatchup,ptsMatchup,passMatchup,pblkMatchup,totalDvoaMatchup,pf
stDvoa,1.0,0.0124472,0.0792095,0.0514683,0.00797092,0.114562,0.0353229,0.0146837,0.185425,0.0854604
offMatchup,0.0124472,1.0,0.132368,0.693633,0.561324,0.64654,0.762152,0.295714,0.473857,0.235993
offDvoaMatchup,0.0792095,0.132368,1.0,0.159731,-0.0768149,0.175412,0.216242,0.0445729,0.16848,0.172356
ovrMatchup,0.0514683,0.693633,0.159731,1.0,0.318868,0.617805,0.610908,0.22817,0.695742,0.233422
runMatchup,0.00797092,0.561324,-0.0768149,0.318868,1.0,0.283606,0.157335,-0.014675,0.187224,0.0796019
ptsMatchup,0.114562,0.64654,0.175412,0.617805,0.283606,1.0,0.638371,0.154357,0.587692,0.285279
passMatchup,0.0353229,0.762152,0.216242,0.610908,0.157335,0.638371,1.0,0.0729054,0.465104,0.234116
pblkMatchup,0.0146837,0.295714,0.0445729,0.22817,-0.014675,0.154357,0.0729054,1.0,0.160051,0.102824
totalDvoaMatchup,0.185425,0.473857,0.16848,0.695742,0.187224,0.587692,0.465104,0.160051,1.0,0.269303
pf,0.0854604,0.235993,0.172356,0.233422,0.0796019,0.285279,0.234116,0.102824,0.269303,1.0


In [6]:
corr = awayMatchup[features].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,stDvoa,offMatchup,offDvoaMatchup,ovrMatchup,runMatchup,ptsMatchup,passMatchup,pblkMatchup,totalDvoaMatchup,pf
stDvoa,1.0,-0.0141906,0.0438029,0.029566,-0.0136609,0.0839382,0.00592466,0.00718014,0.15686,0.0284542
offMatchup,-0.0141906,1.0,0.145469,0.701385,0.542278,0.660737,0.771839,0.287949,0.491936,0.249005
offDvoaMatchup,0.0438029,0.145469,1.0,0.186499,-0.0572846,0.185901,0.224991,0.0386995,0.178425,0.121729
ovrMatchup,0.029566,0.701385,0.186499,1.0,0.308102,0.614794,0.616786,0.235913,0.695742,0.238515
runMatchup,-0.0136609,0.542278,-0.0572846,0.308102,1.0,0.280744,0.1548,-0.0211786,0.207647,0.12586
ptsMatchup,0.0839382,0.660737,0.185901,0.614794,0.280744,1.0,0.645901,0.155539,0.581535,0.292522
passMatchup,0.00592466,0.771839,0.224991,0.616786,0.1548,0.645901,1.0,0.0599073,0.475858,0.256675
pblkMatchup,0.00718014,0.287949,0.0386995,0.235913,-0.0211786,0.155539,0.0599073,1.0,0.153557,0.0254453
totalDvoaMatchup,0.15686,0.491936,0.178425,0.695742,0.207647,0.581535,0.475858,0.153557,1.0,0.280353
pf,0.0284542,0.249005,0.121729,0.238515,0.12586,0.292522,0.256675,0.0254453,0.280353,1.0


Now that we have a sense for how each variable relates to the other, let's do some hypothesis testing. Our first test will be to see if home teams who have an above-average points matchup (avg points scored + opponent average points allowed) leads to scoring more points (pf). Let H0: no difference in average points scored when a home team has an above-average points matchup. Then HA: There is a difference in the two means; points matchup has a significant impact on points scored for home teams. First, let's look at summary statistics for ptsMatchup for home teams.

In [7]:
print(homeMatchup['ptsMatchup'].describe())

count    2736.000000
mean       44.873016
std         6.742792
min        21.428571
25%        40.285714
50%        44.785714
75%        49.285714
max        75.285714
Name: ptsMatchup, dtype: float64


We see that the average is around 44.87, so this will be the point that separates below-average (ba) matchups from above-average (aa) ones. Now let's find our mean points scored, standard deviations, and sample sizes for home teams with both ba and aa matchups.

In [8]:
print(np.mean(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf']))
print(np.std(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf']))
print(len(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf']))
print(np.mean(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf']))
print(np.std(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf']))
print(len(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf']))

26.3974645787
10.4644261335
1341
21.1913978495
10.0019315975
1395


Now that we have found all of this information, we can compute a z-score. Using alpha = 0.05, we have a critical value of 1.65 since we are doing a one-tailed test (testing if aa matchups lead to more points). If our observed statistic has a value > 1.65, we should reject our null hypothesis. 

In [9]:
mean1 = np.mean(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf'])
std1 = np.std(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf'])
n1 = len(homeMatchup[homeMatchup['ptsMatchup'] > 44.87]['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf'])
std2 = np.std(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf'])
n2 = len(homeMatchup[homeMatchup['ptsMatchup'] <= 44.87]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

13.170058259473853

Since our z-score > 1.65, we reject H0. We have strong evidence to suggest that having an above-average points matchup leads to home teams scoring more points.

Now let's follow the same approach for some more variables, and make sure to check the variable for away teams as well.

In [10]:
# ptsMatchup for away teams

print(awayMatchup['ptsMatchup'].describe())
print(np.mean(awayMatchup[awayMatchup['ptsMatchup'] > 45.11]['pf']))
print(len(awayMatchup[awayMatchup['ptsMatchup'] > 45.11]['pf']))
print(np.mean(awayMatchup[awayMatchup['ptsMatchup'] <= 45.11]['pf']))
print(len(awayMatchup[awayMatchup['ptsMatchup'] <= 45.11]['pf']))

count    2736.000000
mean       45.116541
std         6.753264
min        24.857143
25%        40.571429
50%        45.000000
75%        49.285714
max        69.142857
Name: ptsMatchup, dtype: float64
23.4859467456
1352
19.0202312139
1384


In [11]:
mean1 = np.mean(awayMatchup[awayMatchup['ptsMatchup'] > 45.11]['pf'])
std1 = np.std(awayMatchup[awayMatchup['ptsMatchup'] > 45.11]['pf'])
n1 = len(awayMatchup[awayMatchup['ptsMatchup'] > 45.11]['pf'])
mean2 = np.mean(awayMatchup[awayMatchup['ptsMatchup'] <= 45.11]['pf'])
std2 = np.std(awayMatchup[awayMatchup['ptsMatchup'] <= 45.11]['pf'])
n2 = len(awayMatchup[awayMatchup['ptsMatchup'] <= 45.11]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

11.933960565546466

In [12]:
# totalDvoaMathup for home teams

print(homeMatchup['totalDvoaMatchup'].describe())
print(np.mean(homeMatchup[homeMatchup['totalDvoaMatchup'] > -0.008379]['pf']))
print(len(homeMatchup[homeMatchup['totalDvoaMatchup'] > -0.008379]['pf']))
print(np.mean(homeMatchup[homeMatchup['totalDvoaMatchup'] <= -0.008379]['pf']))
print(len(homeMatchup[homeMatchup['totalDvoaMatchup'] <= -0.008379]['pf']))

count    2736.000000
mean       -0.008379
std         0.312510
min        -1.240000
25%        -0.218000
50%        -0.004000
75%         0.200250
max         1.408000
Name: totalDvoaMatchup, dtype: float64
26.1452312139
1384
21.2840236686
1352


In [13]:
mean1 = np.mean(homeMatchup[homeMatchup['totalDvoaMatchup'] > -0.008379]['pf'])
std1 = np.std(homeMatchup[homeMatchup['totalDvoaMatchup'] > -0.008379]['pf'])
n1 = len(homeMatchup[homeMatchup['totalDvoaMatchup'] > -0.008379]['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['totalDvoaMatchup'] <= -0.008379]['pf'])
std2 = np.std(homeMatchup[homeMatchup['totalDvoaMatchup'] <= -0.008379]['pf'])
n2 = len(homeMatchup[homeMatchup['totalDvoaMatchup'] <= -0.008379]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

12.449576328460539

In [14]:
# totalDvoaMathup for away teams

print(awayMatchup['totalDvoaMatchup'].describe())
print(np.mean(awayMatchup[awayMatchup['totalDvoaMatchup'] > 0.008379]['pf']))
print(len(awayMatchup[awayMatchup['totalDvoaMatchup'] > 0.008379]['pf']))
print(np.mean(awayMatchup[awayMatchup['totalDvoaMatchup'] <= 0.008379]['pf']))
print(len(awayMatchup[awayMatchup['totalDvoaMatchup'] <= 0.008379]['pf']))

count    2736.000000
mean        0.008379
std         0.312510
min        -1.408000
25%        -0.200250
50%         0.004000
75%         0.218000
max         1.240000
Name: totalDvoaMatchup, dtype: float64
23.5539940828
1352
18.9537572254
1384


In [15]:
mean1 = np.mean(awayMatchup[awayMatchup['totalDvoaMatchup'] > 0.008379]['pf'])
std1 = np.std(awayMatchup[awayMatchup['totalDvoaMatchup'] > 0.008379]['pf'])
n1 = len(awayMatchup[awayMatchup['totalDvoaMatchup'] > 0.008379]['pf'])
mean2 = np.mean(awayMatchup[awayMatchup['totalDvoaMatchup'] <= 0.008379]['pf'])
std2 = np.std(awayMatchup[awayMatchup['totalDvoaMatchup'] <= 0.008379]['pf'])
n2 = len(awayMatchup[awayMatchup['totalDvoaMatchup'] <= 0.008379]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

12.313237425370925

In [16]:
# offDvoaMatchup for home teams

print(homeMatchup['offDvoaMatchup'].describe())
print(np.mean(homeMatchup[homeMatchup['offDvoaMatchup'] > 0.008090]['pf']))
print(len(homeMatchup[homeMatchup['offDvoaMatchup'] > 0.008090]['pf']))
print(np.mean(homeMatchup[homeMatchup['offDvoaMatchup'] <= 0.008090]['pf']))
print(len(homeMatchup[homeMatchup['offDvoaMatchup'] <= 0.008090]['pf']))

count    2736.000000
mean        0.008090
std         0.216535
min        -0.824000
25%        -0.129000
50%         0.008500
75%         0.146000
max         0.753000
Name: offDvoaMatchup, dtype: float64
25.298245614
1368
22.1878654971
1368


In [17]:
mean1 = np.mean(homeMatchup[homeMatchup['offDvoaMatchup'] > 0.008090]['pf'])
std1 = np.std(homeMatchup[homeMatchup['offDvoaMatchup'] > 0.008090]['pf'])
n1 = len(homeMatchup[homeMatchup['offDvoaMatchup'] > 0.008090]['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['offDvoaMatchup'] <= 0.008090]['pf'])
std2 = np.std(homeMatchup[homeMatchup['offDvoaMatchup'] <= 0.008090]['pf'])
n2 = len(homeMatchup[homeMatchup['offDvoaMatchup'] <= 0.008090]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

7.7904662346937776

In [18]:
# offDvoaMathup for away teams

print(awayMatchup['offDvoaMatchup'].describe())
print(np.mean(awayMatchup[awayMatchup['offDvoaMatchup'] > 0.005385]['pf']))
print(len(awayMatchup[awayMatchup['offDvoaMatchup'] > 0.005385]['pf']))
print(np.mean(awayMatchup[awayMatchup['offDvoaMatchup'] <= 0.005385]['pf']))
print(len(awayMatchup[awayMatchup['offDvoaMatchup'] <= 0.005385]['pf']))

count    2736.000000
mean        0.005385
std         0.211278
min        -0.868000
25%        -0.130000
50%         0.004000
75%         0.143000
max         0.744000
Name: offDvoaMatchup, dtype: float64
22.124082232
1362
20.3377001456
1374


In [19]:
mean1 = np.mean(awayMatchup[awayMatchup['offDvoaMatchup'] > 0.005385]['pf'])
std1 = np.std(awayMatchup[awayMatchup['offDvoaMatchup'] > 0.005385]['pf'])
n1 = len(awayMatchup[awayMatchup['offDvoaMatchup'] > 0.005385]['pf'])
mean2 = np.mean(awayMatchup[awayMatchup['offDvoaMatchup'] <= 0.005385]['pf'])
std2 = np.std(awayMatchup[awayMatchup['offDvoaMatchup'] <= 0.005385]['pf'])
n2 = len(awayMatchup[awayMatchup['offDvoaMatchup'] <= 0.005385]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

4.6900736302370438

In [20]:
# Playing surface for home teams

print(np.mean(homeMatchup[homeMatchup['surface'] == 'grass']['pf']))
print(np.mean(homeMatchup[homeMatchup['surface'] == 'turf']['pf']))

22.8563068921
24.8814691152


In [21]:
mean1 = np.mean(homeMatchup[homeMatchup['surface'] == 'turf']['pf'])
std1 = np.std(homeMatchup[homeMatchup['surface'] == 'turf']['pf'])
n1 = len(homeMatchup[homeMatchup['surface'] == 'turf']['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['surface'] == 'grass']['pf'])
std2 = np.std(homeMatchup[homeMatchup['surface'] == 'grass']['pf'])
n2 = len(homeMatchup[homeMatchup['surface'] == 'grass']['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

4.7121592021500192

In [22]:
# Playing surface for away teams

print(np.mean(homeMatchup[homeMatchup['surface'] == 'grass']['pf']))
print(np.mean(homeMatchup[homeMatchup['surface'] == 'turf']['pf']))

22.8563068921
24.8814691152


In [23]:
mean1 = np.mean(awayMatchup[awayMatchup['surface'] == 'turf']['pf'])
std1 = np.std(awayMatchup[awayMatchup['surface'] == 'turf']['pf'])
n1 = len(awayMatchup[awayMatchup['surface'] == 'turf']['pf'])
mean2 = np.mean(awayMatchup[awayMatchup['surface'] == 'grass']['pf'])
std2 = np.std(awayMatchup[awayMatchup['surface'] == 'grass']['pf'])
n2 = len(awayMatchup[awayMatchup['surface'] == 'grass']['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

-0.77104895229523951

In [24]:
# Pass blocking matchup for home teams

print(homeMatchup['pblkMatchup'].describe())
print(np.mean(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf']))
print(np.std(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf']))
print(len(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf']))
print(np.mean(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf']))
print(np.std(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf']))
print(len(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf']))

count    2736.000000
mean        6.990836
std         7.742817
min       -19.557143
25%         1.771429
50%         7.014286
75%        12.332143
max        30.757143
Name: pblkMatchup, dtype: float64
24.6851311953
10.6043329844
1372
22.7954545455
10.4237185763
1364


In [25]:
mean1 = np.mean(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf'])
std1 = np.std(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf'])
n1 = len(homeMatchup[homeMatchup['pblkMatchup'] > 7]['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf'])
std2 = np.std(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf'])
n2 = len(homeMatchup[homeMatchup['pblkMatchup'] <= 7]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

4.707220014261388

In [26]:
# Pass blocking matchup for away teams

print(awayMatchup['pblkMatchup'].describe())
print(np.mean(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf']))
print(np.std(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf']))
print(len(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf']))
print(np.mean(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf']))
print(np.std(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf']))
print(len(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf']))

count    2736.000000
mean        7.204141
std         7.791965
min       -22.071429
25%         2.242857
50%         7.285714
75%        12.589286
max        33.471429
Name: pblkMatchup, dtype: float64
21.3543478261
10.0191745123
1380
21.0973451327
9.93508244345
1356


In [27]:
mean1 = np.mean(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf'])
std1 = np.std(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf'])
n1 = len(awayMatchup[awayMatchup['pblkMatchup'] > 7.2]['pf'])
mean2 = np.mean(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf'])
std2 = np.std(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf'])
n2 = len(awayMatchup[awayMatchup['pblkMatchup'] <= 7.2]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

0.67663241406536812

In [29]:
print(homeMatchup['runMatchup'].describe())
print(np.mean(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf']))
print(np.std(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf']))
print(len(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf']))
print(np.mean(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf']))
print(np.std(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf']))
print(len(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf']))

count    2736.000000
mean       -4.240638
std         7.821092
min       -32.800000
25%        -9.832143
50%        -4.542857
75%         1.200000
max        21.800000
Name: runMatchup, dtype: float64
24.7302431611
10.8526759241
1316
22.8281690141
10.1904426793
1420


In [30]:
mean1 = np.mean(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf'])
std1 = np.std(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf'])
n1 = len(homeMatchup[homeMatchup['runMatchup'] > -4.24]['pf'])
mean2 = np.mean(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf'])
std2 = np.std(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf'])
n2 = len(homeMatchup[homeMatchup['runMatchup'] <= -4.24]['pf'])
observed = mean1 - mean2
stdDiff = np.sqrt((((std1)**2)/n1) + (((std2)**2)/n1))
zScore = (observed - 0)/stdDiff
zScore

4.6349529548612365

Going through the hypothesis testing for the above variables, it was found that for home teams, ptsMatchup, totalDvoaMatchup, offDvoaMatchup, playing surface, and pass-blocking matchup were significant. For away teams, ptsMatchup, totalDvoaMatchup, and offDvoaMatchup, were significant. 