In [1]:
import pandas as pd
import numpy as np
# package with hypothesis tests
import scipy.stats as st

## Data

You can download the data from [**here**](https://drive.google.com/file/d/19b9lHlkixZhs8yka8zV0QFieao66dUcY/view?usp=sharing). It contains the results of NBA games from the 2013/2014 to 2015/2016 seasons.

In [2]:
# Read the semicolon-separated regular-season game log and preview the first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
nba_csv_path = "C:/Users/Tim/Desktop/lighthouse/w2/d5/probability_and_statistics_exercise/nba.csv"
nba = pd.read_csv(nba_csv_path, sep=';')
df = pd.DataFrame(nba)
df.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,...,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,...,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,...,0.864,10,29,39,20,7,3,7,20,10.0
3,22015,1610612747,LAL,Los Angeles Lakers,21501228,2016-04-13,LAL vs. UTA,W,239,101,...,0.867,8,39,47,19,6,3,13,17,5.0
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,...,0.733,8,35,43,21,4,7,10,23,-2.0


In [3]:
# List every column name on its own line to see which statistics are available.
for column_name in list(df.columns):
    print(column_name)

SEASON_ID
TEAM_ID
TEAM_ABBREVIATION
TEAM_NAME
GAME_ID
GAME_DATE
MATCHUP
WL
MIN
PTS
FGM
FGA
FG_PCT
FG3M
FG3A
FG3_PCT
FTM
FTA
FT_PCT
OREB
DREB
REB
AST
STL
BLK
TOV
PF
PLUS_MINUS


### Split the data into 3 separate files according to the season!

In [4]:
# Number of GAME_ID entries recorded for each SEASON_ID.
games_per_season = df.groupby(['SEASON_ID'])['GAME_ID'].count()
games_per_season

SEASON_ID
22013    2460
22014    2460
22015    2460
Name: GAME_ID, dtype: int64

In [5]:
# Rows with SEASON_ID 22013, written to their own CSV without the index column.
is_season_2013 = df['SEASON_ID'] == 22013
df2013 = df.loc[is_season_2013]
df2013.to_csv('nba2013.csv', index=False)

In [6]:
# Rows with SEASON_ID 22014, written to their own CSV without the index column.
is_season_2014 = df['SEASON_ID'] == 22014
df2014 = df.loc[is_season_2014]
df2014.to_csv('nba2014.csv', index=False)

In [7]:
# Rows with SEASON_ID 22015, written to their own CSV without the index column.
is_season_2015 = df['SEASON_ID'] == 22015
df2015 = df.loc[is_season_2015]
df2015.to_csv('nba2015.csv', index=False)

### Test the hypothesis that the offensive production of the Cleveland Cavaliers and the Golden State Warriors (the teams from the finals) was equally distributed in 2015/2016

Do two separate tests for PTS (Points) and FG_PCT (Field Goal Percentage)

In [8]:
# Two-sample Kolmogorov-Smirnov test on points per game (PTS) in the 2015/2016 regular season.
# H0: the Cavaliers' and the Warriors' PTS were drawn from the same continuous distribution.
cavs_pts = df2015.loc[df2015['TEAM_NAME'] == "Cleveland Cavaliers", 'PTS']
warriors_pts = df2015.loc[df2015['TEAM_NAME'] == "Golden State Warriors", 'PTS']
st.ks_2samp(cavs_pts, warriors_pts)

# The p-value (~2.5e-06) is far below 0.05, so we REJECT the null hypothesis:
# the PTS of the two teams were not drawn from the same continuous distribution.

KstestResult(statistic=0.4024390243902439, pvalue=2.5400281722991783e-06)

In [9]:
# Two-sample Kolmogorov-Smirnov test on field-goal percentage (FG_PCT), Cavaliers vs. Warriors.
is_cavs = df2015['TEAM_NAME'] == "Cleveland Cavaliers"
is_warriors = df2015['TEAM_NAME'] == "Golden State Warriors"
st.ks_2samp(df2015.loc[is_cavs, 'FG_PCT'], df2015.loc[is_warriors, 'FG_PCT'])

# Reject the null hypothesis that FG_PCT for both teams was drawn from the same continuous distribution.

KstestResult(statistic=0.25609756097560976, pvalue=0.008991166830753808)

In [10]:
# Dates of Cleveland's games in df2015, to locate the coach change within the season.
cavs_rows_2015 = df2015['TEAM_NAME'] == "Cleveland Cavaliers"
df2015.loc[cavs_rows_2015, 'GAME_DATE']

4       2016-04-13
40      2016-04-11
78      2016-04-09
127     2016-04-06
151     2016-04-05
           ...    
2336    2015-11-04
2364    2015-11-02
2399    2015-10-30
2440    2015-10-28
2456    2015-10-27
Name: GAME_DATE, Length: 82, dtype: object

### Test whether the production (PTS) of Cleveland changed significantly before and after the coach change in 2015/2016 - Use only data from seasons 2014/2015 and 2015/2016 - Those are the seasons in which Cleveland was coached by Blatt
#### Coach Blatt was fired on 24th of Jan, 2016

We have two possible solutions here:
- take the same amount of games from before and after and try t-test.
- take all the games from before and after and look for the right test to compare two samples with different sizes

In [11]:
# Split Cleveland's games into "after" (dfq) and "before" (dfp) the coach change.
# Only seasons 2014/2015 and 2015/2016 (SEASON_ID 22014 and 22015) are kept, as the exercise
# asks; without this filter the "before" sample would also contain 2013/2014 games.
COACH_CHANGE_DATE = "2016-01-24"
cavs_recent = df[(df['TEAM_NAME'] == "Cleveland Cavaliers") & (df['SEASON_ID'].isin([22014, 22015]))]
# GAME_DATE holds "YYYY-MM-DD" strings, so string comparison follows chronological order.
dfq = cavs_recent[cavs_recent['GAME_DATE'] > COACH_CHANGE_DATE]
dfp = cavs_recent[cavs_recent['GAME_DATE'] < COACH_CHANGE_DATE]

In [12]:
# How many Cleveland games were played after the coach-change date?
after_change_mask = (df['TEAM_NAME'] == "Cleveland Cavaliers") & (df['GAME_DATE'] > "2016-01-24")
df.loc[after_change_mask, 'GAME_ID'].count()

40

In [13]:
# Balance the two samples: keep as many pre-change games as there are post-change games (len(dfq)),
# taking the ones played most recently before the change.
# Sorting by GAME_DATE makes the selection explicit instead of relying on the row order of the CSV;
# the stable sort keeps the existing order for rows that share a date.
dfp = dfp.sort_values('GAME_DATE', ascending=False, kind='stable').head(len(dfq))
dfp['GAME_ID'].count()

40

In [14]:
# Welch's t-test (equal_var=False) on Cleveland's PTS before vs. after the coach change.
pts_before, pts_after = dfp["PTS"], dfq["PTS"]
st.ttest_ind(pts_before, pts_after, equal_var=False)

# The change in PTS after the coach change is statistically significant.

Ttest_indResult(statistic=-2.911228614180665, pvalue=0.004774658460978929)

Download the same dataset for playoffs games in 2016 from [**here**](https://drive.google.com/file/d/1jY57bAOZp9y83b4W2PAoSH1uFARaxxls/view?usp=sharing)

In [15]:
# Read the semicolon-separated 2016 playoff game log and preview the first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
playoffs_csv_path = "C:/Users/Tim/Desktop/lighthouse/w2/d5/probability_and_statistics_exercise/nbapo16.csv"
nba2 = pd.read_csv(playoffs_csv_path, sep=';')
dfpo = pd.DataFrame(nba2)
dfpo.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42015,1610612739,CLE,Cleveland Cavaliers,41500407,2016-06-19,CLE @ GSW,W,241,93,...,0.84,9,39,48,17,7,6,11,15,4.0
1,42015,1610612744,GSW,Golden State Warriors,41500407,2016-06-19,GSW vs. CLE,L,239,89,...,0.769,7,32,39,22,7,5,10,23,-4.0
2,42015,1610612744,GSW,Golden State Warriors,41500406,2016-06-16,GSW @ CLE,L,238,101,...,0.69,9,26,35,19,5,3,14,25,-14.0
3,42015,1610612739,CLE,Cleveland Cavaliers,41500406,2016-06-16,CLE vs. GSW,W,240,115,...,0.781,8,37,45,24,12,7,10,25,14.0
4,42015,1610612739,CLE,Cleveland Cavaliers,41500405,2016-06-13,CLE @ GSW,W,241,112,...,0.609,8,33,41,15,11,9,16,22,15.0


### Select the Toronto Raptors and test the hypothesis that the number of blocks (BLK) in the playoffs is from the same distribution as in the regular season 2015/2016
We again have two samples of different sizes.

In [16]:
# Toronto's rows from the 2015/2016 regular season (dftr) and from the playoffs (dftrpo).
raptors = "Toronto Raptors"
dftr = df2015.loc[df2015['TEAM_NAME'] == raptors]
dftrpo = dfpo.loc[dfpo['TEAM_NAME'] == raptors]

In [17]:
# Welch's t-test (equal_var=False) comparing Toronto's regular-season and playoff BLK.
blk_regular, blk_playoffs = dftr["BLK"], dftrpo["BLK"]
st.ttest_ind(blk_regular, blk_playoffs, equal_var=False)

Ttest_indResult(statistic=3.500438136870473, pvalue=0.0011403514552816168)

In [None]:
# The t-test shows a significant difference in BLK between the regular season and the playoffs (p ≈ 0.001).

In [18]:
# Two-sample Kolmogorov-Smirnov test on Toronto's BLK: regular season vs. playoffs.
blk_samples = (dftr["BLK"], dftrpo["BLK"])
st.ks_2samp(*blk_samples)

KstestResult(statistic=0.33414634146341465, pvalue=0.04184880172296346)

In [None]:
# Reject the null hypothesis that regular-season and playoff BLK come from the same continuous distribution (p ≈ 0.04).

### Test the hypothesis that points per game (PTS) are equally distributed in all 3 seasons for Cleveland
we need a hypothesis test to compare more than 2 distributions

In [20]:
# Cleveland's rows from each of the three per-season frames.
cavs_name = "Cleveland Cavaliers"
dfc13, dfc14, dfc15 = (
    season_df[season_df['TEAM_NAME'] == cavs_name]
    for season_df in (df2013, df2014, df2015)
)

In [21]:
# One-way ANOVA on Cleveland's PTS across the three seasons.
pts_per_season = [season_df['PTS'] for season_df in (dfc13, dfc14, dfc15)]
st.f_oneway(*pts_per_season)

F_onewayResult(statistic=5.9200250318080885, pvalue=0.003087727119983984)

In [22]:
# Reject the null hypothesis that the three seasons have the same population mean (p ≈ 0.003).

In [23]:
# Bartlett's test: do the three seasons' PTS samples have different variances?
pts_samples = (dfc13['PTS'], dfc14['PTS'], dfc15['PTS'])
st.bartlett(*pts_samples)
# p ≈ 0.40, so equal variances are not rejected (same variance).

BartlettResult(statistic=1.8337888706159797, pvalue=0.3997585915033671)

#### Between which seasons can we see the significant difference?
+ Unfortunately, this is not part of the output of the ANOVA test, and further analysis needs to be applied in most cases
+ Note that Lebron James came back to Cleveland prior to season 2014/2015 (just for interpretation of the results)

In [24]:
# Welch's t-test on Cleveland's PTS: 2013/2014 vs. 2014/2015.
pts_13, pts_14 = dfc13["PTS"], dfc14["PTS"]
st.ttest_ind(pts_13, pts_14, equal_var=False)

Ttest_indResult(statistic=-2.508958204796911, pvalue=0.013092273648525245)

In [25]:
# One-way ANOVA restricted to the 2013/2014 and 2014/2015 seasons.
first_pair = [season_df['PTS'] for season_df in (dfc13, dfc14)]
st.f_oneway(*first_pair)

F_onewayResult(statistic=6.29487127341772, pvalue=0.013091680534336554)

In [26]:
# Welch's t-test on Cleveland's PTS: 2014/2015 vs. 2015/2016.
pts_14, pts_15 = dfc14["PTS"], dfc15["PTS"]
st.ttest_ind(pts_14, pts_15, equal_var=False)

Ttest_indResult(statistic=-0.6442093460555935, pvalue=0.5203691269861739)

In [27]:
# One-way ANOVA restricted to the 2014/2015 and 2015/2016 seasons.
second_pair = [season_df['PTS'] for season_df in (dfc14, dfc15)]
st.f_oneway(*second_pair)

F_onewayResult(statistic=0.41500568154538037, pvalue=0.520350761773442)

In [None]:
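# The significant difference is between seasons 2013/2014 and 2014/2015 (p ≈ 0.013); 2014/2015 vs. 2015/2016 is not significant (p ≈ 0.52).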