In [None]:
!pip install numpy==1.20.0
!pip install scipy==1.6.3
!pip install researchpy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

___
# Stats 1
1. Introduction
2. Descriptive statistics


In [None]:
# Load the player table, then replace literal 'NaN' strings with 0 and
# any remaining missing values with 0.0.
data = (
    pd.read_csv("/kaggle/input/premier-league-player-statistics-updated-daily/dataset - 2020-09-24.csv")
    .replace('NaN', 0)
    .fillna(value=0.0)
)
data.head()

In [None]:
# Graph 1) HISTOGRAM of the Age column, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 3))
data['Age'].plot.hist(ax=ax)
ax.set_xlabel("Age")
ax.set_title("Age histogram");

In [None]:
# Graph 2) BOXPLOT
# Convert the column to float only while it is still non-numeric. After the first
# conversion the column is float and `.str` raises AttributeError, so without this
# guard the cell cannot be re-run.
if not pd.api.types.is_numeric_dtype(data['Shooting accuracy %']):
    # Strip the last character of each text entry (presumably the '%' sign — confirm
    # against the CSV). Entries that are not strings come out of `.str` as NaN.
    data['Shooting accuracy %'] = data['Shooting accuracy %'].str[:-1].astype('float64')
sns.boxplot(data=data, y='Shooting accuracy %');
# data['Shooting accuracy %'].plot(kind='box') will do the same

3. Inferential statistics
**Hypothesis testing**<br>
- $H_0$ : no difference from before
- $H_1$ : difference from before

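*p-value* = the probability, assuming $H_0$ is true, of observing data at least as extreme as what we actually observed. We reject $H_0$ when the p-value is less than a chosen threshold $\alpha$ (the significance level, i.e. the acceptable rate of mistakenly rejecting a true $H_0$).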

- Test for **Mean**:
    - Is $\mu = \mu_{0}$ given this data -> *one sample t-test* (`scipy.stats.ttest_1samp`)
    - Is $\mu_{a} = \mu_{b}$ -> *two sample t-test* (`scipy.stats.ttest_ind`) \**$a$ and $b$ are independent
        - If a, b, c -> *ANOVA*
    - Is $\mu_{\text{before}} = \mu_{\text{after}}$ -> *paired t-test* (`scipy.stats.ttest_rel`)

In [None]:
from scipy.stats import ttest_1samp

# Seeded generator: the draw, and therefore the printed p-value, is identical on every run.
rng = np.random.default_rng(0)
sample = rng.normal(loc=3.0, scale=1.0, size=1000)

# Null: pop mean equal to 0
# The sample is drawn around a mean of 3, so a very small p-value is expected.
t, p = ttest_1samp(sample, 0)
print(f'p_val = {p}')   # Reject Null

In [None]:
from scipy.stats import ttest_ind

# Seeded generator: both groups are drawn reproducibly, so the p-value is stable across runs.
rng = np.random.default_rng(1)
a = rng.normal(loc=3.0, scale=1.0, size=1000)
b = rng.normal(loc=10.0, scale=1.0, size=900)

# Null: a and b have the same pop mean
# equal_var=True matches how the groups are generated (both use scale=1.0).
t, p = ttest_ind(a, b, equal_var=True)
print(f'p_val = {p}')   # Reject Null

In [None]:
from scipy.stats import ttest_rel

# Seeded generator so the result is reproducible.
# NOTE: `before` and `after` are drawn independently here purely for demonstration;
# a real paired test needs two measurements of the same subjects.
rng = np.random.default_rng(2)
before = rng.normal(loc=3.0, scale=1.0, size=1000)
after = rng.normal(loc=10.0, scale=1.0, size=1000)

# Null: the pop mean of `before` is NOT less than that of `after`.
# Alternative (alternative='less'): the pop mean of `before` is less than that of `after`,
# i.e. `after` has the greater pop mean.
t, p = ttest_rel(before, after, alternative='less')
print(f'p_val = {p}')   # Reject Null

![](https://i.pinimg.com/originals/43/8e/75/438e752694f5738971efb00f595c344c.png)

___
# Stats2
1. ANOVA : test if 3+ pop means are all equal. <br>

If #indep. vars == 1 : *One-way ANOVA* (`scipy.stats.f_oneway` , `statsmodels.api.stats.anova_lm`)<br>
else : *Two-way ANOVA* (`statsmodels.api.stats.anova_lm`)

In [None]:
# Keep only the grouping column (Nationality) and the measured column (Appearances).
nat_appear = data.loc[:, ['Nationality', 'Appearances']]
nat_appear.head()

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-way ANOVA of appearances grouped by nationality.
# Null: all nationalities have the same pop mean of appearances.
anova_formula = 'Appearances ~ Nationality'
model = ols(formula=anova_formula, data=nat_appear).fit()
table = sm.stats.anova_lm(model)

# Author's recorded result: P_val = 0.99, so the Null cannot be rejected.
print(table)

In [None]:
# Appearances per nationality; the x tick labels are rotated 90 degrees.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x='Nationality', y='Appearances', data=nat_appear, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90);

2. Correlation : continuous ~ continuous vars are related

$H_0$ : `x` and `y` are not correlated (r=0)

In [None]:
# Pearson correlation matrix of the Age and Wins columns
data.loc[:, ['Age', 'Wins']].corr(method='pearson')

In [None]:
from scipy.stats import pearsonr

# Null: Age and Wins have no correlation to each other (r=0)
age, wins = data['Age'], data['Wins']
r, p = pearsonr(age, wins)
# Author's recorded conclusion: reject the null, i.e. r is confidently not 0.
print(f'p_val = {p}\ncorr = {r}')

3. Chi-Square : category ~ category vars are related

$H_0$ : `x` and `y` are not associated (they are independent)

Two variables are *associated* if the distribution of one changes when the value of the other changes. If there is no association, the distribution of the first variable stays the same regardless of the value of the other.

In [None]:
import researchpy as rp

# Null: Nationality and Club are not related
df = data.loc[:, ['Nationality', 'Club']]
nationality, club = df['Nationality'], df['Club']
# Crosstab with column proportions (prop='col') plus the chi-square test results.
table, result = rp.crosstab(nationality, club, prop='col', test='chi-square')

result

4. Regression tests : test cause-and-effect relationships

Linear regression, Logistic regression

In [None]:
from sklearn.linear_model import LinearRegression

# Predictors: Age, Goals and Offsides. Target: Wins (kept as a one-column frame).
x = data[['Age', 'Goals', 'Offsides']]
y = data[['Wins']]

lr = LinearRegression().fit(x, y)

$R^{2} = 1 - \frac{SSE}{SST} = 1 - \frac{\text{SS of predicted error}}{\text{SS of baseline model error}}$

If $SSE < SST$, our model is better than the baseline model that always predicts the mean of $y$. A model with a **higher** $R^2$ is a better model.

In [None]:
# score(X, y) returns R2; here it is evaluated on the same x, y the model was fitted on.
r_squared = lr.score(x, y)
r_squared

Adjusted $R^2$ = $1 - \frac{(N-1)(1-R^2)}{N-p-1}$

Where `p` = number of predictors, `N` = sample size

In [None]:
# Adjusted R^2 = 1 - (N - 1)(1 - R^2) / (N - p - 1)
# The bare `nan` that was here raises NameError on a fresh run, so the cell now
# computes the quantity for the fitted regression `lr` and its inputs `x`, `y`.
n_samples, n_predictors = x.shape   # N = rows of x, p = columns of x
r2 = lr.score(x, y)
adjusted_r2 = 1 - (n_samples - 1) * (1 - r2) / (n_samples - n_predictors - 1)
adjusted_r2