# Chapter 5

## 5.2.1

In [None]:
import numpy as np    #A
from scipy.stats import skewnorm, kruskal, mannwhitneyu

np.random.seed(99)    #B

# Two right-skewed samples of 99 draws each; the constant offset is added
# after sampling, and group_a is drawn before group_b.
group_a = 4.5 + skewnorm.rvs(a=9, scale=2.2, size=99)    #C
group_b = 4.6 + skewnorm.rvs(a=11, scale=1, size=99)

# Rank-based comparisons of the same two samples
H = kruskal(group_a, group_b)    #D
U = mannwhitneyu(group_a, group_b)

# Both result objects expose the test statistic and the p-value by name
print(f"Kruskal-Wallis Test, H={H.statistic}, p={H.pvalue}")   #E
print(f"Mann-Whitney U-test, U={U.statistic}, p={U.pvalue}")


#A Import libraries
#B Set random seed for reproducible results
#C Generate two skewed distributions
#D Run a Kruskal-Wallis and Mann-Whitney U-test
#E Print the results

In [None]:
# Install scikit-posthocs; the next cell imports it as `sp` and uses its Dunn post-hoc test
!pip3 install scikit_posthocs

In [None]:
import numpy as np  # A
from scipy import stats as st
import scikit_posthocs as sp

np.random.seed(99)    #B

# Three right-skewed samples of 99 draws each, all shifted by the same offset
group_a = st.skewnorm.rvs(a=9, scale=2.2, size=99) + 4.6    #C
group_b = st.skewnorm.rvs(a=11, scale=1.5, size=99) + 4.6
group_c = st.skewnorm.rvs(a=9.1, scale=2.0, size=99) + 4.6

data = [group_a, group_b, group_c]
# Run the omnibus test on every group in `data`, so it covers the same three
# groups that the Dunn post-hoc compares (group_c must not be left out).
H = st.kruskal(*data)    #D
# Pairwise Dunn comparisons across the groups in `data`, with Bonferroni-adjusted p-values
post_hoc = sp.posthoc_dunn(data, p_adjust="bonferroni")

# Report the omnibus result first, then the table of pairwise p-values
print(f"Kruskal-Wallis Test, H={H.statistic}, p={H.pvalue}")    #E
print(post_hoc)

#A Import libraries
#B Set random seed for reproducible results
#C Generate three non-normal series of data
#D Perform a Kruskal-Wallis test and a Dunn test as a post-hoc
#E Print the results

## 5.2.2

In [None]:
import pandas as pd
import scipy.stats as st    #A

# Load the crosstab, using the "recommender" column as row labels
assignments = pd.read_csv("assignments.csv", index_col="recommender")    #B

chi_sq = st.chi2_contingency(assignments)    #C

# Result positions used below: 0 = test statistic, 1 = p-value, 3 = expected frequencies
print(    #D
    f"Chi-square value: {chi_sq[0].round(3)}",
    f"p-value: {chi_sq[1].round(3)}",
    f"Expected Frequencies:\n {chi_sq[3].round(2)}",
    sep="\n",
)

#A Import the pandas and the stats library from scipy
#B Import the recommender group assignments crosstab 
#C Conduct the chi-square test
#D Print results

In [None]:
# One proportion applied uniformly to every cell of the assignments crosstab
# (presumably total clicks / total assignments — verify against the data).
OVERALL_PROPORTION = .2596
expected = assignments.mul(OVERALL_PROPORTION)    #A
print(expected.round(2))    #B

#A Calculate expected frequencies based on the total proportion
#B Show expected frequencies

In [None]:
clicked = pd.read_csv("clicked.csv", index_col="recommender")    #A

# chi2_contingency() derives the expected frequencies itself from the margins
# of the observed table and has no parameter for supplying them. Its second
# positional parameter is `correction` (a boolean flag), so a table of
# expected counts passed there is never used as expected frequencies and can
# raise when the flag is evaluated. Only the observed table is passed.
chi_sq = st.chi2_contingency(clicked)    #B

# Result positions used below: 0 = test statistic, 1 = p-value
print(f"Chi-square value: {chi_sq[0].round(3)}")    #C
print(f"p-value: {chi_sq[1].round(3)}")

#A Import the crosstab of clicks by country and recommender
#B Calculate a chi-square test 
#C Print resulting test statistic and p-value

In [None]:
import seaborn as sns    #A
from operator import sub

# Row-by-row observed minus expected counts, labelled like the assignments crosstab
diffs = pd.DataFrame(    #B
    list(map(sub, clicked.values, expected.values)),
    columns=assignments.columns,
    index=assignments.index,
)

# seaborn returns the Axes it drew on; label that Axes directly instead of
# going through `plt`, because matplotlib.pyplot has not been imported at this
# point in the notebook and `plt.xlabel` fails on a fresh kernel.
ax = sns.heatmap(  # C
    diffs.iloc[:-1, :-1],  # drop the final row and column (presumably totals — verify)
    cmap="vlag",
    annot=True,
    cbar=False,
)
ax.set_xlabel("")
ax.set_ylabel("Recommender Version")

#A Import Seaborn and the subtraction operator
#B Subtract corresponding values between each item in the observed and expected frequencies
#C Convert the differences to a heatmap for a more presentable display

In [None]:
# Display the click crosstab with rows and columns swapped
clicked.transpose()

In [None]:
from itertools import combinations  # A

# Every 2-combination of row labels, leaving out the final row of the crosstab
pairs = list(combinations(assignments.index[:-1], 2))  # B
chisq_values, p_values = [], []

for first, second in pairs:  # C
    # Keep only the two rows that belong to the current pair of groups
    in_pair = (clicked.index == first) | (clicked.index == second)
    pair_rows = clicked[in_pair]
    chi2, pv, _dof, _expected = st.chi2_contingency(pair_rows, correction=True)
    chisq_values.append(chi2)
    p_values.append(pv)
    print((first, second), ", Chi-square =", chi2.round(3), ", p =", pv.round(3))    #D

#A Import combinations function
#B Generate pairs of experiment groups and empty lists
#C Conduct a chi-square test with a correction for each pairwise comparison of experiment groups
#D Print each pair with its chi-square statistic and p-value

## 5.3

In [None]:
import numpy as np    #A
from scipy import stats as st
import matplotlib.pyplot as plt

np.random.seed(99)    # seed NumPy's global generator so X1 and X2 are reproducible

X1 = np.random.normal(loc=75.5, scale=6.2, size=500)    #B
X2 = np.random.normal(loc=76.2, scale=6.5, size=500)

def t_stat(X1, X2):    #C
    """Return only the t-statistic of an independent-samples t-test.

    Parameters
    ----------
    X1, X2 : array-like
        The two samples to compare.

    Returns
    -------
    float
        First element of the `st.ttest_ind(X1, X2)` result (the t-statistic).
    """
    return st.ttest_ind(X1, X2)[0]

t_values = st.bootstrap(    #D
    (X1, X2),
    t_stat,
    n_resamples=1000,
    batch=50,  # resamples evaluated per call to the statistic, not the size of a resample
    method="basic",
    vectorized=False,
    random_state=99,
)

# Every bootstrap resample has as many observations as the sample it is drawn
# from, so the pooled two-sample t-test has len(X1) + len(X2) - 2 degrees of
# freedom (998 here), not 49.
t_crit = -st.t.ppf(q=0.95, df=len(X1) + len(X2) - 2)    #E

# The bootstrap result is stored in `t_values`; no variable named `result` exists
plt.hist(t_values.bootstrap_distribution, bins=25)  # F
plt.axvline(t_crit, color="black", linestyle="dashed")

#A Import libraries
#B Generate two normal distributions for independent t-test comparisons
#C Create a function to return only the t-value from an independent samples t-test
#D Calculate 1000 bootstrapped t-values; each resample keeps the original sample size (batch=50 only sets how many resamples are processed per call)
#E Calculate the (negative) t-critical value 
#F Plot the distribution with the critical value as a vertical line