In [1]:
from scipy.stats import ttest_ind
# loading Python modules
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

# silence please; developers at work
import warnings
warnings.filterwarnings('ignore')

In [None]:
# append question mark ? to any method to get a jupyter popup of help
# sns.displot?

In [None]:
# display the means of the two samples side by side as a tuple
# NOTE(review): xNS and xS are not defined in the cells visible above — presumably
# the ELV samples of the NS and S groups; confirm they exist before this cell runs
xNS.mean(), xS.mean()

In [None]:
def cohend(d1, d2):
    """
    Effect size (Cohen's d) between two independent samples `d1` and `d2`.

    The difference of the sample means (d1 minus d2) is divided by the pooled
    standard deviation, built from the unbiased (ddof=1) sample variances.
    """
    size_a, size_b = len(d1), len(d2)
    # weight each unbiased variance by its degrees of freedom
    weighted_a = (size_a - 1) * np.var(d1, ddof=1)
    weighted_b = (size_b - 1) * np.var(d2, ddof=1)
    dof = size_a + size_b - 2
    pooled_sd = np.sqrt((weighted_a + weighted_b) / dof)
    mean_gap = np.mean(d1) - np.mean(d2)
    return mean_gap / pooled_sd

# effect size of xS relative to xNS (positive when the xS mean is the larger one)
cohend(xS.values, xNS.values)

In [None]:
# sns.histplot([xA,xB], colors=['r', 'b'])
# overlaid histograms of the ELV column, split by the `group` column (hue),
# using 10 bins and with a kernel density estimate drawn for each (kde=True)
sns.histplot(data=df, x="ELV", hue="group", bins=10, kde=True, alpha=0.3)
# sns.displot(data=df, x="ELV", hue="group", bins=10, kde=True)

In [None]:
# xA.hist(alpha=0.3)
# xB.hist(alpha=0.3)

In [None]:
# xA['ELV']

In [None]:
# The sample mean and sample standard deviation are estimates of the
# population mean and standard deviation.
# Keep the default ddof=1 of Series.std() (divide by n-1): that is the *sample*
# standard deviation announced in the message below and used for Group B.
# ddof=0 would instead give the 1/n formula, i.e. the MLE value.
meanA = df[df['group']=="A"]['ELV'].mean()
stdA = df[df['group']=="A"]['ELV'].std()
print("The sample mean and sample standard deviation for Group A are:")
meanA, stdA

In [None]:
# Group B: estimate the population mean and standard deviation from the sample
meanB = df.loc[df['group'] == "B", 'ELV'].mean()
stdB = df.loc[df['group'] == "B", 'ELV'].std()  # default ddof=1, divides by n-1
print("The sample mean and sample standard deviation for Group B are:")
meanB, stdB

In [None]:
# another way to get an estimate of the population parameters is to
# use the `fit` method on a probability distribution model:
from scipy.stats.distributions import norm

In [None]:
# maximum likelihood fit of a normal model to the ELV values of Group A
norm.fit(df.loc[df['group'] == "A", 'ELV'], method="MLE")

In [None]:
# maximum likelihood fit of a normal model to the ELV values of Group B
norm.fit(df.loc[df['group'] == "B", 'ELV'], method="MLE")

The estimates obtained from the `fit` method are called "maximum likelihood estimates" (MLE), meaning the computer went over all the possible parameters for the group mean $\mu$ and standard deviation $\sigma$ and picked the parameters under which the observed data are most likely, i.e. the ones most likely to have generated the data.

This is a thing now. Computers are fast enough to perform complicated search algorithms and solve optimization problems in just a few seconds to tell you the parameters $(\mu, \sigma)$ that best describe the distribution from which the sample comes.

In [None]:
# The method of moments (MM) is another general-purpose way to
# compute estimates for the model parameters:
#   - equate the formula for the first moment of the distribution to the sample mean
#   - equate the formula for the second moment around the mean to the value
#     computed from the sample variance
norm.fit(df.loc[df['group'] == "S", 'ELV'], method="MM")

In [None]:
# note there is a slight difference with the estimate obtained
# using the .std() estimator above (difference due to 1/(n-1) vs. 1/n in formulas)

In [None]:
# show the underlying array of values held by xS
# (presumably the ELV sample of the S group — confirm where xS is created)
xS.values

In [None]:
# Fit a normal model to the ELV values of group "S" (default fit method),
# freeze a normal distribution with the fitted location and scale,
# then plot a histogram of 300 random draws from it.
xbarML, sML = norm.fit(df.loc[df['group'] == "S", 'ELV'])
xAsim = norm(loc=xbarML, scale=sML)
sns.histplot(data=xAsim.rvs(size=300))


### Generate the LM data

$$
  \texttt{ELV} = 1000 + 2.5 \cdot \texttt{hours}
$$

In [None]:
import numpy as np
from scipy.stats import uniform, norm
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [None]:
# np.random.seed(seed=15)
np.random.seed(seed=154)  # fix the seed so the same dataset is generated on every run

# true parameters of the linear model  ELV = beta0 + beta1*hours + noise
beta0 = 1000  # intercept; named so the cell that draws the true line can use it
beta1 = 2.5   # slope

# predictor: 33 values drawn uniformly from [0, 100]
u100 = uniform(0, 100)
x = u100.rvs(33)

# noise: 33 draws from a normal with mean 0 and standard deviation 100
# (despite the name `n300`, the scale passed here is 100)
n300 = norm(0, 100)
eps = n300.rvs(33)

y = beta0 + beta1*x + eps
y

In [None]:
# # save generated data
# df = pd.DataFrame({'hours': x, 'ELV': y})
# with pd.ExcelWriter("data/ELV_vs_hours.ods") as writer:
#     df.to_excel(writer, sheet_name="Data", index=None)

In [None]:
# scatter plot of the generated data on a fresh figure
fig, ax = plt.subplots()
# pass x and y by keyword: seaborn >= 0.12 rejects positional data vectors
ax = sns.scatterplot(x=x, y=y, ax=ax)

In [None]:
# sm.graphics.plot_regress_exog(res, 1)

In [None]:
# sns.scatterplot(x, res.fittedvalues)

In [None]:
# draw the line beta0 + beta1*x in red on the existing axes, then re-display the figure
# (x and y passed by keyword: seaborn >= 0.12 rejects positional data vectors)
sns.lineplot(x=x, y=beta0+beta1*x, ax=ax, color="red")
fig

In [None]:
def dmeans(data, valuescol="values", groupcol="group", groups=["A", "B"]):
    """
    Difference between the mean of `valuescol` in the first group and in the
    second group. Rows are assigned to groups by the `groupcol` column, and
    `groups` is a pair of group labels: result = mean(first) - mean(second).
    """
    first, second = groups
    labels = data[groupcol]
    mean_first = data.loc[labels == first, valuescol].mean()
    mean_second = data.loc[labels == second, valuescol].mean()
    return mean_first - mean_second


In [9]:
from functools import partial

# Demo of functools.partial. The helper has its own name so that it does not
# overwrite the `dmeans(data, ...)` statistic defined in the earlier cell.
def show_args(x, col="var"):
    """Print the positional argument and the `col` keyword it was called with."""
    print(x, col)

# pre-fill only the keyword argument; `x` is still supplied at call time
d = partial(show_args, col="foo")
d("hh")

# pre-fill both arguments; the resulting callable needs no arguments
d2 = partial(show_args, "zz", col="bar")
d2()

hh foo
zz bar


In [None]:
import scipy as sp
from scipy.stats.distributions import norm

def plot_rv(rv, label=None, xlabel=None, ax=None, title=None):
    """
    Plot the probability density function of a frozen SciPy distribution.

    Parameters
    ----------
    rv : rv_frozen
        Frozen distribution, e.g. ``norm(loc=1000, scale=300)``; its ``ppf``
        and ``pdf`` methods are used.
    label : str, optional
        Legend label for the curve.
    xlabel : str, optional
        Label for the horizontal axis.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure and axes are created when omitted.
    title : str, optional
        Title for the axes.

    Returns
    -------
    The axes that were drawn on, so that more curves can be added to them.

    Raises
    ------
    AssertionError
        If `rv` is not a frozen SciPy distribution.
    """
    assert isinstance(rv, sp.stats._distn_infrastructure.rv_frozen), \
        'this function assumes plotting a rv_frozen random variable'
    # evaluate the density on a fine grid between the 0.1% and 99.9% quantiles
    size = 10000
    x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), size)
    y = rv.pdf(x)

    if ax is None:
        _, ax = plt.subplots()
    # pass x and y by keyword: seaborn >= 0.12 rejects positional data vectors
    sns.lineplot(x=x, y=y, ax=ax, label=label)

    ax.set_title(title)
    ax.set_xlim(x.min(), x.max())
    ax.set_xlabel(xlabel)
    ax.tick_params(axis='both', labelsize=9)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # return ax in case want to plot more
    return ax

# frozen normal distribution with mean 1000 and standard deviation 300
rvNS = norm(loc=1000, scale=300)

# The title is a raw string so the backslashes reach matplotlib's mathtext
# untouched, and it states the same parameters that rvNS was built with.
plot_rv(rvNS,
        xlabel="ELV",
        label="NS",
        title=r'Normal distribution $\mathcal{N}(\mu=1000, \sigma=300)$')


In [None]:
size = 10000

# two normal models with the same scale whose means differ by Delta
Delta = 200
rvNS = norm(loc=1000, scale=300)
rvS = norm(loc=1000+Delta, scale=300)

# common grid from the 0.1% quantile of NS to the 99.9% quantile of S
x = np.linspace(rvNS.ppf(0.001), rvS.ppf(0.999), size)
yNS = rvNS.pdf(x)
yS = rvS.pdf(x)

fig, ax = plt.subplots()
# pass x and y by keyword: seaborn >= 0.12 rejects positional data vectors
sns.lineplot(x=x, y=yNS, ax=ax, label="NS") #  color='black', alpha=0.5)
sns.lineplot(x=x, y=yS, ax=ax, label="S")
ax.set_title('Normal distributions...')
# ax.set_title(ax.get_title(), pad=25)
ax.set_xlim(x.min(), x.max())
ax.set_xlabel('ELV', fontsize=8, labelpad=10)
ax.tick_params(axis='both', labelsize=9)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
