In [16]:
import numpy as np
from tqdm import tqdm
from scipy import stats

In [28]:
data = [2,4,2,4,6,9,3,2]

# Variance, for-cycle, iteration over list

In [3]:
def calculate_var(data, is_sample=False):
    n = len(data)
    mean = sum(data)/n
    sum_square_dist = 0
    for value in data:
        square_dist = (value-mean)**2
        sum_square_dist += square_dist
    denominator = n+1 if is_sample else n
    var = sum_square_dist/denominator
    return var
        

In [4]:
calculate_var(data)

5.25

In [5]:
np.array(data).var()

np.float64(5.25)

# Variance using generator expression

In [6]:
def calculate_var(data, is_sample=False):
    n = len(data)
    mean = sum(data)/n
    #sum of square diff from the mean
    sum_square_diff = sum((x - mean)**2 for x in data)
    denominator = n - 1 if is_sample else n
    variance = sum_square_diff/denominator
    return variance

# Standard deviation(std)

In [7]:
def calculate_std(data):
    var = calculate_var(data)
    std = var**0.5
    return std

# Standard error of the mean(SEM)

In [8]:
def calculate_sem(data):
    std = calculate_std(data)
    n  = len(data)
    sem = std/n**0.5
    return sem
    


# Bootstrapping standard error of the mean(SEM)

In [9]:
def bootstrap_sem(data, n_bootstrap=10000):
    n = len(data)
    bootstrap_means = []
    for i in tqdm(range(n_bootstrap)):
        # Resample with replacement
        bootstrap_sample = np.random.choice(data, size=n, replace=True)
        bootstrap_means.append(np.mean(bootstrap_sample))
    sem = calculate_std(bootstrap_means)
    
    return sem

In [10]:
bootstrap_sem(data)

100%|██████████| 10000/10000 [00:00<00:00, 12960.31it/s]


np.float64(0.8084127345452933)

# Vectorized bootstraping SEM

In [11]:
def vectorized_bootstrap_sem(data, n_bootstraps = 10_000):
    n = len(data)
    boot_sample = np.random.choice(data, size=(n_bootstraps, n), replace=True)
    boots_means = np.mean(boot_sample, axis=1)
    boot_sem = calculate_sem(boots_means)
    return boot_sem

In [21]:
sem = vectorized_bootstrap_sem(data)

In [13]:
bootstrap_sem(data)

100%|██████████| 10000/10000 [00:00<00:00, 11286.36it/s]


np.float64(0.8049038416947679)

In [14]:
calculate_sem(data)

0.8100925873009824

# Calculate the 95% t confidence interval for mean

In [35]:
def calculate_t_confidence_interval(data, confidedence_level=0.95):
    data = np.array(data)
    
    n = len(data)
    x_bar = np.mean(data)
    standard_error = calculate_sem(data)
    
    #degrees of freedom
    dof = n-1
    
    #calculate critival t value
    alpha = 1 - confidedence_level
    t_critical = stats.t.ppf(1-alpha/2, dof)

    #margin of error
    margin_of_error = t_critical * standard_error

    # Confidence interval
    lower_bound = x_bar - margin_of_error
    upper_bound = x_bar + margin_of_error
    
    return x_bar, lower_bound, upper_bound, margin_of_error

In [34]:
calculate_t_confidence_interval(data)

(np.float64(4.0),
 np.float64(2.084435422032552),
 np.float64(5.915564577967448),
 np.float64(1.915564577967448))