In [1]:
from collections import Counter
from collections import OrderedDict
import math

from scipy import stats

In [2]:
def mittelwert(array):
    """Return the arithmetic mean of ``array``.

    Parameters
    ----------
    array : sequence of numbers
        Must support ``len()`` and ``sum()``.

    Returns
    -------
    float
        Sum of the elements divided by their count.

    Raises
    ------
    ZeroDivisionError
        If ``array`` is empty.
    """
    total = sum(array)
    return total / len(array)


# Example data: salaries ("Gehalt"); one value (20000) is far larger than the rest.
gehalt = [1000, 1500, 1500, 20000, 3000, 2000]
# Last expression of the cell, so the mean is displayed as the cell output.
mittelwert(gehalt)

4833.333333333333

In [3]:
def median(array):
    """Return the median of ``array``.

    The input is sorted into a new list (the caller's sequence is not
    modified). For an odd number of elements the middle element is returned
    unchanged; for an even number the two middle elements are averaged,
    which yields a float for numeric input.

    Raises
    ------
    IndexError
        If ``array`` is empty.
    """
    ordered = sorted(array)
    size = len(ordered)
    mid = size // 2
    if size % 2:
        # Odd length: a single middle element exists.
        return ordered[mid]
    # Even length: average the two elements around the centre.
    return 0.5 * (ordered[mid - 1] + ordered[mid])


# Same salary sample, redefined so this cell does not depend on an earlier one.
gehalt = [1000, 1500, 1500, 20000, 3000, 2000]
# Last expression of the cell, so the median is displayed as the cell output.
median(gehalt)

1750.0

In [4]:
def modus(array):
    """Return the mode (most frequent value) of ``array``.

    Parameters
    ----------
    array : iterable of hashable values

    Returns
    -------
    The value that occurs most often, or ``None`` when ``array`` is empty
    or when the two highest counts are tied (no unique mode).
    """
    # most_common(2) yields up to two (value, count) pairs, highest count first.
    top = Counter(array).most_common(2)
    if not top:
        return None
    # With only one distinct value there is no runner-up to compare against.
    if len(top) == 2 and top[0][1] == top[1][1]:
        return None
    # Return the value itself; the count must not be used as an index into
    # the input.
    return top[0][0]


# Same salary sample; 1500 is the only value that occurs twice.
gehalt = [1000, 1500, 1500, 20000, 3000, 2000]
# Last expression of the cell, so the mode is displayed as the cell output.
modus(gehalt)

1500

In [5]:
def varianz(array):
    """Return the sample variance of ``array`` (divisor ``n - 1``).

    Parameters
    ----------
    array : sequence of numbers
        Must support ``len()`` and be iterable more than once.

    Returns
    -------
    float
        Sum of squared deviations from the mean, scaled by ``1 / (n - 1)``.

    Raises
    ------
    ZeroDivisionError
        If ``array`` has fewer than two elements.
    """
    count = len(array)
    centre = sum(array) / count
    squared_deviations = sum((value - centre) ** 2 for value in array)
    return (1 / (count - 1)) * squared_deviations
    
    
# Same salary sample, redefined so this cell does not depend on an earlier one.
gehalt = [1000, 1500, 1500, 20000, 3000, 2000]
# Last expression of the cell, so the sample variance is displayed as the cell output.
varianz(gehalt)

55666666.66666668

In [6]:
def std_abweichung(array):
    """Return the sample standard deviation of ``array``.

    This is the square root of the sample variance computed with the
    ``n - 1`` divisor.

    Raises
    ------
    ZeroDivisionError
        If ``array`` has fewer than two elements.
    """
    count = len(array)
    centre = sum(array) / count
    squared_deviations = sum((value - centre) ** 2 for value in array)
    variance = (1 / (count - 1)) * squared_deviations
    return math.sqrt(variance)


# Same salary sample, redefined so this cell does not depend on an earlier one.
gehalt = [1000, 1500, 1500, 20000, 3000, 2000]
# Last expression of the cell, so the standard deviation is displayed as the cell output.
std_abweichung(gehalt)

7461.009761866465

In [7]:
def covariance(x, y):
    """Return the sample covariance of the paired observations ``x`` and ``y``.

    Parameters
    ----------
    x, y : sequences of numbers
        Paired samples; both must have the same length.

    Returns
    -------
    float
        Sum of the products of the deviations from each mean, scaled by
        ``1 / (n - 1)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length.
    ZeroDivisionError
        If fewer than two pairs are supplied.
    """
    # Fail loudly: with unequal lengths the pairing below would silently drop
    # the surplus values while the means and divisor still use the full length
    # of x, which yields a meaningless number.
    if len(x) != len(y):
        raise ValueError("X and Y must be of same size.")
    n = len(x)
    x_mn = sum(x) / n
    y_mn = sum(y) / n
    cross_products = ((xi - x_mn) * (yi - y_mn) for xi, yi in zip(x, y))
    return (1 / (n - 1)) * sum(cross_products)


# Example data: five paired observations of a size and the corresponding price.
size = [20, 30, 40, 50, 60]
price = [300, 400, 600, 700, 1000]
# Last expression of the cell, so the covariance is displayed as the cell output.
covariance(size, price)

4250.0

In [8]:
# PEARSON KORRELATIONSKOEFFIZIENT
def correlation(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    The coefficient is the sample covariance divided by the product of the
    two sample standard deviations (all computed with the ``n - 1`` divisor).
    The result is rounded to three decimal places.

    Parameters
    ----------
    x, y : sequences of numbers
        Paired samples; both must have the same length.

    Returns
    -------
    float
        Correlation coefficient rounded to three decimals.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length.
    ZeroDivisionError
        If fewer than two pairs are supplied, or if either sample has zero
        variance.
    """
    # Fail loudly: with unequal lengths the pairing below would silently drop
    # the surplus values and return a meaningless coefficient.
    if len(x) != len(y):
        raise ValueError("Error: X and Y must be of same size.")

    n = len(x)

    x_mn = sum(x) / n
    y_mn = sum(y) / n

    var_x = (1 / (n - 1)) * sum((xi - x_mn) ** 2 for xi in x)
    var_y = (1 / (n - 1)) * sum((yi - y_mn) ** 2 for yi in y)
    std_x, std_y = math.sqrt(var_x), math.sqrt(var_y)

    cross_products = ((xi - x_mn) * (yi - y_mn) for xi, yi in zip(x, y))
    cov = (1 / (n - 1)) * sum(cross_products)

    r = cov / (std_x * std_y)
    # Rounded for display purposes, as the notebook reports three decimals.
    return round(r, 3)


# Example data: five paired observations of a size and the corresponding price.
size = [20, 30, 40, 50, 60]
price = [300, 400, 600, 700, 1000]
# Last expression of the cell, so the correlation is displayed as the cell output.
correlation(size, price)

0.981

In [9]:
# SPEARMAN RANGKORRELATION
def ranking(array):
    """Return the ascending ranks of the values in ``array`` as floats.

    The smallest value gets rank 1. Values that occur more than once share
    the mean of the rank positions they occupy (e.g. two values tied for
    positions 6 and 7 both receive 6.5). The output list is in the same
    order as the input.

    Parameters
    ----------
    array : sequence of hashable, mutually comparable values

    Returns
    -------
    list of float
        One rank per input element.
    """
    occurrences = Counter(array)

    next_rank = 1
    rank_of = {}
    for value in sorted(occurrences):
        ties = occurrences[value]
        # A group of `ties` equal values occupies positions
        # next_rank .. next_rank + ties - 1; their mean is the shared rank.
        rank_of[value] = next_rank + (ties - 1) / 2
        next_rank += ties

    return [float(rank_of[value]) for value in array]


# Two paired samples of eight observations each
# (presumably English and German scores of the same people - TODO confirm).
eng = [12, 12, 3, 6, 10, 4, 15, 8]
eng_rank = ranking(eng)
print(eng_rank)

deu = [14, 14, 5, 4, 11, 8, 10, 3]
deu_rank = ranking(deu)
print(deu_rank)

# Spearman's rank correlation: the Pearson coefficient applied to the ranks.
correlation(eng_rank, deu_rank)

[6.5, 6.5, 1.0, 3.0, 5.0, 2.0, 8.0, 4.0]
[7.5, 7.5, 3.0, 2.0, 6.0, 4.0, 5.0, 1.0]


0.639

In [10]:
def var_koeff(array):
    """Return the coefficient of variation of ``array``.

    The coefficient is the sample standard deviation (``n - 1`` divisor)
    divided by the mean. When the data contain a zero, the coefficient is
    additionally divided by ``sqrt(n - 1)``.

    Parameters
    ----------
    array : sequence of numbers

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If ``array`` has fewer than two elements or its mean is zero.
    """
    size = len(array)
    mean = sum(array) / size
    squared_deviations = sum((x - mean) ** 2 for x in array)
    spread = math.sqrt((1 / (size - 1)) * squared_deviations)
    coefficient = spread / mean
    # NOTE(review): the extra rescaling is triggered only by a literal zero in
    # the data; confirm this condition is the intended one.
    if 0 in array:
        return coefficient / math.sqrt(size - 1)
    return coefficient


# Two samples of five values each
# (presumably pizza prices in Germany and the US - TODO confirm units/currency).
pizza_de = [4.99, 7.99, 5.99, 4.99, 6.99]
pizza_us = [5.74, 9.19, 6.89, 5.74, 8.04]

# Arithmetic means of both samples, rounded to two decimals for display.
mn_de, mn_us = mittelwert(pizza_de), mittelwert(pizza_us)
print(round(mn_de, 2), round(mn_us, 2))

# Sample standard deviations of both samples.
std_de, std_us = std_abweichung(pizza_de), std_abweichung(pizza_us)
print(round(std_de, 2), round(std_us, 2))

# Coefficients of variation: spread relative to the mean, so the two
# samples can be compared despite their different means.
cv_de, cv_us = var_koeff(pizza_de), var_koeff(pizza_us)
print(round(cv_de, 2), round(cv_us, 2))

6.19 7.12
1.3 1.5
0.21 0.21


In [11]:
def sample_size_pop(N, e=0.05, c=0.95, p=0.5, extra=None):
    """Sample size for a known (finite) population.

    Parameters
    ----------
        N: population size
        e: margin of error
        c: confidence level
        p: estimated proportion
        extra: surcharge for non-response, as a fraction in [0, 1]

    Returns
    -------
        n, sample size (rounded up) -> int
    """
    # Two-sided critical value of the standard normal distribution.
    z_score = stats.norm.ppf((1 + c) / 2)

    spread = z_score**2 * p*(1-p)
    unbounded = spread / e**2
    # Finite-population term: shrinks the requirement as N gets smaller.
    correction = 1 + (spread / (e**2 * N))
    required = unbounded / correction

    if extra:
        required += required * extra

    # Round up to whole respondents.
    return math.ceil(required)

# Population size passed as N; this module-level name is also used by the next cell.
n = 4000
# Last expression of the cell, so the required sample size is displayed as the cell output.
sample_size_pop(n, c=0.99, e=0.03, p=0.5, extra=0.05)

1325

In [12]:
# Relies on `n` (population size) defined in the previous cell.
sample_size_pop(n, c=0.95, e=0.05, p=0.8, extra=0.05)

244

In [13]:
def sample_size(e=0.05, c=0.95, p=0.5, extra=None):
    """
    Sample size for an unknown population.

    Parameters
    ----------
        e: margin of error
        c: confidence level
        p: estimated proportion
        extra: surcharge for non-response, as a fraction of the sample size

    Returns
    -------
        n, sample size (rounded up) -> int
    """
    # Two-sided critical value of the standard normal distribution.
    z_score = stats.norm.ppf((1 + c) / 2)
    required = (z_score**2 * p * (1-p)) / e**2

    # A falsy `extra` (None or 0) adds nothing.
    surcharge = required * extra if extra else 0
    return math.ceil(required + surcharge)

# Last expression of the cell, so the required sample size is displayed as the cell output.
sample_size(p=0.09, e=0.01, c=0.95)

3147