In [1]:
import os, random
import pathlib

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

SEED = 41
random.seed(SEED)
np.random.seed(SEED)

%matplotlib inline

In [2]:
# my custom color palette
COLORS = ["#64E6FF", "#007DC5", "#4D2F9E", "#BE0046", "#EB5000", "#FFE600"]
sns.set_palette(sns.color_palette(COLORS))

In [3]:
# calculating mean of a dataset
data = np.random.randint(10, 50, 100)  # generate 100 random numbers between 10 & 50
print(f"Data sample (first 20): {data[:20]}")
# calculate the mean
mean = np.mean(data)
print(f"Mean: {mean:.3f}")

Data sample (first 20): [10 45 22 44 26 11 35 33 31 36 13 45 16 30 32 38 34 21 38 27]
Mean: 29.380


In [4]:
# calculate trimmed mean (using scipy.stats)
trimmed_mean = stats.trim_mean(data, 0.10)  # drop top & bottom 10% (0.1) of data
print(f"Trimmed mean: {trimmed_mean:.3f}")

Trimmed mean: 29.438


In [5]:
# calculating median of a dataset
data = np.random.randint(10, 50, 100)  # generate 100 random numbers between 10 & 50
median = np.median(data)
print(f"Median: {median:.3f}")

Median: 27.000


In [6]:
# calculate the mode
data = np.random.randint(10, 50, 100)  # generate 100 random numbers between 10 & 50
print(f"Data sample (first 20): {data[:20]}")

mode = stats.mode(data, keepdims=True)
print(f"Mode: {mode[0]} (occurs {mode[1]} times)")

Data sample (first 20): [43 35 44 18 46 38 44 11 22 14 25 41 39 12 29 49 36 29 37 41]
Mode: [17] (occurs [6] times)


In [8]:
# calculate Kurtosis
data = np.random.randint(10, 50, 100)  # generate 100 random numbers between 10 & 50
print(f"Data sample (first 20): {data[:20]}")

# kurtosis using Fisher definition (default)
kf = stats.kurtosis(data, fisher=True)
# kurtosis using Pearson's definition
kp = stats.kurtosis(data, fisher=False)
print(f"Kurtosos (Fisher): {kf:.3f} - Kurtosos (Pearson): {kp:.3f}")

Data sample (first 20): [24 27 24 13 45 36 14 33 13 41 28 27 28 33 28 48 33 30 21 29]
Kurtosos (Fisher): -1.018 - Kurtosos (Pearson): 1.982


In [10]:
# calculate z-scores
data = np.random.randint(10, 50, 100)  # generate 100 random numbers between 10 & 50
print(f"Data sample (first 20): {data[:20]}")

# calculate
x_bar, std = np.mean(data), np.std(data)
print(f"Mean: {x_bar:.3f} - stdev: {std:.3f}")
# calculate the z-scores
z_scores = (data - x_bar) / std
print(f"z_scores (first 20): {z_scores[:20]}")

Data sample (first 20): [27 35 41 23 47 27 39 29 46 45 30 15 37 42 36 32 32 41 19 14]
Mean: 31.210 - stdev: 11.518
z_scores (first 20): [-0.36551268  0.32904823  0.84996891 -0.71279313  1.37088958 -0.36551268
  0.67632868 -0.19187245  1.28406947  1.19724936 -0.10505234 -1.40735403
  0.50268845  0.93678902  0.41586834  0.06858789  0.06858789  0.84996891
 -1.06007358 -1.49417414]


In [15]:
def five_num_summary(data: np.ndarray):
    mini = np.min(data)
    q1 = np.quantile(data, 0.25)
    median = np.quantile(data, 0.50)
    q3 = np.quantile(data, 0.75)
    maxi = np.max(data)
    # also return -1.5 IQR limits
    lower_limit = mini - 1.5 * (q3 - q1)
    upper_limit = maxi + 1.5 * (q3 - q1)

    return mini, q1, median, q3, maxi, lower_limit, upper_limit


data = np.random.randint(10, 50, 20)  # generate 100 random numbers between 10 & 50
print(f"Data (sorted) {sorted(data)}")
print(f"Five number summary: {five_num_summary(data)}")

Data (sorted) [11, 12, 15, 16, 18, 18, 19, 20, 22, 22, 23, 24, 27, 27, 31, 31, 41, 43, 47, 48]
Five number summary: (11, 18.0, 22.5, 31.0, 48, -8.5, 67.5)


In [18]:
# calculation weighted means
cost_per_pound = np.array([3.0, 3.4, 2.8, 2.9, 3.25])
num_pounds = np.array([1200, 500, 2750, 1000, 800])
weighted_mean = np.dot(cost_per_pound, num_pounds) / np.sum(num_pounds)
print(f"Weighted mean: {weighted_mean:.3f}")

# alternatively this can be calculated simply as
weighted_mean = np.average(cost_per_pound, weights=num_pounds)
print(f"Weighted mean: {weighted_mean:.3f}")

Weighted mean: 2.960
Weighted mean: 2.960
