### 1. Creating a database

- create a database with objects containing name and age attributes

In [5]:
import random

class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

names = ["Mirta", "Bob", "Charlie", "David", "Leandro", "Frank", "Grace", "Helen", "William", "Jack", 
               "Karen", "Emily", "Bruno", "Marcos", "Bernardo", "Paul", "Quincy", "Irineu", "Wagner", "Walter"]

surnames = ["Smith", "Johnson", "Brown", "Taylor", "Lee", "Walker", "Harris", "Young", "King", "Wright",
            "Adams", "da Silva", "Nelson", "Hill", "Carter", "Mitchell", "Perez", "Roberts", "Evans", "Green"]

db = [Person(f"{random.choice(names)} {random.choice(surnames)}", random.randint(1, 100)) for _ in range(50)]

for person in db:
    print(f"Name: {person.name}, Age: {person.age}")

Name: Helen Carter, Age: 18
Name: Marcos Mitchell, Age: 12
Name: Leandro Smith, Age: 27
Name: Charlie Walker, Age: 62
Name: Bob da Silva, Age: 29
Name: Mirta Walker, Age: 19
Name: Quincy Hill, Age: 3
Name: Bernardo Taylor, Age: 60
Name: Irineu Nelson, Age: 25
Name: Mirta Adams, Age: 88
Name: Karen Carter, Age: 81
Name: David Young, Age: 88
Name: Charlie Nelson, Age: 43
Name: Mirta Mitchell, Age: 31
Name: Frank Wright, Age: 9
Name: Marcos Evans, Age: 43
Name: Grace Young, Age: 83
Name: Emily Young, Age: 29
Name: Quincy Adams, Age: 77
Name: Marcos Nelson, Age: 20
Name: William Lee, Age: 48
Name: William Walker, Age: 32
Name: William Taylor, Age: 35
Name: Leandro Brown, Age: 50
Name: Leandro da Silva, Age: 76
Name: Bernardo Roberts, Age: 28
Name: Bob Mitchell, Age: 94
Name: Grace King, Age: 31
Name: David Carter, Age: 92
Name: Bruno Lee, Age: 96
Name: Bernardo Nelson, Age: 62
Name: Paul Evans, Age: 51
Name: Wagner Lee, Age: 9
Name: Paul Perez, Age: 51
Name: Mirta Perez, Age: 26
Name: Davi

### 2. Frequency Tables

- **Absolute Frequency (Fa)**:  
  This is simply the count of each individual value in your dataset.
  
- **Relative Frequency (Fr)**:  
  **Formula**:  
  $$
  Fr = \frac{Fa}{n}
  $$
  **Where**:
  - Fa - is the absolute frequency of a value.
  - n - is the total number of data points.

In [6]:
data = [person.age for person in db]

freq = {}
for value in data:
    if value in freq:
        freq[value] += 1
    else:
        freq[value] = 1
        
n = len(data)

rel_freq = {key: val / n for key, val in freq.items()}

print("Absolute Frequency:", freq)
print("Relative Frequency:", rel_freq)

Absolute Frequency: {18: 1, 12: 2, 27: 1, 62: 2, 29: 2, 19: 1, 3: 1, 60: 1, 25: 2, 88: 2, 81: 1, 43: 2, 31: 2, 9: 2, 83: 1, 77: 1, 20: 1, 48: 1, 32: 1, 35: 1, 50: 1, 76: 1, 28: 2, 94: 1, 92: 1, 96: 1, 51: 2, 26: 2, 33: 1, 58: 1, 70: 1, 41: 1, 44: 1, 64: 1, 54: 1, 69: 1, 37: 1, 7: 1, 68: 1}
Relative Frequency: {18: 0.02, 12: 0.04, 27: 0.02, 62: 0.04, 29: 0.04, 19: 0.02, 3: 0.02, 60: 0.02, 25: 0.04, 88: 0.04, 81: 0.02, 43: 0.04, 31: 0.04, 9: 0.04, 83: 0.02, 77: 0.02, 20: 0.02, 48: 0.02, 32: 0.02, 35: 0.02, 50: 0.02, 76: 0.02, 28: 0.04, 94: 0.02, 92: 0.02, 96: 0.02, 51: 0.04, 26: 0.04, 33: 0.02, 58: 0.02, 70: 0.02, 41: 0.02, 44: 0.02, 64: 0.02, 54: 0.02, 69: 0.02, 37: 0.02, 7: 0.02, 68: 0.02}


### 2. Class Intervals (Grouping)

- **Class Width**:  
  **Formula**:  
  $$
  \text{class width} = \frac{\text{(max(data)} - \text{min(data))}}{\sqrt{n}}
  $$
  **Where**:
  -  max(data) - is the maximum value in your dataset.
  -  min(data) - is the minimum value in your dataset.
  -  n - is the total number of data points.

In [9]:
import math

min_data = min(data)
max_data = max(data)

def rounded_sqrt(num):
    square_root = math.sqrt(num)
    if square_root.is_integer():
        return int(square_root) 
    else:
        return math.ceil(square_root)  

num_classes = rounded_sqrt(n)

class_width = (max_data - min_data) / num_classes
print("Class Width:", "(", max_data, "-", min_data, ")", "/", num_classes)
print("Class Width:", class_width)

Class Width: ( 96 - 3 ) / 8
Class Width: 11.625


In [17]:
class Data_Class:
    def __init__(self, lower_limit, upper_limit):
        self.lower_limit = lower_limit
        self.upper_limit = upper_limit
        self.absolute_frequency = 0


def create_data_classes(num_classes, class_width):
    data_classes = []
    for i in range(num_classes):
        lower_limit = i * class_width
        upper_limit = lower_limit + class_width
        data_class = Data_Class(lower_limit, upper_limit)
        data_classes.append(data_class)
    return data_classes

def increment_frequency(data_classes, data):
    for number in data:
        for data_class in data_classes:
            if data_class.lower_limit <= number < data_class.upper_limit:
                data_class.absolute_frequency += 1

data_classes = create_data_classes(num_classes, class_width)
increment_frequency(data_classes, data)

print(num_classes)
print(class_width)
for dc in data_classes:
    print(f"lower_limit: {dc.lower_limit}, upper_limit: {dc.lower_limit}, absolute_frequency: {dc.absolute_frequency}")

8
11.625
lower_limit: 0.0, upper_limit: 0.0, absolute_frequency: 4
lower_limit: 11.625, upper_limit: 11.625, absolute_frequency: 5
lower_limit: 23.25, upper_limit: 23.25, absolute_frequency: 13
lower_limit: 34.875, upper_limit: 34.875, absolute_frequency: 6
lower_limit: 46.5, upper_limit: 46.5, absolute_frequency: 6
lower_limit: 58.125, upper_limit: 58.125, absolute_frequency: 6
lower_limit: 69.75, upper_limit: 69.75, absolute_frequency: 4
lower_limit: 81.375, upper_limit: 81.375, absolute_frequency: 4


### 3. Measures of Central Tendency

- **Mean (Arithmetic Mean)**:  
  **Formula**:  
  $$
  \text{Mean} = \frac{x_1 + x_2 + x_3 + \dots + x_n}{n}
  $$
  **Where**:
  - x1, x2, x3 - are the individual data points.
  - n - is the total number of data points.

In [None]:
mean = sum(data) / n
print("Mean:", sum(data), "/", n)
print("Mean:", mean)

- **Weighted Mean**:  
  **Formula**:  
  $$
  \text{Weighted Mean} = \frac{\sum_{i=1}^{n} x_i \cdot p_i}{\sum_{i=1}^{n} p_i}
  $$
  Where:
  - x_i - is each data point.
  - p_i - is the weight associated with each data point.

In [None]:
weighted_mean = sum(data) / n
print("Weighted Mean:", sum(data), "/", n)
print("Weighted Mean:", mean)

- **Median**:  
  **For Even Count**:  
  $$
  \text{Median} = \frac{\text{sorted data}[n//2 - 1] + \text{sorted data}[n//2]}{2}
  $$  
  **For Odd Count**:  
  $$
  \text{Median} = \text{sorted data}[n//2]
  $$
  Where:
  - Sorted data is the dataset arranged in ascending order.
  - n - is the total number of data points.

In [None]:
sorted_data = sorted(data)

if n % 2 == 1:
    median = sorted_data[n // 2]
else:
    median = (sorted_data[n // 2 - 1] + sorted_data[n // 2]) / 2

print("Median:", median)

- **Mode**:  
  The value that appears most frequently in the dataset.

In [None]:
mode = max(freq, key=freq.get)
print("Mode:", mode)

### 5. Median for Grouped Data (Md)

- **Formula**:  
  $$
  Md = L_{\text{inf}} + \frac{\left(\frac{n}{2} - F_{\text{ac previous}}\right)}{f_{\text{i class}}} \cdot \text{class width}
  $$

  Where:
  - \( L_{\text{inf}} \) is the lower limit of the median class.
  - \( F_{\text{ac previous}} \) is the cumulative frequency up to the class before the median class.
  - \( f_{\text{i class}} \) is the absolute frequency of the median class.
  - \( \text{class width} \) is the width of the class interval.
  - \( n \) is the total number of data points.

In [None]:
# 5. Median for Grouped Data (Md)

# Grouped data (class intervals and frequencies)
class_intervals = [(min_data + i * class_width, min_data + (i + 1) * class_width) for i in range(int((max_data - min_data) / class_width))]
frequencies = [sum(1 for age in data if interval[0] <= age < interval[1]) for interval in class_intervals]

# Calculate cumulative frequencies (manual calculation)
cumulative_freq = []
cumulative_sum = 0
for f in frequencies:
    cumulative_sum += f
    cumulative_freq.append(cumulative_sum)

# Median calculation (Md for grouped data)
n = sum(frequencies)
median_class_index = next(i for i, freq in enumerate(cumulative_freq) if cumulative_freq[i] >= n / 2)

L_inf = class_intervals[median_class_index][0]
F_ac_previous = cumulative_freq[median_class_index - 1] if median_class_index > 0 else 0
f_class = frequencies[median_class_index]
class_width = class_intervals[0][1] - class_intervals[0][0]  # Assuming equal width for all classes

Md = L_inf + ((n / 2 - F_ac_previous) / f_class) * class_width
print("Median for Grouped Data (Md):", Md)


### 4. Measures of Dispersion

- **Variance (s²)**:  
  **Formula**:  
  $$
  \text{Variance} = \frac{\sum_{i=1}^{n} f_i \cdot (x_i - \text{mean})^2}{n}
  $$
  Where:
  - f_i - is the frequency of the data point.
  - x_i - is the data point.
  - mean - is the mean of the dataset.
  - n - is the total number of data points.
- **Standard Deviation (s)**:  
  **Formula**:  
  $$
  \text{Standard Deviation} = \sqrt{\text{Variance}}
  $$
  It is simply the square root of the variance.

In [None]:
# Variance and Standard Deviation (manual calculation)
variance = sum((x - mean) ** 2 for x in data) / n
std_dev = variance ** 0.5

print("Variance:", variance)
print("Standard Deviation:", std_dev)


- **Coefficient of Variation (CV)**:  
  **Formula**:  
  $$
  CV = \frac{\text{Standard Deviation}}{\text{Mean}} \times 100
  $$
  This measures the relative dispersion in percentage terms.

In [None]:
# Coefficient of Variation (manual calculation)
cv = (std_dev / mean) * 100
print("Coefficient of Variation:", cv)