In [None]:
import numpy as np
import pandas as pd
import math
from typing import List
from collections import Counter
from scratch.linear_algebra import sum_of_squares, dot

# Central Tendencies

### 1- Mean

In [None]:
def _mean_value(xs:List[float])->float:
    return np.mean(xs)

### 2- Median

In [None]:
def _even_median(xs:List[float])->float:
    """if len(xs) is odd, the median is the middle element"""
    sorted_xs=sorted(xs)
    midpoint=len(xs)//2
    return (sorted_xs[midpoint-1] + sorted_xs[midpoint])/2

In [None]:
def _odd_median(xs: List[float])->float:
    """if len(xs) us even, it's the average of the middle two elements"""
    return sorted(xs)[len(sorted_xs//2)]

In [None]:
def median(xs:List[float])->float:
    """Return the middle-most value of ``xs``.

    Dispatches on the parity of ``len(xs)``: odd lengths go to
    ``_odd_median`` and even lengths go to ``_even_median``.
    """
    if len(xs) % 2 != 0:
        return _odd_median(xs)
    return _even_median(xs)

Note: The naive way of finding the median costs O(N log N) time because it sorts the list. A more efficient approach is the Quickselect algorithm, which finds the median in O(N) time on average (its worst case is O(N^2)).

### 3- Quantile

In [None]:
def quantile(xs:List[float], p:float)->float:
    """Return the value at fraction ``p`` of the sorted data.

    Args:
        xs: the data; it is sorted into a new list, not modified in place.
        p: the fraction of the way through the sorted data, in ``[0, 1]``
            (for example ``0.25`` for the 25th percentile).

    Returns:
        The element of ``sorted(xs)`` at index ``int(p * len(xs))``, with the
        index capped at the last element so that ``p == 1`` yields the maximum.

    Raises:
        ValueError: if ``p`` is outside ``[0, 1]``.
        IndexError: if ``xs`` is empty.
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must be between 0 and 1, got {p}")
    # int(p * len(xs)) equals len(xs) when p == 1, which is one past the end;
    # cap it at the last valid index.
    p_index = min(int(p * len(xs)), len(xs) - 1)
    return sorted(xs)[p_index]

### 4- Mode

In [None]:
def mode(xs:List[float])->List[float]:
    """Return the most frequent value(s) in ``xs``.

    A list is returned because several values can tie for the highest count.
    Values appear in the order ``Counter`` yields them, which is the order
    they were first encountered in ``xs``.

    Returns:
        Every value whose count equals the maximum count, or an empty list
        when ``xs`` is empty.
    """
    counts = Counter(xs)
    if not counts:
        # max() on an empty sequence raises ValueError; no data means no mode.
        return []
    max_count = max(counts.values())
    return [x_i for x_i, count in counts.items() if count == max_count]

# Dispersion

### 1- Range

In [None]:
def data_range(xs:List[float])->float:
    """Return the spread of ``xs`` as its largest value minus its smallest."""
    highest = max(xs)
    lowest = min(xs)
    return highest - lowest

### 2- Variance

In [None]:
def de_mean(xs:List[float])->List[float]:
    """Translate ``xs`` by subtracting its mean from every element.

    Returns a new list; the mean is computed with ``np.mean``.
    """
    centre = np.mean(xs)
    shifted = []
    for value in xs:
        shifted.append(value - centre)
    return shifted

In [None]:
def variance(xs:List[float])->float:
    """Return the sum of squared deviations from the mean divided by ``n - 1``.

    This is almost the average squared deviation, except the divisor is
    ``n - 1`` rather than ``n``. An assertion requires at least two elements.
    """
    count = len(xs)
    assert count>=2, """variance requires at least two elements"""
    squared_total = sum_of_squares(de_mean(xs))
    return squared_total/(count-1)

### 3- Standard Deviation

In [None]:
def standard_deviation(xs:List[float])->float:
    """Return the square root of ``variance(xs)``."""
    spread_squared = variance(xs)
    return math.sqrt(spread_squared)

In [None]:
# A more robust way to measure the spread of the data: it avoids the effect of
# outliers, which is a problem with the standard deviation and data range
# techniques.
def interquartile_range(xs:List[float])->float:
    """Return the difference between the 75th and the 25th percentile of ``xs``."""
    upper_quartile = quantile(xs, 0.75)
    lower_quartile = quantile(xs, 0.25)
    return upper_quartile - lower_quartile

# Correlation

### 1- Covariance

In [None]:
def covariance(xs:List[float], ys:List[float])->float:
    """Return the dot product of the de-meaned inputs divided by ``n - 1``.

    An assertion requires ``xs`` and ``ys`` to have the same length.
    """
    assert len(xs) == len(ys)
    centred_xs = de_mean(xs)
    centred_ys = de_mean(ys)
    return dot(centred_xs, centred_ys)/(len(xs)-1)

### 2- Correlation

In [None]:
def correlation(xs:List[float], ys:List[float])->float:
    """Measure how much ``xs`` and ``ys`` vary in tandem about their means.

    Computed as the covariance divided by both standard deviations.

    Returns:
        ``covariance(xs, ys) / stddev_x / stddev_y`` when both standard
        deviations are positive, otherwise ``0.0``.
    """
    # The original signature separated the parameters with "." instead of ",",
    # which is a syntax error.
    stddev_x = standard_deviation(xs)
    stddev_y = standard_deviation(ys)
    if stddev_x > 0 and stddev_y > 0:
        return covariance(xs, ys) / stddev_x / stddev_y
    # If either input has no variation, the correlation is taken to be zero.
    return 0.0