# X500 - Introdução à Ciência de Dados

Prof. Erneson Alves de Oliveira<br>
Programa de Pós-Graduação em Informática Aplicada<br>
Universidade de Fortaleza

# Introdução à Estatística (Parte II)

# 1 Quantis

- Quartis ($q_1, q_2, q_3$): dividem os dados em 4 partes iguais (25%, 50% e 75%);

- Decis ($d_1, \dots, d_9$): dividem os dados em 10 partes iguais (de 10% em 10%);

- Percentis ($p_1, \dots, p_{99}$): dividem os dados em 100 partes iguais (de 1% em 1%).

Logo,

\begin{equation}
q_2(x) = d_5(x) = p_{50}(x) = \mathrm{mediana}(x)
\end{equation}

Nesse contexto, o intervalo interquartil (IIQ) é definido por:

\begin{equation}
IIQ = q_3 - q_1
\end{equation}

In [None]:
import math

def calcula_quantis(x, q = 4):
    """Return the q - 1 cut points that split the data into q equal parts.

    The k-th cut point (k = 1, ..., q - 1) sits at the zero-based position
    ``n * k / q - 1`` of the sorted data, where ``n`` is the number of
    observations. When that position is a whole number, the cut point is the
    average of the value at that position and the next one; otherwise it is
    the value at the position rounded up.

    Args:
        x: Iterable of numeric observations, in any order.
        q: Number of equal parts (4 = quartiles, 10 = deciles,
            100 = percentiles).

    Returns:
        List with the q - 1 quantiles, for k = 1, ..., q - 1.
    """
    dados = sorted(x)
    total = len(dados)

    def ponto_de_corte(k):
        posicao = total * k / q - 1
        abaixo, acima = math.floor(posicao), math.ceil(posicao)
        if abaixo != acima:
            # Fractional position: take the next observation up.
            return dados[acima]
        # Whole position: average the two neighbouring observations.
        return (dados[abaixo] + dados[abaixo + 1]) / 2

    return [ponto_de_corte(k) for k in range(1, q)]

# Sample data: the integers 0 through 9.
x = list(range(10))
print(x)

# Quartiles (q defaults to 4).
quartis = calcula_quantis(x)
print(quartis)

# Deciles.
decis = calcula_quantis(x, q = 10)
print(decis)

# Percentiles.
percentis = calcula_quantis(x, q = 100)
print(percentis)

# The k-th quantile is stored at index k - 1, so these are q_2, d_5 and p_50,
# which all coincide with the median.
print(quartis[2 - 1], decis[5 - 1], percentis[50 - 1])

# Interquartile range: q_3 - q_1.
iiq = quartis[3 - 1] - quartis[1 - 1]
print(iiq)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 4.5, 7]
[0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2.5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3.5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4.5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5.5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6.5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7.5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8.5, 9, 9, 9, 9, 9, 9, 9, 9, 9]
4.5 4.5 4.5
5


In [None]:
import statistics as stats

# Quartiles (n is left at its default) using the 'exclusive' method.
quartis = stats.quantiles(x, method = 'exclusive')
print(quartis)

# Same quartiles with the 'inclusive' method; this overwrites quartis.
quartis = stats.quantiles(x, method = 'inclusive')
print(quartis)

# Deciles, with the method left at its default.
decis = stats.quantiles(x, n = 10)
print(decis)

# Percentiles, with the method left at its default.
# NOTE: with only 10 observations the extreme percentiles fall outside the
# data range (the printed list starts below 0 and ends above 9).
percentis = stats.quantiles(x, n = 100)
print(percentis)

# q_2, d_5 and p_50 (index k - 1 holds the k-th quantile) are all the median.
print(quartis[2 - 1], decis[5 - 1], percentis[50 - 1])

[1.75, 4.5, 7.25]
[2.25, 4.5, 6.75]
[0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9]
[-0.89, -0.78, -0.67, -0.56, -0.45, -0.34, -0.23, -0.12, -0.01, 0.1, 0.21, 0.32, 0.43, 0.54, 0.65, 0.76, 0.87, 0.98, 1.09, 1.2, 1.31, 1.42, 1.53, 1.64, 1.75, 1.86, 1.97, 2.08, 2.19, 2.3, 2.41, 2.52, 2.63, 2.74, 2.85, 2.96, 3.07, 3.18, 3.29, 3.4, 3.51, 3.62, 3.73, 3.84, 3.95, 4.06, 4.17, 4.28, 4.39, 4.5, 4.61, 4.72, 4.83, 4.94, 5.05, 5.16, 5.27, 5.38, 5.49, 5.6, 5.71, 5.82, 5.93, 6.04, 6.15, 6.26, 6.37, 6.48, 6.59, 6.7, 6.81, 6.92, 7.03, 7.14, 7.25, 7.36, 7.47, 7.58, 7.69, 7.8, 7.91, 8.02, 8.13, 8.24, 8.35, 8.46, 8.57, 8.68, 8.79, 8.9, 9.01, 9.12, 9.23, 9.34, 9.45, 9.56, 9.67, 9.78, 9.89]
4.5 4.5 4.5


In [None]:
import numpy as np

# Quartiles of x under three of NumPy's quantile estimation methods.
# 'linear' prints the same values as statistics.quantiles 'inclusive'.
quartis = np.quantile(x, [0.25, 0.5, 0.75], method = 'linear')
print(quartis)
# 'weibull' prints the same values as statistics.quantiles 'exclusive'.
quartis = np.quantile(x, [0.25, 0.5, 0.75], method = 'weibull')
print(quartis)
# 'averaged_inverted_cdf' prints the same values as calcula_quantis(x).
quartis = np.quantile(x, [0.25, 0.5, 0.75], method = 'averaged_inverted_cdf')
print(quartis)

[2.25 4.5  6.75]
[1.75 4.5  7.25]
[2.  4.5 7. ]


Para saber mais sobre esses métodos: https://www.jstor.org/stable/2684934

# 2 Associação entre variáveis

## 2.1 Covariância

- Covariância Populacional:

\begin{equation}
\sigma_{xy} = \frac{1}{N}\sum_{i=1}^N (x_i - \mu_x)(y_i - \mu_y)
\end{equation}

- Covariância Amostral:

\begin{equation}
s_{xy} = \frac{1}{n-1}\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})
\end{equation}

Veja também: https://www.youtube.com/watch?v=qtaqvPAeEJY

In [None]:
def media(x):
    """Return the arithmetic mean of the observations in x.

    Args:
        x: Sized iterable of numbers.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If x is empty.
    """
    soma = sum(x)
    quantidade = len(x)
    return soma / quantidade

def covariancia(x, y, amostral = False):
    """Return the covariance between two equally sized samples.

    Args:
        x: Sequence of numeric observations of the first variable.
        y: Sequence of numeric observations of the second variable; must
            have the same length as x.
        amostral: If True, return the sample covariance (divide by n - 1);
            otherwise return the population covariance (divide by n).

    Returns:
        The covariance as a float.

    Raises:
        ValueError: If x and y have different lengths.
        ZeroDivisionError: If the inputs are empty, or if amostral is True
            and there is a single observation.
    """
    n = len(x)
    if len(y) != n:
        # zip() would silently drop the unmatched tail while the means and
        # the divisor still used every element, producing a wrong value.
        raise ValueError(f'x e y devem ter o mesmo tamanho ({n} != {len(y)})')

    x_barra = media(x)
    y_barra = media(y)

    # Accumulate the sum of the products of the deviations from the means.
    S = 0.
    for x_i, y_i in zip(x, y):
        S += (x_i - x_barra) * (y_i - y_barra)

    divisor = n - 1 if amostral else n
    return S / divisor

# x runs from 0 to 9 and y is the same sequence reversed.
x = list(range(10))
y = list(range(10))[::-1]

print(x)
print(y)
# Population covariance (the default), then the sample covariance.
print(covariancia(x, y))
print(covariancia(x, y, amostral = True))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-8.25
-9.166666666666666


In [None]:
import statistics as stats
# Prints the same value as covariancia(x, y, amostral = True), i.e. the
# sample covariance.
print(stats.covariance(x, y))

-9.166666666666666


In [None]:
# Stack x and y as the two rows of a 2-D array.
M = np.array([x, y])
print(M)

# np.cov(M) prints a 2x2 matrix; its off-diagonal entry [0, 1] matches the
# sample covariance between x and y computed earlier.
print(np.cov(M))
print(np.cov(M)[0, 1])

[[0 1 2 3 4 5 6 7 8 9]
 [9 8 7 6 5 4 3 2 1 0]]
[[ 9.16666667 -9.16666667]
 [-9.16666667  9.16666667]]
-9.166666666666666


## 2.2 Coeficiente de Correlação de Pearson

\begin{equation}
r_{xy} = \frac{s_{xy}}{s_x s_y} = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^n (x_i - \bar{x})^2} \sqrt{\sum_{i=1}^n (y_i - \bar{y})^2}}
\end{equation}

Veja também: https://www.youtube.com/watch?v=xZ_z8KWkhXE

![picture](https://drive.google.com/thumbnail?id=1lNukn_tMdRmzgtrz8IQQEOC6b_YVrEyS&sz=w600)

In [None]:
def variancia(x, amostral = False):
    """Return the variance of the observations in x.

    Args:
        x: Sized, re-iterable collection of numbers.
        amostral: If True, divide the sum of squared deviations by n - 1
            (sample variance); otherwise divide by n (population variance).

    Returns:
        The variance as a float.
    """
    centro = media(x)

    # Sum of the squared deviations from the mean.
    soma_quadrados = 0.
    for valor in x:
        desvio = valor - centro
        soma_quadrados += desvio * desvio

    divisor = len(x) - 1 if amostral else len(x)
    return soma_quadrados / divisor

def desvio_padrao(x, amostral = False):
    """Return the standard deviation of x, the square root of its variance.

    Args:
        x: Collection of numbers, forwarded to variancia.
        amostral: Forwarded to variancia; selects the sample (True) or
            population (False) variance.
    """
    var = variancia(x, amostral = amostral)
    return math.sqrt(var)

def correlacao(x, y):
    """Return the Pearson correlation coefficient between x and y.

    The coefficient is the population covariance divided by the product of
    the population standard deviations.

    Args:
        x: Numeric observations of the first variable.
        y: Numeric observations of the second variable.

    Returns:
        The correlation coefficient, or 0 when either standard deviation is
        not strictly positive (the ratio would be undefined).
    """
    sx, sy = desvio_padrao(x), desvio_padrao(y)

    # Guard written as a negation so that any non-positive (or NaN) spread
    # takes the fallback branch.
    if not (sx > 0 and sy > 0):
        return 0

    return covariancia(x, y) / (sx * sy)

# x runs from 0 to 9 and y is the same sequence reversed, so the two
# variables are perfectly negatively related (the printed result is -1.0).
x = list(range(10))
y = list(range(10))[::-1]

print(x)
print(y)
print(correlacao(x, y))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-1.0


In [None]:
# Same coefficient computed by the standard library; prints -1.0 as well.
print(stats.correlation(x, y))

-1.0


In [None]:
# Stack x and y as the two rows of a 2-D array.
# NOTE(review): a plain nested list [x, y] is presumably accepted too - confirm.
M = np.array([x, y])
print(M)

# np.corrcoef(M) prints a 2x2 matrix; the off-diagonal entry [0, 1] is the
# coefficient between x and y (here -1 up to floating-point rounding).
print(np.corrcoef(M))
print(np.corrcoef(M)[0, 1])

[[0 1 2 3 4 5 6 7 8 9]
 [9 8 7 6 5 4 3 2 1 0]]
[[ 1. -1.]
 [-1.  1.]]
-0.9999999999999999


In [None]:
# Import the submodule explicitly: a bare ``import scipy`` is not guaranteed
# to make ``scipy.stats`` available on every SciPy version.
import scipy.stats

# pearsonr returns a result object; ``statistic`` is its documented attribute
# for the correlation coefficient (``pvalue`` holds the p-value).
print(scipy.stats.pearsonr(x, y).statistic)

-0.9999999999999997
