# Data Science Bootcamp
# <center> **Aula 13 -- Comparing two Distributions** </center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
from scipy import stats

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')

## Compare two Distributions

In [None]:
# Seed NumPy's global random state so the draws that follow repeat on every run
np.random.seed(200)

In [None]:
# Two synthetic normal samples of 100 draws each: `a` ~ N(0, 1) and
# `b` ~ N(0.3, 1), i.e. the same spread with a small shift in the mean
a = stats.norm(loc=0., scale=1.).rvs(size=100)
b = stats.norm(loc=0.3, scale=1.).rvs(size=100)

In [None]:
# Overlay samples a and b (histogram, KDE and rug marks). Each sample gets a
# label and a legend is drawn so the two overlaid distributions can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot` / `sns.rugplot` when upgrading.
plt.figure()
sns.distplot(a, bins=20, hist=True, kde=True, rug=True, label='sample a')
sns.distplot(b, bins=20, hist=True, kde=True, rug=True, label='sample b')
plt.legend()
plt.show()

In [None]:
# Re-seed NumPy's global random state so the next pair of samples is reproducible
np.random.seed(666)

In [None]:
# Two synthetic normal samples of 100 draws each: `c` ~ N(0, 1) and
# `d` with mean 0.5 and standard deviation 1.5 (shifted and wider)
c = stats.norm(loc=0., scale=1.).rvs(size=100)
d = stats.norm(loc=0.5, scale=1.5).rvs(size=100)

In [None]:
# Overlay samples c and d (histogram, KDE and rug marks). Each sample gets a
# label and a legend is drawn so the two overlaid distributions can be told apart.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot` / `sns.rugplot` when upgrading.
plt.figure()
sns.distplot(c, bins=20, hist=True, kde=True, rug=True, label='sample c')
sns.distplot(d, bins=20, hist=True, kde=True, rug=True, label='sample d')
plt.legend()
plt.show()

#### Plot differences between the sorted samples

In [None]:
# Element-wise gap between the sorted samples: i-th smallest value of a
# minus i-th smallest value of b
difab = np.subtract(np.sort(a), np.sort(b))

In [None]:
# Distribution of the sorted-sample differences between a and b.
# The plot carries a label so that plt.legend() has an entry to show
# (without any labelled artist the legend comes out empty).
plt.figure()
sns.distplot(difab, bins=25, hist=True, kde=True, rug=True, kde_kws={'bw': 0.15},
             label='sort(a) - sort(b)')
plt.legend()
plt.show()

In [None]:
# Element-wise gap between the sorted samples: i-th smallest value of c
# minus i-th smallest value of d
difcd = np.subtract(np.sort(c), np.sort(d))

In [None]:
# Distribution of the sorted-sample differences between c and d.
# The plot carries a label so that plt.legend() has an entry to show
# (without any labelled artist the legend comes out empty).
plt.figure()
sns.distplot(difcd, bins=25, hist=True, kde=True, rug=True, kde_kws={'bw': 0.15},
             label='sort(c) - sort(d)')
plt.legend()
plt.show()

#### Compare Empirical (Cumulative) Distribution Functions

In [None]:
def ecdf(x):
    """Return the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    x : array_like
        Sample of observations; anything ``np.asarray`` accepts. Input with
        more than one dimension is flattened before sorting.

    Returns
    -------
    xx : numpy.ndarray
        The observations sorted in ascending order.
    yy : numpy.ndarray
        ``yy[i]`` is the fraction of observations less than or equal to
        ``xx[i]``. The values therefore reach 1 at the largest observation,
        and tied observations share the same value.
    """
    xx = np.sort(np.asarray(x), axis=None)
    n = xx.shape[0]
    # On sorted data the right-side insertion index of a value equals the
    # number of observations <= that value, which replaces the O(n^2)
    # counting loop with a single vectorized call.
    yy = np.searchsorted(xx, xx, side='right') / n
    return xx, yy

In [None]:
# Empirical CDFs of samples a and b: sorted values and their cumulative fractions
(aa, cdfa), (bb, cdfb) = ecdf(a), ecdf(b)

In [None]:
# Draw both empirical CDFs on one figure, one labelled curve per sample
plt.figure()
for _xs, _ys, _label in ((aa, cdfa, 'ecdf sample a'),
                         (bb, cdfb, 'ecdf sample b')):
    plt.plot(_xs, _ys, label=_label)
plt.legend()
plt.show()

In [None]:
# Empirical CDFs of samples c and d: sorted values and their cumulative fractions
(cc, cdfc), (dd, cdfd) = ecdf(c), ecdf(d)

In [None]:
# Draw both empirical CDFs on one figure, one labelled curve per sample
plt.figure()
for _xs, _ys, _label in ((cc, cdfc, 'ecdf sample c'),
                         (dd, cdfd, 'ecdf sample d')):
    plt.plot(_xs, _ys, label=_label)
plt.legend()
plt.show()

#### Compare via QQ plots

In [None]:
from scipy.stats import probplot
import statsmodels.api as sm

In [None]:
# Two-sample QQ plot of a against b with a 45-degree reference line.
# The axes are created up front and handed to statsmodels: when no `ax` is
# given, qqplot_2samples opens its own figure, which would leave the 5x5
# figure blank and draw the QQ plot at the default size instead.
fig, ax = plt.subplots(figsize=(5, 5))
sm.qqplot_2samples(a, b, xlabel=None, ylabel=None, line='45', ax=ax)
plt.show()

In [None]:
# Two-sample QQ plot of c against d with a 45-degree reference line.
# The axes are created up front and handed to statsmodels: when no `ax` is
# given, qqplot_2samples opens its own figure, which would leave the 5x5
# figure blank and draw the QQ plot at the default size instead.
fig, ax = plt.subplots(figsize=(5, 5))
sm.qqplot_2samples(c, d, xlabel=None, ylabel=None, line='45', ax=ax)
plt.show()

## Kolmogorov-Smirnov test for comparing two samples

In [None]:
from scipy.stats import ks_2samp

In [None]:
# Two-sample KS test on a vs. b; left as a bare expression so the notebook
# echoes the returned statistic and p-value
stats.ks_2samp(a, b)

In [None]:
def ks_test(a, b, alpha=0.05):
    """Run a two-sample Kolmogorov-Smirnov test and print the decision.

    Parameters
    ----------
    a, b : array_like
        The two samples handed to ``ks_2samp``.
    alpha : float, optional
        Significance level. The null hypothesis is rejected when the p-value
        is not greater than ``alpha``. Defaults to 0.05.

    Returns
    -------
    None
        The report (in Spanish) is written to standard output.
    """
    # Only the p-value drives the decision; the KS statistic is not reported.
    _, pv = ks_2samp(a, b)
    print('Resultado de la prueba Kolmogorov-Smirnov')
    # The message prints a percent sign, so scale 1 - alpha by 100
    # (alpha=0.05 gives "95%"; the bare fraction would read "0.95%").
    print('con {:g}% de confianza:'.format(100. * (1. - alpha)))
    print('p-valor = ', pv)
    if pv > alpha:
        print('No rechazamos Ho. Las distribuciones son iguales.')
    else:
        print('Rechazamos Ho. Las distribuciones son diferentes.')

In [None]:
# Print the KS decision for samples a and b at a 2% significance level
ks_test(a=a, b=b, alpha=0.02)

In [None]:
# Two-sample KS test on c vs. d; left as a bare expression so the notebook
# echoes the returned statistic and p-value
stats.ks_2samp(c, d)

In [None]:
# Print the KS decision for samples c and d at a 2% significance level
ks_test(a=c, b=d, alpha=0.02)