# Data Science Bootcamp
# <center> **Aula 12 -- Statistical Inference: Fitting Distributions** </center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

## Fitting a Distribution to a Data Sample

We use maximum likelihood estimation (MLE) to find the parameters of a distribution that best fit a dataset.

In [None]:
# Generate synthetic data from a standard normal distribution.
# A fixed seed keeps the sample (and every fit derived from it) reproducible
# across kernel restarts.
a = stats.norm.rvs(size=200, random_state=42)

In [None]:
# Visualise the sample: histogram, KDE curve and a rug of the individual observations.
fig, ax = plt.subplots()
sns.distplot(a, bins=20, hist=True, kde=True, rug=True, ax=ax)
plt.show()

In [None]:
# Sample mean and variance, for comparison with the fitted parameters
# (the fit reports a standard deviation, whereas this is the variance).
print(np.mean(a))
print(np.var(a))

In [None]:
# Maximum-likelihood fit of a normal distribution to the sample; returns the
# location (mean) and scale (standard deviation) estimates.
mu, sigma = stats.norm.fit(a)

In [None]:
# Show the fitted mean and standard deviation.
print(mu, sigma)

In [None]:
# Evaluate the fitted normal density on an evenly spaced grid of 71 points.
x = np.linspace(-3.5, 3.5, num=71)
f = stats.norm(loc=mu, scale=sigma).pdf(x)

In [None]:
# Overlay the fitted normal density on the empirical distribution of the sample.
fig, ax = plt.subplots()
sns.distplot(a, bins=20, hist=True, kde=True, rug=True, ax=ax)
ax.plot(x, f, label='fitted')
ax.legend()
plt.show()

## A Log-Normal Distribution

In [None]:
# Exponentiate the normal sample `a`; the resulting values follow a log-normal distribution.
b = np.exp(a)

In [None]:
# Visualise the log-normal sample: histogram, KDE curve and rug.
fig, ax = plt.subplots()
sns.distplot(b, bins=25, hist=True, kde=True, rug=True, ax=ax)
plt.show()

In [None]:
# Maximum-likelihood fit of a log-normal to `b`; scipy returns the estimates
# as a (shape, loc, scale) tuple.
params = stats.lognorm.fit(b)

In [None]:
# Display the fitted log-normal parameters.
params

In [None]:
# Evaluate the fitted log-normal density on a grid of 151 points covering [0, 15].
x = np.linspace(0., 15, num=151)
f = stats.lognorm(*params).pdf(x)

In [None]:
# Overlay the fitted log-normal density on the empirical distribution of `b`.
fig, ax = plt.subplots()
sns.distplot(b, bins=45, hist=True, kde=True, rug=True, ax=ax)
ax.plot(x, f, label='fitted')
ax.legend()
plt.show()

In [None]:
# Log-likelihood of the sample under the fitted log-normal:
# the sum of the log-density evaluated at every observation.
llhood = np.sum(stats.lognorm.logpdf(b, *params))

In [None]:
# Display the log-likelihood of the fitted log-normal.
llhood

In [None]:
# Number of observations in the log-normal sample.
size = len(b)

In [None]:
# Candidate families to compare, given as attribute names of scipy.stats distributions.
dist_names = ['lognorm', 'gamma', 'f', 'beta', 'gompertz']

In [None]:
# Fit every candidate distribution to the sample `b` by maximum likelihood, draw
# each fitted density over the data histogram, and record each fit's log-likelihood.
llhoods = []
fig, ax = plt.subplots()
# With the KDE enabled (the default) distplot normalises the histogram to a density,
# so the fitted PDFs are drawn unscaled in order to share the same y-axis.
sns.distplot(b, bins=25, hist=True, rug=True, ax=ax)
for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    # fit() returns (*shape_params, loc, scale); pdf/logpdf accept them positionally
    # in that order, so no special case is needed for families without shape
    # parameters. A loop-local name keeps the log-normal `params` fitted earlier
    # in the notebook from being overwritten.
    dist_params = dist.fit(b)
    # Plot against the grid `x` (not the array index) so the curves line up with the data.
    ax.plot(x, dist.pdf(x, *dist_params), label=dist_name)
    llhoods.append(dist.logpdf(b, *dist_params).sum())
# The previous limit of 47 was expressed in array-index units on a grid with
# step 0.1, which corresponds to x = 4.7 in data units.
ax.set_xlim(0, 4.7)
ax.legend(loc='upper right')
plt.show()

In [None]:
# Candidate names, in the same order as the entries of `llhoods`.
dist_names

In [None]:
# Log-likelihood of each fitted candidate (higher means a better fit to the sample).
llhoods

## Scipy Stats

In [None]:
from scipy import stats

In [None]:
from scipy.stats import norm

In [None]:
def _stats_members_of(kind):
    """Return the names in ``scipy.stats`` whose objects are instances of ``kind``."""
    return [name for name in dir(stats) if isinstance(getattr(stats, name), kind)]


# Distribution objects in scipy.stats are instances of rv_continuous / rv_discrete.
dist_continu = _stats_members_of(stats.rv_continuous)
dist_discrete = _stats_members_of(stats.rv_discrete)

print('number of continuous distributions: %d' % len(dist_continu))
print('number of discrete distributions:   %d' % len(dist_discrete))

In [None]:
# Names of the discrete distributions found in scipy.stats.
print(dist_discrete)

In [None]:
# Names of the continuous distributions found in scipy.stats.
print(dist_continu)

## Example 1

In [None]:
# Load the dataset from a CSV file located in the working directory.
data = pd.read_csv('understat_per_game.csv')

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Number of rows per league.
data['league'].value_counts()

In [None]:
# Number of rows per year.
data['year'].value_counts()

In [None]:
# Keep only the rows whose league is 'La_liga' and whose year is 2014.
dataLiga = data.loc[(data['league'] == 'La_liga') & (data['year'] == 2014)]

In [None]:
# Extract the `xG` column of the filtered table
# (presumably expected goals per game -- confirm against the dataset documentation).
xG = dataLiga.xG

In [None]:
# Empirical distribution of xG: histogram with a KDE curve, no rug; the title is left blank.
fig, ax = plt.subplots()
sns.distplot(xG, bins=25, hist=True, kde=True, rug=False, ax=ax)
ax.set_title('')
plt.show()

#### Fitting a gamma distribution

In [None]:
# Maximum-likelihood fit of a gamma distribution to xG; returns (shape, loc, scale).
params_gamma = stats.gamma.fit(xG)

In [None]:
# Display the fitted gamma parameters.
params_gamma

In [None]:
# Evaluate the fitted gamma density on a grid of 73 points covering [0, 7.2].
x = np.linspace(0, 7.2, num=73)
f = stats.gamma(*params_gamma).pdf(x)

In [None]:
# Overlay the fitted gamma density on the empirical distribution of xG.
fig, ax = plt.subplots()
sns.distplot(xG, kde=True, label='data', ax=ax)
ax.plot(x, f, label='fit gamma')
ax.legend()
plt.show()

In [None]:
# Log-likelihood of xG under the fitted gamma: sum of the log-density over all observations.
llhood = np.sum(stats.gamma.logpdf(xG, *params_gamma))
llhood

#### Fitting a Rayleigh distribution

In [None]:
# Maximum-likelihood fit of a Rayleigh distribution to xG; the family has no
# shape parameter, so the result is (loc, scale).
params_rayleigh = stats.rayleigh.fit(xG)

In [None]:
# Display the fitted Rayleigh parameters.
params_rayleigh

In [None]:
# Evaluate the fitted Rayleigh density on the same 73-point grid over [0, 7.2].
x = np.linspace(0, 7.2, num=73)
g = stats.rayleigh(*params_rayleigh).pdf(x)

In [None]:
# Compare both fitted densities (gamma and Rayleigh) against the empirical distribution of xG.
fig, ax = plt.subplots()
sns.distplot(xG, kde=True, label='data', ax=ax)
ax.plot(x, f, label=r'fit Gamma')
ax.plot(x, g, label=r'fit Rayleigh')
ax.legend()
plt.show()

In [None]:
# Log-likelihood of xG under the fitted Rayleigh: sum of the log-density over all observations.
llhood = np.sum(stats.rayleigh.logpdf(xG, *params_rayleigh))
llhood

#### Comparing several candidate distributions

In [None]:
# Number of observations in `xG`.
size = len(xG)
size

In [None]:
# Candidate families to compare, given as attribute names of scipy.stats distributions.
dist_names = ['gamma', 'f', 'beta']

In [None]:
# Fit every candidate distribution to xG by maximum likelihood, draw each fitted
# density over the data histogram, and record each fit's log-likelihood.
llhoods = []
fig, ax = plt.subplots()
# norm_hist=True puts the histogram on a density scale, so the fitted PDFs can be
# drawn unscaled on the same y-axis (a raw-count histogram would not match them).
sns.distplot(xG, kde=False, norm_hist=True, label='data', ax=ax)
for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    # fit() returns (*shape_params, loc, scale); pdf/logpdf accept them positionally
    # in that order, so no special case is needed for families without shape parameters.
    dist_params = dist.fit(xG)
    # Plot against the grid `x` (not the array index) so the curves line up with the data.
    ax.plot(x, dist.pdf(x, *dist_params), label=dist_name)
    llhoods.append(dist.logpdf(xG, *dist_params).sum())
ax.legend(loc='upper right')
plt.show()

In [None]:
# Log-likelihood of each fitted candidate, in the order of `dist_names`
# (higher means a better fit).
llhoods

## Example 2

In [None]:
# Load the dataset for this example from a CSV file located in the working directory.
dataraw = pd.read_csv('featuresdf.csv')

In [None]:
# Preview the first rows of the table.
dataraw.head()

In [None]:
# (number of rows, number of columns)
dataraw.shape

In [None]:
# Keep every row, and the columns from position 3 up to (but not including) the last one.
# NOTE(review): presumably this drops identifier / non-feature columns -- confirm against the CSV schema.
data = dataraw.iloc[:,3:-1]

In [None]:
# Pairwise scatter plots of the selected columns, with KDE curves on the diagonal.
# NOTE: despite the name `ax`, pairplot returns a seaborn PairGrid, not a matplotlib Axes.
ax = sns.pairplot(data, diag_kind='kde')

**Exercise:** fit a suitable distribution to one of the variables in the dataset above. Choose any numerical variable you like.

## Example 3

In [None]:
from statsmodels.datasets import elnino

In [None]:
# Load the El Nino table, cast YEAR to integer and use it as the row index.
dataraw = (
    elnino.load_pandas().data
    .assign(YEAR=lambda frame: frame['YEAR'].astype(int))
    .set_index('YEAR')
)

In [None]:
# Display the table transposed (the original columns become rows).
dataraw.transpose()

In [None]:
# Flatten the whole table into a single 1-D array of observations and show its shape.
data = np.ravel(dataraw.values)
data.shape

In [None]:
# Empirical distribution of the flattened observations: histogram, KDE curve and rug.
fig, ax = plt.subplots()
sns.distplot(data, bins=25, rug=True, ax=ax)
plt.show()

In [None]:
from scipy.stats import gaussian_kde

In [None]:
# Gaussian kernel density estimate of the observations (scipy's default bandwidth).
kernel = gaussian_kde(data)

In [None]:
# Evaluate the estimated density at every observed data point.
y = kernel.pdf(data)

In [None]:
# Sort the observations so the estimated density is drawn as a single left-to-right curve.
idx = data.argsort()
fig, ax = plt.subplots()
ax.plot(data[idx], y[idx])
plt.show()

In [None]:
#yy, xx = np.histogram(data, bins=25)

In [None]:
# `from pylab import *` was dropped: it floods the notebook namespace with numpy and
# pyplot names and silently rebinds variables defined in earlier cells (for example
# `size`, `f` and `norm`). The cells that follow use explicit `np.` / `plt.` prefixes.
from scipy.optimize import curve_fit

In [None]:
def gaussian(x, mu, sigma, c):
    """Unnormalised Gaussian bump ``c * exp(-(x - mu)**2 / (2 * sigma**2))``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the curve.
    mu : float
        Centre of the bump.
    sigma : float
        Width of the bump.
    c : float
        Peak height (the value at ``x == mu``).

    Returns
    -------
    Value(s) of the curve at ``x``, with the same shape as ``x``.
    """
    squared_dev = (x - mu) ** 2
    return c * np.exp(-squared_dev / (2. * sigma ** 2))


def binormal(x, mu1, sigma1, c1, mu2, sigma2, c2):
    """Sum of two Gaussian bumps, each with its own centre, width and height.

    The parameter order (mu1, sigma1, c1, mu2, sigma2, c2) is the order in which
    an optimiser such as ``curve_fit`` passes and returns the parameters.
    """
    first = gaussian(x, mu1, sigma1, c1)
    second = gaussian(x, mu2, sigma2, c2)
    return first + second

In [None]:
# Initial guess handed to curve_fit, ordered like binormal's parameters:
# (mu1, sigma1, c1, mu2, sigma2, c2).
expected = (21., 0.2, 0.5, 25., 2., 0.5)

In [None]:
# Least-squares fit of the two-Gaussian curve to the KDE values at the observed
# points, starting from the initial guess `expected`.
params, cov = curve_fit(binormal, xdata=data, ydata=y, p0=expected)

In [None]:
# Fitted parameters, ordered as (mu1, sigma1, c1, mu2, sigma2, c2).
params

In [None]:
# Estimated covariance matrix of the fitted parameters, as returned by curve_fit.
cov

In [None]:
# Evaluate the fitted two-Gaussian curve on a grid of 141 points covering [17, 31].
xx = np.linspace(17., 31., num=141)
ff = binormal(xx, *params)

In [None]:
# Overlay the fitted two-Gaussian curve on the empirical distribution of the observations.
fig, ax = plt.subplots()
sns.distplot(data, bins=25, label='data', ax=ax)
ax.plot(xx, ff, label='fit binormal')
ax.legend()
plt.show()