# Data Science Bootcamp
# <center> **Aula 17 -- Regression: Statsmodels Formula** </center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.graphics.gofplots import ProbPlot

from plot_diagnostic import diagnostic_plots

import warnings
# Suppress every Python warning for the rest of the session so the notebook
# output stays uncluttered. NOTE(review): this also hides deprecation and
# numerical warnings that may be relevant when debugging.
warnings.filterwarnings('ignore')

## Statsmodels API Formula

In [None]:
# Load the "Guerry" dataset from the R package "HistData" and keep only the
# data frame part of the returned dataset object.
df = sm.datasets.get_rdataset(dataname="Guerry", package="HistData").data

In [None]:
# Preview the first rows of the full dataset as loaded.
df.head()

In [None]:
# Restrict the data to the response (Lottery) and the three explanatory
# columns used in this notebook, then discard rows with missing values.
df = df.loc[
    :,
    ['Lottery', 'Literacy', 'Wealth', 'Region'],
].dropna()

In [None]:
# Preview the reduced data frame (Lottery, Literacy, Wealth, Region).
df.head()

### Sin categórica

In [None]:
# Response vector: the first column of df.
y = df.iloc[:, 0]

# Design matrix: every column except the first (response) and the last
# (the categorical one), with a constant column added for the intercept.
# An explicit interaction column could be appended afterwards, e.g.
# X['Literacy*Wealth'] = X['Literacy'] * X['Wealth'].
X = sm.add_constant(df.iloc[:, 1:-1])

In [None]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

In [None]:
# Preview the design matrix (constant column plus numeric predictors).
X.head()

### Model 1

In [None]:
# Model 1: Lottery explained by Literacy, Wealth and Region, with an
# intercept (added implicitly by the formula interface).
# NOTE(review): assumes Region holds string labels, in which case the formula
# interface encodes it as a categorical automatically -- compare with the
# explicit C(Region) variant further below.
mod1 = smf.ols(
    formula='Lottery ~ Literacy + Wealth + Region',
    data=df,
)
reg1 = mod1.fit()

In [None]:
# Full regression report for Model 1.
print(reg1.summary())

In [None]:
#betahat = reg1.params
#yhat = X.values @ betahat

In [None]:
# Observed-vs-predicted scatter plot for Model 1.

# Take the predictions from the fitted results object: Model 1 contains the
# categorical Region terms, which the plain numeric design matrix X lacks, so
# a manual `X @ params` product would not match the model.
yhat = reg1.fittedvalues

# Common axis range covering both the observed and the predicted values
# (smallest of the minima, largest of the maxima).
minn = min(y.min(), yhat.min())
maxx = max(y.max(), yhat.max())

plt.figure(figsize=(5,5))
plt.plot(y, yhat, 'ko', label='observed')
# 45-degree reference line: points on it are predicted exactly.
plt.plot([minn,maxx], [minn,maxx], 'b--')
plt.xlabel('observed')
plt.ylabel('predicted')
plt.xlim([minn-1, maxx+1])
plt.ylim([minn-1,maxx+1])
plt.legend()
plt.show()

### Model 2

In [None]:
# Model 2: numeric predictors only (Literacy and Wealth), with an intercept.
mod2 = smf.ols(
    formula='Lottery ~ Literacy + Wealth',
    data=df,
)
reg2 = mod2.fit()

In [None]:
# Full regression report for Model 2.
print(reg2.summary())

### Model 3

In [None]:
# Model 3: same predictors as Model 1, but Region is explicitly marked as a
# categorical variable with C(...).
mod3 = smf.ols(
    formula='Lottery ~ Literacy + Wealth + C(Region)',
    data=df,
)
reg3 = mod3.fit()

In [None]:
# Full regression report for Model 3.
print(reg3.summary())

### Model 4

In [None]:
# Model 4: Literacy and Wealth only; the trailing "- 1" removes the
# intercept, so the fit is forced through the origin.
mod4 = smf.ols(
    formula='Lottery ~ Literacy + Wealth - 1',
    data=df,
)
reg4 = mod4.fit()

In [None]:
# Full regression report for Model 4.
print(reg4.summary())

### Model 5

In [None]:
# Model 5: a single regressor, the Literacy-by-Wealth interaction (":"
# yields only the product term, without the main effects); no intercept.
mod5 = smf.ols(
    formula='Lottery ~ Literacy : Wealth - 1',
    data=df,
)
reg5 = mod5.fit()

In [None]:
# Full regression report for Model 5.
print(reg5.summary())

### Model 6

In [None]:
# Model 6: "Literacy * Wealth" expands to both main effects plus their
# interaction (Literacy + Wealth + Literacy:Wealth); no intercept.
mod6 = smf.ols(
    formula='Lottery ~ Literacy * Wealth - 1',
    data=df,
)
reg6 = mod6.fit()

In [None]:
# Full regression report for Model 6.
print(reg6.summary())

### Model 7

In [None]:
# Model 7: adds transformed versions of Literacy (log and square, computed
# by the NumPy functions named inside the formula) next to the linear terms
# and the Literacy:Wealth interaction; no intercept.
mod7 = smf.ols(
    formula='Lottery ~ np.log(Literacy) + np.square(Literacy) + Literacy + Wealth + Literacy:Wealth - 1',
    data=df,
)
reg7 = mod7.fit()

In [None]:
# Full regression report for Model 7.
print(reg7.summary())

### Model 8

In [None]:
# Model 8: full three-way interaction between Literacy, Wealth and the
# categorical Region, without intercept. "Literacy*Wealth*C(Region)" already
# expands to all main effects and lower-order interactions, so the three
# leading terms are redundant; the formula is kept exactly as written.
mod8 = smf.ols(
    formula='Lottery ~ Literacy + Wealth + C(Region) + Literacy*Wealth*C(Region) - 1',
    data=df,
)
reg8 = mod8.fit()

In [None]:
# Full regression report for Model 8.
print(reg8.summary())

In [None]:
# Estimated coefficients and R^2 of the model fitted just above (Model 8).
# The fitted results in this notebook are named reg1..reg8; there is no
# `results5` object, so referencing it raised a NameError.
print('Parameters: ', reg8.params)
print('R2: ', reg8.rsquared)

---

### Con categórica

In [None]:
# One-hot encode the categorical column(s) of df. dtype=int forces numeric
# 0/1 indicators: recent pandas versions default to boolean dummies, which
# would turn X.values into an object array and make the formula interface
# treat each dummy as a categorical instead of a numeric regressor.
dumm = pd.get_dummies(df, dtype=int)

# Design matrix: every column of the encoded frame except the first one (the
# response), with a constant column added for the intercept.
X = dumm.iloc[:,1:]
X = sm.add_constant(X)

# Response vector: the first column of df.
y = df.iloc[:,0]

In [None]:
# Sanity check: X (now including the dummy columns) and y must have the same
# number of rows.
print(X.shape, y.shape)

In [None]:
# Preview the design matrix with the dummy-encoded columns.
X.head()

In [None]:
# Same regression as 'Lottery ~ Literacy + Wealth + Region' on df, but written
# against the manually dummy-encoded frame, with one term per Region dummy.
# NOTE(review): all five Region dummies are included together with the
# implicit intercept; if every row belongs to exactly one region the dummies
# sum to the constant column, so the individual coefficients are not uniquely
# identified -- confirm this is intended for the demonstration.
model = smf.ols(
    formula='Lottery ~ Literacy + Wealth + Region_C + Region_E + Region_N + Region_S + Region_W',
    data=dumm,
)
results = model.fit()

In [None]:
# Full regression report for the dummy-encoded model.
print(results.summary())

In [None]:
# Estimated coefficients and R^2 of the dummy-encoded model.
print('Parameters: ', results.params)
print('R2: ', results.rsquared)

In [None]:
# Manual prediction: multiply the design matrix by the estimated coefficients.
# NOTE(review): this relies on the columns of X being in the same order as
# the entries of results.params (constant first, then the predictors in
# formula order) -- confirm if either X or the formula is changed.
betahat = results.params
yhat = X.values @ betahat

In [None]:
# Observed-vs-predicted scatter plot for the dummy-encoded model.

# Common axis range covering both the observed and the predicted values
# (smallest of the minima, largest of the maxima) so that no point and no
# part of the reference line is cut off.
minn = min(y.min(), yhat.min())
maxx = max(y.max(), yhat.max())

plt.figure(figsize=(5,5))
plt.plot(y, yhat, 'ko', label='observed')
# 45-degree reference line: points on it are predicted exactly.
plt.plot([minn,maxx], [minn,maxx], 'b--')
plt.xlabel('observed')
plt.ylabel('predicted')
plt.xlim([minn-1, maxx+1])
plt.ylim([minn-1,maxx+1])
plt.legend()
plt.show()