<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Regressão-Linear" data-toc-modified-id="Regressão-Linear-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Regressão Linear</a></span><ul class="toc-item"><li><span><a href="#Simulando-dados" data-toc-modified-id="Simulando-dados-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Simulando dados</a></span><ul class="toc-item"><li><span><a href="#Construindo-modelo" data-toc-modified-id="Construindo-modelo-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Construindo modelo</a></span><ul class="toc-item"><li><span><a href="#Com-statsmodels" data-toc-modified-id="Com-statsmodels-1.1.1.1"><span class="toc-item-num">1.1.1.1&nbsp;&nbsp;</span>Com statsmodels</a></span></li><li><span><a href="#Com-SKLEARN" data-toc-modified-id="Com-SKLEARN-1.1.1.2"><span class="toc-item-num">1.1.1.2&nbsp;&nbsp;</span>Com SKLEARN</a></span></li></ul></li><li><span><a href="#Fazendo-previsões-com-SKLEARN" data-toc-modified-id="Fazendo-previsões-com-SKLEARN-1.1.2"><span class="toc-item-num">1.1.2&nbsp;&nbsp;</span>Fazendo previsões com SKLEARN</a></span></li></ul></li><li><span><a href="#Com-dados-reais" data-toc-modified-id="Com-dados-reais-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Com dados reais</a></span><ul class="toc-item"><li><span><a href="#Utilizando-StatsModels" data-toc-modified-id="Utilizando-StatsModels-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Utilizando StatsModels</a></span></li><li><span><a href="#Utilizando-SKLEARN" data-toc-modified-id="Utilizando-SKLEARN-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Utilizando SKLEARN</a></span></li><li><span><a href="#Avaliando-Erro" data-toc-modified-id="Avaliando-Erro-1.2.3"><span class="toc-item-num">1.2.3&nbsp;&nbsp;</span>Avaliando Erro</a></span></li></ul></li></ul></li></ul></div>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Regressão Linear

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm

## Simulando dados

In [None]:
def simular_dado(media_X, desvpad_X, 
                 desvpad_E, A, B, 
                 samples, seed = None):
    """Simulate data from the linear model ``y = B + A * x + E``.

    Parameters
    ----------
    media_X, desvpad_X : float
        Mean and standard deviation of the normal distribution ``x`` is drawn from.
    desvpad_E : float
        Standard deviation of the zero-mean normal noise term ``E``.
    A, B : float
        Slope and intercept of the simulated line.
    samples : int
        Number of observations to draw.
    seed : int, optional
        When given, draws come from a dedicated ``np.random.default_rng(seed)``
        so the same seed always yields the same frame. When ``None`` (default),
        NumPy's global random state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per observation, with columns ``'x'`` and ``'y'``.
    """
    # Fall back to the global RNG when no seed is passed so that existing calls
    # (and any prior np.random.seed) keep behaving the same way.
    rng = np.random if seed is None else np.random.default_rng(seed)
    x = rng.normal(loc = media_X, scale = desvpad_X, size = samples)
    E = rng.normal(loc = 0, scale = desvpad_E, size = samples)
    y = B + A * x + E
    return pd.DataFrame({'x' : x, 'y' : y})

In [None]:
# Seed NumPy's global RNG once, before the first random draw, so the simulated
# samples below are the same on every Restart & Run All.
np.random.seed(42)
teste = simular_dado(media_X = 0, desvpad_X = 1, 
                     desvpad_E = 5, A = 10, B = 50, 
                     samples = 100)

In [None]:
# Pairwise view of x and y: KDE on the diagonal, regression fit on the off-diagonal panels.
sns.pairplot(teste, kind = 'reg', diag_kind = 'kde', height = 4)

### Construindo modelo

#### Com statsmodels

In [None]:
# Low-noise sample (noise std 0.1) around the line y = 10 + 2 * x.
teste = simular_dado(
    media_X = 0,
    desvpad_X = 1,
    desvpad_E = 0.1,
    A = 2,
    B = 10,
    samples = 100,
)

In [None]:
# Same pairwise plot as before, now for the low-noise sample.
sns.pairplot(teste, kind = 'reg', diag_kind = 'kde', height = 4)

In [None]:
# Response vector, then the regressors: statsmodels' OLS does not add an
# intercept by itself, hence the explicit add_constant on x.
y = teste['y']
X = sm.add_constant(teste['x'])

In [None]:
# Specify the OLS model (response first, regressors second) and estimate it.
modelo = sm.OLS(endog = y, exog = X)
lm_fit = modelo.fit()

In [None]:
# Display the regression summary table for the fitted OLS model.
lm_fit.summary()

In [None]:
def estimar_lm(data):
    """Fit an OLS regression of ``data['y']`` on ``data['x']`` with an intercept.

    The intercept comes from ``sm.add_constant``, which is applied to the
    ``'x'`` column before the model is built.

    Parameters
    ----------
    data : mapping of columns (e.g. the frame returned by ``simular_dado``)
        Must provide the columns ``'x'`` and ``'y'``.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.
    """
    resposta = data['y']
    regressores = sm.add_constant(data['x'])
    return sm.OLS(resposta, regressores).fit()

In [None]:
# High-noise scenario: noise std (10) is large relative to the slope (1),
# so the linear signal is hard to recover from 100 points.
teste_1 = simular_dado(media_X = 0, desvpad_X = 1,
                       desvpad_E = 10, A = 1, B = 0,
                       samples = 100)
teste1_fit = estimar_lm(teste_1)

sns.pairplot(teste_1, kind = 'reg', diag_kind = 'kde', height = 4)
# Last expression of the cell: the OLS summary for this sample.
teste1_fit.summary()

#### Com SKLEARN

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Unfitted scikit-learn estimator with default settings; note this rebinds
# `modelo`, which previously held the statsmodels OLS model.
modelo = LinearRegression()

In [None]:
# Double brackets keep X as a one-column DataFrame, while y stays a Series.
y = teste_1['y']
X = teste_1[['x']]
modelo.fit(X = X, y = y)

In [None]:
# Estimated slope (first coefficient) and intercept, one per line.
print(modelo.coef_[0], modelo.intercept_, sep = '\n')

### Fazendo previsões com SKLEARN

In [None]:
# Store the in-sample predictions next to the data; X was built from teste_1
# before this column existed, so it still holds only the 'x' feature.
teste_1['pred'] = modelo.predict(X)

In [None]:
# Inspect the simulated data together with the new 'pred' column.
teste_1

In [None]:
# Observed points, with the fitted regression line drawn in red over them.
sns.scatterplot(x = 'x', y = 'y', data = teste_1)
sns.lineplot(x = 'x', y = 'pred', color = 'red', data = teste_1)

## Com dados reais

### Utilizando StatsModels

In [None]:
# Load the dataset from a path relative to the notebook's working directory
# and show summary statistics for its numeric columns.
tb_olist = pd.read_csv('data/tb_diaria_olist.csv')
tb_olist.describe()

In [None]:
# Visual check of the relationship between per_atraso and avg_review.
sns.scatterplot(x = 'per_atraso', y = 'avg_review', data = tb_olist)

In [None]:
# per_atraso is multiplied by 100 before fitting — presumably turning a
# fraction into percentage points so the slope reads "per point"; confirm the
# column's scale against the data source.
y = tb_olist['avg_review']
X = sm.add_constant(tb_olist['per_atraso'] * 100)
modelo = sm.OLS(endog = y, exog = X)
lm_fit = modelo.fit()
lm_fit.summary()

### Utilizando SKLEARN

In [None]:
# Same regression with scikit-learn. Unlike the statsmodels cell, per_atraso is
# used on its original scale here (no * 100), so the slopes are not directly
# comparable. `lm_fit` is rebound from the statsmodels results to this estimator.
X = tb_olist[['per_atraso']]
y = tb_olist['avg_review']
lm_fit = LinearRegression()
lm_fit.fit(X = X, y = y)

In [None]:
# Estimated slope(s) of the scikit-learn model (one entry per feature).
lm_fit.coef_

In [None]:
# Predict avg_review for per_atraso = 0.5 (same scale as the column used to fit).
# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# name; a bare nested list triggers scikit-learn's feature-name mismatch warning.
lm_fit.predict(pd.DataFrame({'per_atraso' : [0.5]}))

### Avaliando Erro

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fix random_state so the train/test partition — and every error figure derived
# from it — is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Refit from scratch using only the training portion of the data.
lm_fit = LinearRegression()
lm_fit.fit(X = X_train, y = y_train)

In [None]:
X_test['pred_avg_score'] = lm_fit.predict(X_test)
X_test['real_avg_score'] = y_test
X_test['erro2'] = (X_test['real_avg_score'] - X_test['pred_avg_score'])**2

In [None]:
np.sqrt(X_test['erro2'].mean())