<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Multiple-Linear-Regression" data-toc-modified-id="Multiple-Linear-Regression-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Multiple Linear Regression</a></span><ul class="toc-item"><li><span><a href="#Simulando-Dados" data-toc-modified-id="Simulando-Dados-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Simulando Dados</a></span></li><li><span><a href="#Dados-Reais" data-toc-modified-id="Dados-Reais-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Dados Reais</a></span></li></ul></li></ul></div>

In [None]:
import os
import psycopg2, random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine

# Prefer a connection URL supplied through the environment so credentials do not
# have to live in the notebook. The literal is only a local-development fallback
# kept for backward compatibility; drop it once IRONHACK_DB_URL is set wherever
# this notebook runs.
engine = create_engine(
    os.environ.get(
        'IRONHACK_DB_URL',
        'postgresql+psycopg2://ironhack:123456@localhost:5432/ironhack',
    )
)

# Multiple Linear Regression

## Simulando Dados

In [None]:
def simular_dado_mv(parametros_x1, parametros_x2, desvpad_E, samples):
    """Simulate a two-predictor linear data set with Gaussian noise.

    Each ``parametros_*`` argument is read by position as
    ``(mean, standard deviation, coefficient)``: the first two entries
    parameterise the normal distribution the predictor is drawn from and the
    third is the predictor's coefficient in ``y``. No intercept is added.

    Parameters
    ----------
    parametros_x1, parametros_x2 : sequence
        ``(loc, scale, coefficient)`` for ``x1`` and ``x2`` respectively.
    desvpad_E : float
        Standard deviation of the zero-mean normal error term.
    samples : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1``, ``x2`` and ``y``.

    Notes
    -----
    Draws come from NumPy's global random state, in the order x1, x2, error.
    """
    colunas = {}
    # Predictors are drawn first, in declaration order, to keep the RNG stream stable.
    for nome, params in (('x1', parametros_x1), ('x2', parametros_x2)):
        colunas[nome] = np.random.normal(params[0], params[1], samples)

    ruido = np.random.normal(0, desvpad_E, samples)
    colunas['y'] = (
        parametros_x1[2] * colunas['x1']
        + parametros_x2[2] * colunas['x2']
        + ruido
    )
    return pd.DataFrame(colunas)

In [None]:
simular_dado_mv((2, 5, 10), (10, 8, 5), 12, 100)

In [None]:
teste_mv = simular_dado_mv((0, 1, 10), (0, 1, 5), 2, 100)
sns.pairplot(teste_mv)

In [None]:
# statsmodels fit: add_constant supplies the intercept column, which sm.OLS
# does not add on its own.
X = sm.add_constant(teste_mv[['x1', 'x2']])
Y = teste_mv['y']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# Same regression with scikit-learn. Note that `modelo`, `X` and `Y` are rebound
# here: from this cell on `modelo` is the sklearn estimator, not the OLS model.
modelo = LinearRegression()
X = teste_mv[['x1', 'x2']]
Y = teste_mv['y']
modelo.fit(X, Y)
print(modelo.coef_)
print(modelo.intercept_)

In [None]:
# Store the sklearn in-sample predictions next to the simulated data.
teste_mv['pred'] = modelo.predict(teste_mv[['x1', 'x2']])

In [None]:
# Pairplot again, now including the `pred` column.
sns.pairplot(teste_mv)

In [None]:
import plotly.express as px
px.scatter_3d(teste_mv, x='x1', y='x2', z='y')

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Mesh3d(x=teste_mv['x1'], y=teste_mv['x2'], z=teste_mv['pred'], color='lightpink', opacity=0.50)])
fig.show()

In [None]:
def simular_dado_mv_cat(parametros_x1, categorias_dict, desvpad_E, samples):
    """Simulate a linear data set with one numeric and one categorical predictor.

    Parameters
    ----------
    parametros_x1 : sequence
        Read by position as ``(loc, scale, coefficient)`` for the normally
        distributed predictor ``x1``.
    categorias_dict : dict
        Maps each category label to the additive effect it has on ``y``.
    desvpad_E : float
        Standard deviation of the zero-mean normal error term.
    samples : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1``, ``categoria`` (the sampled label) and ``y``.

    Notes
    -----
    ``x1`` and the error use NumPy's global random state; the labels are drawn
    with the stdlib ``random.choices`` (uniform weights, with replacement).
    """
    x1 = np.random.normal(parametros_x1[0], parametros_x1[1], samples)

    rotulos = list(categorias_dict)
    cat = random.choices(rotulos, k=samples)
    # Per-row additive effect looked up from the sampled label.
    eff_cat = [categorias_dict[rotulo] for rotulo in cat]

    ruido = np.random.normal(0, desvpad_E, samples)
    y = parametros_x1[2] * x1 + eff_cat + ruido
    return pd.DataFrame({'x1': x1, 'categoria': cat, 'y': y})

In [None]:
# 100 rows: x1 ~ N(0, 1) with coefficient 5, category effects A=1, B=5, C=10,
# error sd 1.
teste_mv_cat = simular_dado_mv_cat((0, 1, 5), {'A' : 1, 'B' : 5, 'C' : 10}, 1, 100)

In [None]:
# Distribution of y per category (trailing ';' hides the Axes repr).
sns.boxplot(data = teste_mv_cat, x = 'categoria', y = 'y');

In [None]:
# y against x1, coloured by category.
sns.scatterplot(data = teste_mv_cat, x = 'x1', y = 'y', hue = 'categoria');

In [None]:
pd.get_dummies(teste_mv_cat['categoria'])

In [None]:
teste_mv_cat.join(pd.get_dummies(teste_mv_cat['categoria']))

In [None]:
teste_mv_cat = teste_mv_cat.join(pd.get_dummies(teste_mv_cat['categoria']))

In [None]:
# Intercept plus all three category dummies: A + B + C equals the constant
# column in every row, so this design matrix is perfectly collinear and the
# individual coefficients are not uniquely identified (dummy-variable trap).
X = sm.add_constant(teste_mv_cat[['x1', 'A' , 'B', 'C']])
Y = teste_mv_cat['y']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# Same model with dummy A left out: A becomes the reference level absorbed by
# the intercept, and the B and C coefficients are offsets relative to A.
X = sm.add_constant(teste_mv_cat[['x1', 'B', 'C']])
Y = teste_mv_cat['y']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# predict() without arguments gives the fitted values for the rows used in the
# fit above (the model without dummy A).
teste_mv_cat['pred'] = lm_fit.predict()

## Dados Reais

In [None]:
# Path is relative to the directory the notebook kernel runs in.
tb_insu = pd.read_csv('data/tb_insurance.csv')

In [None]:
# Column dtypes and non-null counts.
tb_insu.info()

In [None]:
# Pairwise relationships between the numeric columns only.
sns.pairplot(tb_insu.select_dtypes(include = 'number'))

In [None]:
# expenses broken down by each of the categorical columns sex, smoker, region.
fig, ax = plt.subplots(1,3, figsize = (12,4))
sns.boxplot(data = tb_insu, x = 'sex', y = 'expenses', ax=ax[0])
sns.boxplot(data = tb_insu, x = 'smoker', y = 'expenses', ax=ax[1])
sns.boxplot(data = tb_insu, x = 'region', y = 'expenses', ax=ax[2])

In [None]:
# expenses against bmi, coloured by smoker status.
sns.scatterplot(data = tb_insu, x = 'bmi', y = 'expenses', hue = 'smoker')

In [None]:
# Same plot restricted to rows where smoker == 'yes'.
sns.scatterplot(data = tb_insu[tb_insu['smoker']=='yes'], x = 'bmi', y = 'expenses')

In [None]:
# Lowest bmi among smokers whose expenses exceed 45000. Series.min() skips NaN
# and yields NaN for an empty selection, whereas the builtin min() raises on an
# empty selection and gives order-dependent results when NaN is present.
tb_insu.loc[(tb_insu['smoker'] == 'yes') & (tb_insu['expenses'] > 45000), 'bmi'].min()

In [None]:
tb_insu['obese'] = np.where(tb_insu['bmi'] >= 30, 'yes', 'no')

In [None]:
tb_insu = tb_insu.join(pd.get_dummies(tb_insu['obese'], prefix = 'obese'))
tb_insu = tb_insu.join(pd.get_dummies(tb_insu['smoker'], prefix = 'smoker'))

In [None]:
# Inspect the frame with the new obese_* / smoker_* columns.
tb_insu

In [None]:
# Interaction term: non-zero only for rows that are both obese and smoker.
tb_insu['obese_smoker'] = tb_insu['obese_yes'] * tb_insu['smoker_yes']

In [None]:
# expenses ~ const + obese_smoker + age + smoker_yes
X = sm.add_constant(tb_insu[['obese_smoker', 'age', 'smoker_yes']])
Y = tb_insu['expenses']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# Fitted values of the model above, compared against the observed columns.
tb_insu['lm_pred'] = lm_fit.predict()
sns.pairplot(tb_insu[['expenses', 'lm_pred', 'age', 'bmi']])

In [None]:
# Linear fit of expenses on bmi for smokers with bmi <= 30 only.
sns.lmplot(data = tb_insu[(tb_insu['smoker']=='yes') & (tb_insu['bmi'] <= 30)], x = 'bmi', y = 'expenses')

In [None]:
# bmi slope that applies to smokers only (zero for non-smokers).
tb_insu['bmi_smoker'] = tb_insu['bmi'] * tb_insu['smoker_yes']

In [None]:
tb_insu

In [None]:
# Previous model extended with the bmi x smoker interaction.
X = sm.add_constant(tb_insu[['obese_smoker', 'age', 'bmi_smoker', 'smoker_yes']])
Y = tb_insu['expenses']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# Mean-centred versions of age and bmi, plus the smoker-only centred bmi term.
tb_insu['age_disp'] = tb_insu['age'].sub(tb_insu['age'].mean())
tb_insu['bmi_disp'] = tb_insu['bmi'].sub(tb_insu['bmi'].mean())
tb_insu['bmi_disp_smoker'] = tb_insu['bmi_disp'].mul(tb_insu['smoker_yes'])

In [None]:
# Same specification as the previous model but with the mean-centred age and
# bmi terms, so the constant refers to a row at mean age rather than age 0.
X = sm.add_constant(tb_insu[['obese_smoker', 'age_disp', 'bmi_disp_smoker', 'smoker_yes']])
Y = tb_insu['expenses']
modelo = sm.OLS(Y, X)
lm_fit = modelo.fit()
lm_fit.summary()

In [None]:
# Overwrites lm_pred with the fitted values of the centred model.
tb_insu['lm_pred'] = lm_fit.predict()
sns.pairplot(tb_insu[['expenses', 'lm_pred', 'bmi']], kind="kde")

In [None]:
# Predicted vs. observed expenses on a square canvas with identical axis ranges.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.axis('equal')
ax.set_xlim(0, 60000)
ax.set_ylim(0, 60000)

# Draw on the axes configured above explicitly rather than relying on pyplot's
# implicit "current axes" state.
sns.scatterplot(data=tb_insu, x='lm_pred', y='expenses', ax=ax);

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# `modelo` and `X` are rebound again: sklearn estimator and the raw
# (uncentred) predictors. The scaler is fitted on those same columns.
modelo = LinearRegression()
X = tb_insu[['obese_smoker', 'age', 'bmi_smoker', 'smoker_yes']]
scaler = StandardScaler()
scaler.fit(X)

In [None]:
# Fit on the standardised predictors; the target is left on its original scale.
Y = tb_insu['expenses']
modelo.fit(scaler.transform(X), Y)

In [None]:
# One bar per predictor. Presumably the point of scaling is to make these
# coefficient magnitudes comparable across predictors — confirm intent.
sns.barplot(x = X.columns, y = modelo.coef_)

In [None]:
modelo.intercept_