In [92]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

import seaborn as sns
import warnings

# seaborn-like plot styling; matplotlib 3.6 renamed the bundled 'seaborn' style to
# 'seaborn-v0_8', so use whichever name this installation provides
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["figure.figsize"] = (8, 6) # default figure size
plt.rcParams["figure.dpi"] = 70 # default figure resolution
# NOTE(review): this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

### 1. Preparando ambiente de trabajo

In [93]:
# machine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [94]:
# read the dataset and discard the 'Unnamed: 0' column
df = pd.read_csv('boston.csv').drop(columns=['Unnamed: 0'])

In [95]:
# descriptive statistics for every column of the frame
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### 2. Dividir la muestra

In [96]:
# target vector: the 'medv' column
var_obj = df['medv']

In [97]:
# attribute matrix: every column except the target
mat_attr = df.drop(columns='medv')

In [98]:
# split into train and test sets (33% held out for testing, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    mat_attr,
    var_obj,
    test_size=0.33,
    random_state=11238,
)

### 3. Generación de modelos

In [99]:
# model without an intercept term
model_1 = LinearRegression(fit_intercept=False)

In [100]:
# model with an intercept term
model_2 = LinearRegression(fit_intercept=True)

In [101]:
# fit both estimators on the training split (fit() trains the estimator in place)
for estimator_ in (model_1, model_2):
    estimator_.fit(X_train, y_train)

In [102]:
# predict on the held-out test split with each fitted model
y_hat_1, y_hat_2 = (fitted.predict(X_test) for fitted in (model_1, model_2))

### 4. Obtención de métricas

In [103]:
def report_scores(y_predict, y_validate):
    """Print the mean squared error and R2 of a set of predictions.

    Parameters
    ----------
    y_predict : array-like
        Values predicted by the model.
    y_validate : array-like
        Observed values the predictions are compared against.

    Returns
    -------
    None
        Both metrics are printed to stdout; R2 is rounded to two decimals.
    """
    # both metric functions are called with the observed values first
    mse = mean_squared_error(y_validate, y_predict)
    # built-in round() works whether r2_score returns a numpy scalar or a plain
    # Python float; the `.round` method exists only on the numpy scalar
    r2 = round(float(r2_score(y_validate, y_predict)), 2)
    print(f'Error cuadrático medio: {mse}')
    print(f'R2: {r2}')

In [104]:
# map each model label to its test-set predictions
dict_ = {}
dict_['Modelo 1 (sin intercepto)'] = y_hat_1
dict_['Modelo 2 (con intercepto)'] = y_hat_2

In [106]:
# print the label of each model followed by its metrics on the test split
for model_ in dict_:
    y_predict = dict_[model_]
    print(model_)
    report_scores(y_predict, y_test)

Modelo 1 (sin intercepto)
Error cuadrático medio: 34.26939996145284
R2: 0.55
Modelo 2 (con intercepto)
Error cuadrático medio: 30.697831517740802
R2: 0.6


De acuerdo a los resultados anteriores, seleccionamos `model_2`, el modelo con intercepto. Su error cuadrático medio en el conjunto de prueba es menor (30.70 frente a 34.27), lo que implica un mejor ajuste, y además la varianza explicada por los regresores es mayor (R2 de 0.60 frente a 0.55) que en el modelo sin intercepto.

### 5. Refactorización del modelo

In [109]:
# print the correlation between each attribute and the target variable
def fetch_features(df, v_obj='medv'):
    """Print the correlation of every attribute with the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the attributes and the target column.
    v_obj : str, default 'medv'
        Name of the target column; it is skipped while iterating.

    Returns
    -------
    None
        One line per attribute is printed, with the correlation rounded to
        two decimals.
    """
    target = df[v_obj]  # looked up once: the target does not change inside the loop
    for col in df.columns:
        if col == v_obj:
            continue
        # built-in round() does not depend on the result being a numpy scalar
        corr_ = round(float(df[col].corr(target)), 2)
        print(f'Correlación {col}: {corr_}')

In [110]:
fetch_features(df, v_obj='medv')

Correlación crim: -0.39
Correlación zn: 0.36
Correlación indus: -0.48
Correlación chas: 0.18
Correlación nox: -0.43
Correlación rm: 0.7
Correlación age: -0.38
Correlación dis: 0.25
Correlación rad: -0.38
Correlación tax: -0.47
Correlación ptratio: -0.51
Correlación black: 0.33
Correlación lstat: -0.74
