In [61]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

plt.style.use('seaborn') # gráficos estilo seaborn
plt.rcParams["figure.figsize"] = (8, 6) # Tamaño gráficos
plt.rcParams["figure.dpi"] = 70 # resolución gráficos
warnings.filterwarnings('ignore')

### 1. Preparando ambiente de trabajo

In [62]:
# machine learning libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [63]:
df = pd.read_csv('boston.csv').drop('Unnamed: 0', axis=1)

In [64]:
# medidas descriptivas
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### 2. Dividir la muestra

In [65]:
# vector objetivo
var_obj = df.loc[:, 'medv']

In [66]:
# matriz con atributos
mat_attr = df.drop('medv', axis=1)

In [67]:
# separando el conjunto
X_train, X_test, y_train, y_test = train_test_split(mat_attr, var_obj, test_size=.33, random_state=11238)

### 3. Generación de modelos

In [68]:
# modelo sin intercepto
model_1 = LinearRegression(fit_intercept=False)

In [69]:
# modelo con intercepto
model_2 = LinearRegression(fit_intercept=True)

In [70]:
# haciendo fit a los datos
model_1 = model_1.fit(X_train, y_train)
model_2 = model_2.fit(X_train, y_train)

In [71]:
# realizando predicciones
y_hat_1 = model_1.predict(X_test)
y_hat_2 = model_2.predict(X_test)

### 4. Obtención de métricas

In [72]:
def report_scores(y_predict, y_validate):
    mse = mean_squared_error(y_validate, y_predict)
    r2 = r2_score(y_validate, y_predict).round(2)
    print(f'Error cuadrático medio: {mse}')
    print(f'R2: {r2}')

In [73]:
dict_ = {
    'Modelo 1 (sin intercepto)': y_hat_1,
    'Modelo 2 (con intercepto)': y_hat_2
}

In [74]:
for model_, y_predict in dict_.items():
    print(model_)
    report_scores(y_predict, y_test)

Modelo 1 (sin intercepto)
Error cuadrático medio: 34.26939996145284
R2: 0.55
Modelo 2 (con intercepto)
Error cuadrático medio: 30.697831517740802
R2: 0.6


De acuerdo a los resultados anteriores, seleccionamos el modelo_2 que posee intercepto. Esto se debe a que su error cuadrático medio es menor, lo que implica un mejor ajuste, y además, la varianza explicada por los regresores es mayor que en el modelo sin intercepto.

### 5. Refactorización del modelo

In [75]:
# función que calcula correlaciones entre atributos y variable objetivo
def fetch_features(df, v_obj='medv'):
    columnas, corr_, abs_corr = [], [], []
    for col in df.columns:
        if col != v_obj:
            columnas.append(col)
            corr_.append(df[col].corr(df[v_obj]).round(2))
            abs_corr.append(abs(df[col].corr(df[v_obj]).round(2)))
    
    df_corr = pd.DataFrame({
        'attributes': columnas,
        'pearson_r': corr_,
        'abs_pearson_r': abs_corr})
    return df_corr

In [76]:
df_corr = fetch_features(df, v_obj='medv')
df_corr

Unnamed: 0,attributes,pearson_r,abs_pearson_r
0,crim,-0.39,0.39
1,zn,0.36,0.36
2,indus,-0.48,0.48
3,chas,0.18,0.18
4,nox,-0.43,0.43
5,rm,0.7,0.7
6,age,-0.38,0.38
7,dis,0.25,0.25
8,rad,-0.38,0.38
9,tax,-0.47,0.47


In [77]:
# top 6 correlaciones
top_6 = df_corr.sort_values('abs_pearson_r', ascending=False).reset_index(drop=True)[['attributes', 'abs_pearson_r']][:6]
top_6

Unnamed: 0,attributes,abs_pearson_r
0,lstat,0.74
1,rm,0.7
2,ptratio,0.51
3,indus,0.48
4,tax,0.47
5,nox,0.43


### 6. Refactorización modelo predictivo

In [78]:
# seleccion atributos con mayor correlación (top 6)
mat_attr_refact = mat_attr.loc[:, ['lstat', 'rm', 'ptratio', 'indus', 'tax', 'nox']]

In [79]:
# separamos conjuntos entrenamiento y validacion
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(mat_attr_refact, var_obj, test_size=.33, random_state=1129)

In [80]:
# ajustamos modelo
modelo_3 = LinearRegression(fit_intercept=True)
modelo_3 = modelo_3.fit(X_train_3, y_train_3)

In [81]:
# predecimos
y_hat_3 = modelo_3.predict(X_test_3)

In [82]:
# obtenemos métricas
report_scores(y_hat_3, y_test_3)

Error cuadrático medio: 24.0370551991645
R2: 0.71


A partir del resultado anterior podemos decir que el error cuadrático medio disminuyó considerablemente agregando solo las variables de mayor correlación. Además, aumentó la capacidad explicativa de la varianza del valor medio de las casas.

### 7. Predicción de casos

In [83]:
# peor escenario
worst_neighbor = np.array([37.9, 12.6, 3.5, 27.7, 187, 0.87]).reshape(1,-1)

In [84]:
# mejor escenario
best_neighbor = np.array([1.73, 22, 8.7, 0.46, 711, 0.38]).reshape(1,-1)

In [85]:
# prediccion peor escenario
worst_value = modelo_3.predict(worst_neighbor)
worst_value

array([47.10622713])

In [86]:
# prediccion mejor escenario
best_value = modelo_3.predict(best_neighbor)
best_value

array([99.50398532])