# Tarea métricas de regresión

**Desarrollado por:** Orlando Patricio Chacón Molina

**Fecha de creación:** 2022-02-04

**Fecha de actualización:** 2022-02-06

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import math
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# Read the data: load the CSV file into a DataFrame and preview the first rows.
filename = 'data/modifiedBostonHousing.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

(506, 14)

In [4]:
# Keep only the three predictor columns and the target column (price).
df = df[['RM', 'LSTAT', 'PTRATIO', 'price']]
df.head()

Unnamed: 0,RM,LSTAT,PTRATIO,price
0,6.575,4.98,15.3,24.0
1,6.421,9.14,17.8,21.6
2,7.185,4.03,17.8,34.7
3,6.998,2.94,18.7,33.4
4,7.147,5.33,18.7,36.2


In [5]:
# Dimensions after the column selection as (rows, columns).
df.shape

(506, 4)

In [6]:
# Count missing values per column; pandas' .isnull() is an alias of .isna().
df.isnull().sum()

RM         0
LSTAT      0
PTRATIO    0
price      1
dtype: int64

In [7]:
# Discard every row that contains at least one missing value
# (dropna's default how='any'), then re-count missing values per column.
df = df.dropna(axis=0)
df.isna().sum()

RM         0
LSTAT      0
PTRATIO    0
price      0
dtype: int64

In [8]:
# Pairwise correlation matrix of the selected columns.
df.corr()

Unnamed: 0,RM,LSTAT,PTRATIO,price
RM,1.0,-0.614889,-0.356013,0.143347
LSTAT,-0.614889,1.0,0.372996,-0.027292
PTRATIO,-0.356013,0.372996,1.0,-0.187262
price,0.143347,-0.027292,-0.187262,1.0


In [9]:
# Same correlation matrix, with rows ordered by their correlation with price
# (ascending, the sort_values default).
df.corr().sort_values(by='price')

Unnamed: 0,RM,LSTAT,PTRATIO,price
PTRATIO,-0.356013,0.372996,1.0,-0.187262
LSTAT,-0.614889,1.0,0.372996,-0.027292
RM,1.0,-0.614889,-0.356013,0.143347
price,0.143347,-0.027292,-0.187262,1.0


In [10]:
# Flag rows whose price is negative and keep only the rows that are not flagged.
filtroPrecioNegativo = df['price'].lt(0)
df = df[~filtroPrecioNegativo]
df.head()

Unnamed: 0,RM,LSTAT,PTRATIO,price
0,6.575,4.98,15.3,24.0
1,6.421,9.14,17.8,21.6
2,7.185,4.03,17.8,34.7
3,6.998,2.94,18.7,33.4
4,7.147,5.33,18.7,36.2


In [11]:
# Feature matrix as a NumPy array, one column per predictor.
X = df[['RM', 'LSTAT', 'PTRATIO']].to_numpy()
X.shape

(404, 3)

In [12]:
# Target vector (price) as a one-dimensional NumPy array.
y = df['price'].to_numpy()
y.shape

(404,)

In [13]:
# Create a linear regression with an intercept term and fit it on X, y.
# fit() returns the estimator itself, so it can be chained and then displayed.
reg = LinearRegression(fit_intercept=True).fit(X, y)
reg

LinearRegression()

In [14]:
# Build an auxiliary table holding the per-sample terms needed by the manual
# metric calculations (R^2, MSE/RMSE and MAE).
y2 = pd.DataFrame(y, columns=['PRICE'])

# Mean of the observed prices; used for the deviations from the mean (E2).
mean = y2['PRICE'].mean()
y2['PREDICTED'] = reg.predict(X)

# E1: residual (observed - predicted); E1_2: squared residual.
# E1_2 is used both for R^2 and for the MSE.
# Vectorized arithmetic replaces the element-by-element .apply(lambda ...) calls.
y2['E1'] = y2['PRICE'] - y2['PREDICTED']
y2['E1_2'] = y2['E1'] ** 2

# E2: deviation of each observed price from the mean; E2_2: squared deviation.
y2['E2'] = y2['PRICE'] - mean
y2['E2_2'] = y2['E2'] ** 2

# E1_abs: absolute residual, used for the MAE.
y2['E1_abs'] = y2['E1'].abs()

y2.head()


Unnamed: 0,PRICE,PREDICTED,E1,E1_2,E2,E2_2,E1_abs
0,24.0,30.936923,-6.936923,48.120896,0.991337,0.982748,6.936923
1,21.6,25.547329,-3.947329,15.581407,-1.408663,1.984332,3.947329
2,34.7,32.878409,1.821591,3.318193,11.691337,136.687352,1.821591
3,33.4,31.221269,2.178731,4.746869,10.391337,107.979877,2.178731
4,36.2,31.141585,5.058415,25.58756,13.191337,174.011362,5.058415


## Calcular manualmente las métricas de regresión: $R^2$, $MAE$, $MSE$ y $RMSE$ 
En esta sección se calculan manualmente las métricas de regresión y se comparan con los valores obtenidos mediante las funciones de scikit-learn.

### Obtener $R ^ 2$ aplicando la fórmula: $R ^ 2 = 1- \frac{MSE}{Var(y)} = 1 - \frac{\sum_{i=1}^{n} {(y_i - \hat{y}_i)}^2}{\sum_{i=1}^{n} {(y_i - \bar{y})}^2}$

In [15]:
# Apply the R^2 formula: one minus the ratio between the sum of squared
# residuals and the sum of squared deviations from the mean.
suma_residuos_2 = y2['E1_2'].sum()
suma_desviaciones_2 = y2['E2_2'].sum()
R_2 = 1 - (suma_residuos_2 / suma_desviaciones_2)
R_2

0.7848414273030437

In [16]:
# Compute R^2 with the estimator's built-in score method to compare it
# against the value obtained from the formula.
reg.score(X,y)

0.7848414273030437

Los resultados obtenidos mediante la fórmula y la librería coinciden:

**0.7848414273030437**

### Obtener $MAE$ aplicando la fórmula: $MAE = \frac{1}{n}\displaystyle\sum_{i=1}^{n} |y_i - \hat{y}_i|$

In [17]:
# MAE: sum of the absolute residuals divided by the number of samples.
errores_abs = y2['E1_abs']
MAE = errores_abs.sum() / errores_abs.count()
MAE

3.2313558816689705

In [18]:
# Library value of the MAE, for comparison with the manual calculation.
mean_absolute_error(y_true=y, y_pred=reg.predict(X))

3.2313558816689705

Los resultados obtenidos mediante la fórmula y la librería coinciden:

**3.2313558816689705**

### Obtener $MSE$ aplicando la fórmula: $MSE = \frac{1}{n}\displaystyle\sum_{i=1}^{n} {(y_i - \hat{y}_i)}^2$

In [19]:
# MSE: sum of the squared residuals divided by the number of samples.
errores_2 = y2['E1_2']
MSE = errores_2.sum() / errores_2.count()
MSE

17.9154872752605

In [20]:
# Library value of the MSE, for comparison with the manual calculation.
mean_squared_error(y_true=y, y_pred=reg.predict(X))

17.9154872752605

### Obtener $RMSE$ aplicando la fórmula: $RMSE = \sqrt{\frac{1}{n}\displaystyle\sum_{i=1}^{n} {(y_i - \hat{y}_i)}^2}$

In [21]:
# RMSE is the square root of the MSE value held in the MSE variable.
RMSE = math.sqrt(MSE)
RMSE

4.232669048633557

In [26]:
# Library value of the RMSE, for comparison with the manual calculation.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both older and newer versions.
np.sqrt(mean_squared_error(y, reg.predict(X)))

4.232669048633557