# 1 Crea almenys dos models de regressió diferents per intentar predir el millor possible el preu de les vivendes (MEDV) de l'arxiu adjunt.
# 2 Compara’ls en base al MSE i al R2.

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

data=pd.read_csv('housing data.csv')
feature_names = ('CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV')
data.columns=feature_names
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
4,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
501,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
502,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
503,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [5]:
#scaling the predictors
scaler=StandardScaler()
df= (scaler.fit_transform(data))
df=pd.DataFrame(df, columns=feature_names)

#Defining X and y
y = df['MEDV']
X = df.drop(['MEDV'], axis=1)

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((353, 13), (152, 13))

In [6]:
#MODEL A

# Creación del modelo de regresión lineal
model_lr = LinearRegression()

# Entrenamiento del modelo 
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"MSE del modelo de regresión lineal sin validación cruzada: {mse_lr}")
print(f"R2 del modelo de regresión lineal sin validación cruzada: {r2_lr}")

MSE del modelo de regresión lineal sin validación cruzada: 0.25278983728383014
R2 del modelo de regresión lineal sin validación cruzada: 0.728350264880846


In [7]:
#MODEL B

from sklearn import linear_model

Xb = X_train
yb = y_train

lm = linear_model.LinearRegression()
modelb = lm.fit(Xb,yb)

predictionsB=lm.predict(X_test)
print ('R2 value of model B is', lm.score(Xb,yb))
print ('MSE for model B is', mean_squared_error(y_test, predictionsB))

R2 value of model B is 0.7413624898352481
MSE for model B is 0.25278983728383014


Summary:

Model A -> R2=0.642 ; MSE=0.367
Model B -> R2=0.741 ; MSE=0.252

# 3 Entrena’ls utilitzant els diferents paràmetres que admeten per intentar millorar-ne la predicció.

The model can be further improved by doing cross-validation, features analysis, and feature engineering and, of course, by trying out more advanced machine learning algorithms such as Tree Family of Algorithms (Decision Tree and Random Forest) or Optimization Algorithms (Support Vector Machines and Neural Networks). 

In [8]:
#Building, Predicting, and Evaluating the Neural Network Model

from sklearn.tree import DecisionTreeRegressor

modeldtr = DecisionTreeRegressor()
modeldtr.fit(X_train, y_train)

predictionsdtr=modeldtr.predict(X_test)
print ('R2 value of the Decision Tree Regressor is', modeldtr.score(X_train,y_train))
print ('MSE for the Decision Tree Regressor is', mean_squared_error(y_test, predictionsdtr))


R2 value of the Decision Tree Regressor is 1.0
MSE for the Decision Tree Regressor is 0.21302399705925165


In [9]:
# Búsqueda de hiperparámetros utilizando GridSearchCV con validación cruzada para Modelo A

from sklearn.model_selection import GridSearchCV

params_lr = {'normalize': [True, False]}
grid_lr = GridSearchCV(model_lr, params_lr, cv=5)
grid_lr.fit(X_train, y_train)

y_pred_lr_grid = grid_lr.predict(X_test)
mse_lr_grid = mean_squared_error(y_test, y_pred_lr_grid)
r2_lr_grid = r2_score(y_test, y_pred_lr_grid)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': True}
MSE del modelo de regresión lineal con validación cruzada: 0.2527898372838298
R2 del modelo de regresión lineal con validación cruzada: 0.7283502648808463


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

In [25]:
# Búsqueda de hiperparámetros utilizando GridSearchCV con validación cruzada para Modelo B


params_modelb = {'normalize': [True, False]}
grid_modelb = GridSearchCV(modelb, params_modelb, cv=5, scoring='neg_mean_squared_error')
grid_modelb.fit(X_train, y_train)

ey_pred_modelb = grid_modelb.predict(X_test)
mse_modelb = mean_squared_error(y_test, y_pred_modelb)
r2_modelb = r2_score(y_test, y_pred_modelb)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor MSE con GridSearchCV: 0.28984185869612866
Mejores parámetros con GridSearchCV: {'normalize': True}
Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': True}
MSE del modelo de regresión lineal con validación cruzada: 0.2527898372838298
R2 del modelo de regresión lineal con validación cruzada: 0.7283502648808463


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

# 4 Compara el seu rendiment emprant l’aproximació traint/test o emprant totes les dades (validació interna).

In [21]:
# Modelo A (Grid_lr)

df1=pd.DataFrame(data, columns=feature_names)
y1 = df1['MEDV']
X1 = df1.drop(['MEDV'], axis=1)

params_lr = {'normalize': [True, False]}
grid_lr = GridSearchCV(model_lr, params_lr, cv=5)
grid_lr.fit(X1, y1)

y_pred_lr_grid = grid_lr.predict(X1)
mse_lr_grid = mean_squared_error(y1, y_pred_lr_grid)
r2_lr_grid = r2_score(y1, y_pred_lr_grid)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': False}
MSE del modelo de regresión lineal con validación cruzada: 21.865579933351402
R2 del modelo de regresión lineal con validación cruzada: 0.7414879883947041


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

In [13]:
# Modelo B (grid_modelb)

params_modelb = {'normalize': [True, False]}
grid_modelb = GridSearchCV(modelb, params_modelb, cv=5, scoring='neg_mean_squared_error')
grid_modelb.fit(X1, y1)

y_pred_modelb = grid_modelb.predict(X1)
mse_modelb = mean_squared_error(y1, y_pred_modelb)
r2_modelb = r2_score(y1, y_pred_modelb)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': False}
MSE del modelo de regresión lineal con validación cruzada: 21.865579933351402
R2 del modelo de regresión lineal con validación cruzada: 0.7414879883947041


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

En la comparacion ambos modelos tienen le mismo MSE y mismo R2

# 5 No facis servir la variable del nombre d'habitacions (RM) a l’hora de fer prediccions.

In [22]:
X1.drop(columns=['RM'], inplace=True)

In [23]:
#Model A
grid_lr.fit(X1, y1)

y_pred_lr_grid = grid_lr.predict(X1)
mse_lr_grid = mean_squared_error(y1, y_pred_lr_grid)
r2_lr_grid = r2_score(y1, y_pred_lr_grid)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': False}
MSE del modelo de regresión lineal con validación cruzada: 25.537843278435854
R2 del modelo de regresión lineal con validación cruzada: 0.6980716149266408


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

In [20]:
#Model B
grid_modelb.fit(X1, y1)

y_pred_modelb = grid_modelb.predict(X1)
mse_modelb = mean_squared_error(y1, y_pred_modelb)
r2_modelb = r2_score(y1, y_pred_modelb)

print(f"Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {grid_lr.best_params_}")
print(f"MSE del modelo de regresión lineal con validación cruzada: {mse_lr_grid}")
print(f"R2 del modelo de regresión lineal con validación cruzada: {r2_lr_grid}")

Mejor valor para el hiperparámetro 'normalize' en el modelo de regresión lineal: {'normalize': False}
MSE del modelo de regresión lineal con validación cruzada: 25.537843278435854
R2 del modelo de regresión lineal con validación cruzada: 0.6980716149266408


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi