In [1]:
import warnings
warnings.filterwarnings('ignore')

## train y test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
import numpy as np

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [12]:
X_test.shape

(25, 1)

In [13]:
X_train.shape

(75, 1)

## Eligiendo un modelo

In [17]:
# Desde scikit learn

#Regresión Lineal
from sklearn.linear_model import LinearRegression 
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:

#Regresor con gradiente descendiente
from sklearn.linear_model import SGDRegressor 
sgd_reg = SGDRegressor()
sgd_reg.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [19]:

#Arboles de decisión
from sklearn.tree import DecisionTreeRegressor 
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [21]:
# Calculamos el rmse en el conjunto de entrenamiento 
lin_reg_predict = lin_reg.predict(X_train)
sgd_reg_predict = sgd_reg.predict(X_train)
tree_reg_predict = tree_reg.predict(X_train)

In [22]:

from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_train, lin_reg_predict)
sgd_mse = mean_squared_error(y_train, sgd_reg_predict)
tree_mse = mean_squared_error(y_train, tree_reg_predict)

In [23]:
print("RMSE Entrenamiento: ", np.sqrt(lin_mse), np.sqrt(sgd_mse), np.sqrt(tree_mse))

RMSE Entrenamiento:  0.9055354866900556 1.2172753923097428 0.0


In [24]:
# Calculamos el rmse en el conjunto de test 
lin_reg_predict = lin_reg.predict(X_test)
sgd_reg_predict = sgd_reg.predict(X_test)
tree_reg_predict = tree_reg.predict(X_test)

In [25]:
lin_mse = mean_squared_error(y_test, lin_reg_predict)
sgd_mse = mean_squared_error(y_test, sgd_reg_predict)
tree_mse = mean_squared_error(y_test, tree_reg_predict)

In [26]:
print("RMSE Test: ", np.sqrt(lin_mse), np.sqrt(sgd_mse), np.sqrt(tree_mse))

RMSE Test:  0.9937066350712891 1.2509114374317918 1.7123884454711797


## Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score
# Indicamos que queremos hacer cross validation con 10 cortes. 
# Es decir vamos a iterar 10 veces y obtener los scores de cada iteración
scores = cross_val_score(tree_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)

In [28]:
rmse_scores = np.sqrt(-scores)

In [29]:
print("Scores: ", rmse_scores)
print("Promedio: ", rmse_scores.mean())
print("Desvío estandar: ", rmse_scores.std())

Scores:  [1.27073993 0.90063171 1.51230917 1.68431279 0.66295806 1.76176164
 2.03883428 1.60925294 1.69985363 0.87933068]
Promedio:  1.4019984857445693
Desvío estandar:  0.42970816892011793


In [30]:
scores = cross_val_score(lin_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)


In [31]:
print("Scores: ", rmse_scores)
print("Promedio: ", rmse_scores.mean())
print("Desvío estandar: ", rmse_scores.std())

Scores:  [0.93774523 0.85987809 0.88793725 0.95549012 0.88408271 1.09991267
 1.16941979 0.85266981 0.81658157 0.80376022]
Promedio:  0.9267477461166422
Desvío estandar:  0.11418014132299167
