# Gradient Descent Models

We will predict the price (`price` column) of an AirBNB dataset used last week.

**Therefore, our unit of analysis is an AIRBNB LISTING**

## 1. Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor

np.random.seed(1)

# 2. Load the data

We will use the AirBNB data that we cleaned in last class (the original, not the one that you altered for last weeks exercise).

In [2]:
X_train = pd.read_csv("./data/airbnb_train_X_price.csv")
X_test = pd.read_csv("./data/airbnb_test_X_price.csv")
y_train = pd.read_csv("./data/airbnb_train_y_price.csv")
y_test = pd.read_csv("./data/airbnb_test_y_price.csv")

## 3. Model the data

First, we will create a dataframe to hold all the results of our models.

In [3]:
results = pd.DataFrame(y_train, columns=["actual"])

rmses = pd.DataFrame({"model": [], "rmse": []})

### 3.1 Fit and test the baseline Model

In [4]:
dummy_reg = DummyRegressor(strategy="mean")
_ = dummy_reg.fit(X_train, y_train)

results["dummy"] = dummy_reg.predict(X_train)

In [5]:
#Baseline Test RMSE
dummy_test_pred = dummy_reg.predict(X_test)
dummy_test_rmse = np.sqrt(mean_squared_error(y_test, dummy_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"baseline", 'rmse': dummy_test_rmse}, index=[0])])

print(f"Baseline Test RMSE: {dummy_test_rmse:.3f}")


Baseline Test RMSE: 102.625


### 3.2  Fit and test a Linear Regression Using Stochastic Gradient Descent

In [6]:
#Stochastic Gradient Descent:
# https://scikit-learn.org/stable/modules/sgd.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor


# eta0 = learning rate
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.01)
sgd_reg.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg.n_iter_}")

results["SGD_preds"] = sgd_reg.predict(X_train)

Number of iterations = 46


In [7]:
#SGD Test RMSE
SGD_test_pred = sgd_reg.predict(X_test)
SGD_test_rmse = np.sqrt(mean_squared_error(y_test, SGD_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD", 'rmse': SGD_test_rmse}, index=[0])])

print(f"SGD Test RMSE: {SGD_test_rmse:.3f}")

SGD Test RMSE: 65.459


### 3.3 Fit and test SGDRegression using L2 Regularization

In [8]:
#Stochastic Gradient with L2 regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_l2 = SGDRegressor(max_iter=100, penalty='l2', alpha = 0.1, eta0=0.01)
_ = sgd_reg_l2.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_l2.n_iter_}")

results["SGD_preds_l2"] = sgd_reg_l2.predict(X_train)


Number of iterations = 62


In [9]:
#Train RMSE
#SGD Test RMSE
SGD_test_pred_l2 = sgd_reg_l2.predict(X_test)
SGD_test_rmse_l2 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l2))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD L2", 'rmse': SGD_test_rmse_l2}, index=[0])])

print(f"SGD Test with l2 RMSE: {SGD_test_rmse_l2:.3f}")

SGD Test with l2 RMSE: 68.369


### 3.4 Fit and test SGDRegression Using L1 Regularization

In [10]:
#Stochastic Gradient with L1 regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_l1 = SGDRegressor(max_iter=100, penalty='l1', alpha = 0.1, eta0=0.01)
_ = sgd_reg_l1.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_l1.n_iter_}")

results["SGD_preds_l1"] = sgd_reg_l1.predict(X_train)

Number of iterations = 46


In [11]:
#Train RMSE
#SGD Test RMSE with L1
SGD_test_pred_l1 = sgd_reg_l1.predict(X_test)
SGD_test_rmse_l1 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l1))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD L1", 'rmse': SGD_test_rmse_l1}, index=[0])])

print(f"SGD with l1 Test RMSE: {SGD_test_rmse_l1:.3f}")

SGD with l1 Test RMSE: 65.730


### 3.5 Fit and test SGDRegression using ElasticNet Regularization

In [12]:
#Stochastic Gradient with Elastic Net regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_elastic = SGDRegressor(max_iter=100, penalty='elasticnet', l1_ratio=0.5, alpha = 0.1, eta0=0.01)
sgd_reg_elastic.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_elastic.n_iter_}")

results["SGD_preds_elastic"] = sgd_reg_elastic.predict(X_train)

Number of iterations = 34


In [13]:
#Train RMSE
#SGD Test RMSE with ElasticNet
SGD_test_pred_elastic = sgd_reg_elastic.predict(X_test)
SGD_test_rmse_elastic = np.sqrt(mean_squared_error(y_test, SGD_test_pred_elastic))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Elastic", 'rmse': SGD_test_rmse_elastic}, index=[0])])

print(f"SGD wt ElasticNet Test RMSE: {SGD_test_rmse_elastic:.3f}")

SGD wt ElasticNet Test RMSE: 67.579


### 3.6 Fit and test SGDRegression using Early Stopping 

In [14]:
#Stochastic Gradient with Early Stopping regularization:
# 
# # tol is the early stopping criteria

sgd_reg_es = SGDRegressor(max_iter=500, early_stopping=True, n_iter_no_change=5, tol=0.0001, validation_fraction=0.2, eta0=0.01)
sgd_reg_es.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_es.n_iter_}")

results["SGD_preds_es"] = sgd_reg_es.predict(X_train)

Number of iterations = 8


In [15]:
#Train RMSE
#SGD Test RMSE with Eaarly Stopping
SGD_test_pred_es = sgd_reg_es.predict(X_test)
SGD_test_rmse_es = np.sqrt(mean_squared_error(y_test, SGD_test_pred_es))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD ES", 'rmse': SGD_test_rmse_es}, index=[0])])

print(f"SGD wt Early Stopping Test RMSE: {SGD_test_rmse_es:.3f}")

SGD wt Early Stopping Test RMSE: 68.107


## 4.0 Polynomial Regression

We've already seen an example of this. Let's now try applying this to our AirBnb data.

This is done by creating the polynomial "variables" of the existing variables, then fitting them in a regular regression model

In [16]:
from sklearn.preprocessing import PolynomialFeatures

# Create second degree terms and interaction terms
poly_features = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly_features.transform(X_train)
X_test_poly = poly_features.transform(X_test)

#This will create the polynomial terms of the categorical variables too (since they are encoded as numbers)

#if degree=3, then it creates all combinations: a, a^2, a^3, b, b^2, b^3, a.b, a^2.b, a.b^2, a^2.b^2 

In [17]:
#We still fit a linear regression model

poly_lin_reg = SGDRegressor(max_iter=1000, penalty=None, eta0=0.01) 
poly_lin_reg.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_lin_reg.n_iter_}")

results["SGD_preds_ using polynomial"] = poly_lin_reg.predict(X_train_poly)

Number of iterations = 6


In [18]:
# Train RMSE
# SGD with polynomial input
poly_test_pred = poly_lin_reg.predict(X_test_poly)
poly_test_rmse = np.sqrt(mean_squared_error(y_test, poly_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Poly", 'rmse': poly_test_rmse}, index=[0])])

print(f"SGD wt Polynomial input Test RMSE: {poly_test_rmse:.3f}")

SGD wt Polynomial input Test RMSE: 1714113453796.395


The RMSE result from the polynomial is very large, a strong indicator that this may not be a good model. The problem is most likely related to having mamy coeficients that are not significant. We can use Lasso to reduce the size of some of the coeficients, or reduce the degree of the polynomial.

In [19]:
poly_lin_reg_l1 = SGDRegressor(max_iter=1000, penalty='l1', alpha=0.5,  eta0=0.01) 
poly_lin_reg_l1.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_lin_reg_l1.n_iter_}")

results["SGD_preds_ using polynomial with l1"] = poly_lin_reg_l1.predict(X_train_poly)

poly_test_pred_l1 = poly_lin_reg_l1.predict(X_test_poly)
poly_test_rmse_l1 = np.sqrt(mean_squared_error(y_test, poly_test_pred_l1))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Poly l1", 'rmse': poly_test_rmse_l1}, index=[0])])

print(f"SGD wt Polynomial input l1 regularization Test RMSE: {poly_test_rmse_l1:.3f}")

Number of iterations = 14
SGD wt Polynomial input l1 regularization Test RMSE: 278357632941.825


In [20]:
poly_lin_reg_l2 = SGDRegressor(max_iter=1000, penalty='l2', alpha=0.5,  eta0=0.01) 
poly_lin_reg_l2.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_lin_reg_l2.n_iter_}")

results["SGD_preds_ using polynomial with l2"] = poly_lin_reg_l2.predict(X_train_poly)

poly_test_pred_l2 = poly_lin_reg_l2.predict(X_test_poly)
poly_test_rmse_l2 = np.sqrt(mean_squared_error(y_test, poly_test_pred_l2))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Poly l2", 'rmse': poly_test_rmse_l2}, index=[0])])

print(f"SGD wt Polynomial input l2 regularization Test RMSE: {poly_test_rmse_l2:.3f}")

Number of iterations = 10
SGD wt Polynomial input l2 regularization Test RMSE: 1031453569474.263


In [21]:
poly_lin_reg_elastic = SGDRegressor(max_iter=1000, penalty='elasticnet', l1_ratio=.5, alpha=0.5,  eta0=0.01) 
poly_lin_reg_elastic.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_lin_reg_elastic.n_iter_}")

results["SGD_preds_ using polynomial with elastic net"] = poly_lin_reg_elastic.predict(X_train_poly)

poly_test_pred_elastic = poly_lin_reg_l1.predict(X_test_poly)
poly_test_rmse_elastic= np.sqrt(mean_squared_error(y_test, poly_test_pred_elastic))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Poly elastic", 'rmse': poly_test_rmse_elastic}, index=[0])])

print(f"SGD wt Polynomial input elastic net regularization Test RMSE: {poly_test_rmse_elastic:.3f}")

Number of iterations = 6
SGD wt Polynomial input elastic net regularization Test RMSE: 278357632941.825


### 4.1 Fit and test SGDRegression using Early Stopping 

In [22]:
#Stochastic Gradient with Early Stopping regularization:
# 
# # tol is the early stopping criteria

poly_sgd_reg_es = SGDRegressor(max_iter=500, early_stopping=True, n_iter_no_change=5, tol=0.0001, validation_fraction=0.2, eta0=0.01)
poly_sgd_reg_es.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_sgd_reg_es.n_iter_}")

results["POLY_SGD_preds_es"] = poly_sgd_reg_es.predict(X_train_poly)

Number of iterations = 15


In [23]:
#Train RMSE
#SGD Test RMSE with Early Stopping
poly_SGD_test_pred_es = poly_sgd_reg_es.predict(X_test_poly)
poly_SGD_test_rmse_es = np.sqrt(mean_squared_error(y_test, poly_SGD_test_pred_es))

rmses = pd.concat([rmses, pd.DataFrame({'model':"poly SGD ES", 'rmse': poly_SGD_test_rmse_es}, index=[0])])

print(f"SGD wt Early Stopping Test RMSE: {poly_SGD_test_rmse_es:.3f}")

SGD wt Early Stopping Test RMSE: 594744529656.351


## 5.0 Summary

In [24]:
rmses.sort_values(by=['rmse'])

Unnamed: 0,model,rmse
0,SGD,65.45902
0,SGD L1,65.73027
0,SGD Elastic,67.57895
0,SGD ES,68.10702
0,SGD L2,68.36941
0,baseline,102.6252
0,SGD Poly l1,278357600000.0
0,SGD Poly elastic,278357600000.0
0,poly SGD ES,594744500000.0
0,SGD Poly l2,1031454000000.0
