# Gradient Descent Models

We will predict the price (the `price` column) of each listing in the AirBNB dataset we used last week.

**Therefore, our unit of analysis is an AIRBNB LISTING**

## 1. Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor

# Seed NumPy's global RNG so the stochastic fits below are repeatable on a
# fresh Restart & Run All (none of the SGDRegressor calls pass random_state).
# NOTE(review): this relies on scikit-learn falling back to the global NumPy
# RNG when random_state is None — confirm for the installed version.
np.random.seed(1)

## 2. Load the data

We will use the AirBNB data that we cleaned in the last class (the original, not the version that you altered for last week's exercise).

In [2]:
# Feature matrices for the train and test splits (read in that order).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/airbnb_train_X_price.csv",
        "./data/airbnb_test_X_price.csv",
    )
)

# Matching target frames for the same two splits.
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/airbnb_train_y_price.csv",
        "./data/airbnb_test_y_price.csv",
    )
)

## 3. Model the data

First, we will create a dataframe to hold all the results of our models.

In [3]:
# Training-set predictions, one column per model, next to the observed target.
# Passing a DataFrame together with columns=["actual"] makes pandas *select* a
# column named "actual" from it; unless y_train already has a column with that
# exact name the result is an all-NaN column. Copy the target values in
# explicitly instead (np.ravel flattens the single-column frame, the same way
# the fit calls in this notebook do) and keep y_train's index.
results = pd.DataFrame({"actual": np.ravel(y_train)}, index=y_train.index)

# Test-set RMSE per model. The empty columns are given explicit dtypes so that
# "rmse" stays numeric once rows are concatenated onto this frame.
rmses = pd.DataFrame(
    {
        "model": pd.Series(dtype="object"),
        "rmse": pd.Series(dtype="float64"),
    }
)

### 3.1 Fit and test the baseline Model

In [4]:
# Baseline model: ignores the features and always predicts the mean of the
# training target. fit() returns the estimator, so it can be chained.
dummy_reg = DummyRegressor(strategy="mean").fit(X_train, y_train)

# Store the baseline's training-set predictions alongside the other models'.
results["dummy"] = dummy_reg.predict(X_train)

In [5]:
# Baseline test RMSE
dummy_test_pred = dummy_reg.predict(X_test)
dummy_test_rmse = np.sqrt(mean_squared_error(y_test, dummy_test_pred))

# Record the score. Any existing "baseline" row is dropped first so re-running
# this cell does not append a duplicate, and ignore_index=True gives every row
# a unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "baseline"],
        pd.DataFrame({"model": ["baseline"], "rmse": [dummy_test_rmse]}),
    ],
    ignore_index=True,
)

print(f"Baseline Test RMSE: {dummy_test_rmse:.3f}")


Baseline Test RMSE: 102.625


### 3.2  Fit and test a Linear Regression Using Stochastic Gradient Descent

In [6]:
# Linear regression fitted with stochastic gradient descent, no regularization.
# Documentation:
#   https://scikit-learn.org/stable/modules/sgd.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor
#
# Hyperparameters used here:
#   eta0     - initial learning rate
#   max_iter - maximum number of passes over the training data (i.e., epochs)
#   penalty  - None disables the regularization term

# The single-column y_train frame is flattened to 1-D for the fit.
sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.01).fit(
    X_train, np.ravel(y_train)
)

# n_iter_ reports how many epochs actually ran before the fit stopped.
print(f"Number of iterations = {sgd_reg.n_iter_}")

results["SGD_preds"] = sgd_reg.predict(X_train)

Number of iterations = 46


In [7]:
# SGD test RMSE
SGD_test_pred = sgd_reg.predict(X_test)
SGD_test_rmse = np.sqrt(mean_squared_error(y_test, SGD_test_pred))

# Record the score. Any existing "SGD" row is dropped first so re-running this
# cell does not append a duplicate, and ignore_index=True gives every row a
# unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "SGD"],
        pd.DataFrame({"model": ["SGD"], "rmse": [SGD_test_rmse]}),
    ],
    ignore_index=True,
)

print(f"SGD Test RMSE: {SGD_test_rmse:.3f}")

SGD Test RMSE: 65.459


### 3.3 Fit and test SGDRegression using L2 Regularization

In [8]:
# Stochastic gradient descent with L2 regularization.
#
# Hyperparameters used here:
#   eta0     - initial learning rate
#   penalty  - regularization term ('l2')
#   alpha    - regularization strength (lambda)
#   max_iter - maximum number of passes over the training data (i.e., epochs)

# The single-column y_train frame is flattened to 1-D for the fit.
sgd_reg_l2 = SGDRegressor(
    max_iter=100,
    penalty='l2',
    alpha=0.1,
    eta0=0.01,
).fit(X_train, np.ravel(y_train))

# n_iter_ reports how many epochs actually ran before the fit stopped.
print(f"Number of iterations = {sgd_reg_l2.n_iter_}")

results["SGD_preds_l2"] = sgd_reg_l2.predict(X_train)


Number of iterations = 62


In [9]:
# SGD with L2 test RMSE (the score is computed on the test split, not train)
SGD_test_pred_l2 = sgd_reg_l2.predict(X_test)
SGD_test_rmse_l2 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l2))

# Record the score. Any existing "SGD L2" row is dropped first so re-running
# this cell does not append a duplicate, and ignore_index=True gives every row
# a unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "SGD L2"],
        pd.DataFrame({"model": ["SGD L2"], "rmse": [SGD_test_rmse_l2]}),
    ],
    ignore_index=True,
)

print(f"SGD Test with l2 RMSE: {SGD_test_rmse_l2:.3f}")

SGD Test with l2 RMSE: 68.369


### 3.4 Fit and test SGDRegression Using L1 Regularization

In [10]:
# Stochastic gradient descent with L1 regularization.
#
# Hyperparameters used here:
#   eta0     - initial learning rate
#   penalty  - regularization term ('l1')
#   alpha    - regularization strength (lambda)
#   max_iter - maximum number of passes over the training data (i.e., epochs)

# The single-column y_train frame is flattened to 1-D for the fit.
sgd_reg_l1 = SGDRegressor(
    max_iter=100,
    penalty='l1',
    alpha=0.1,
    eta0=0.01,
).fit(X_train, np.ravel(y_train))

# n_iter_ reports how many epochs actually ran before the fit stopped.
print(f"Number of iterations = {sgd_reg_l1.n_iter_}")

results["SGD_preds_l1"] = sgd_reg_l1.predict(X_train)

Number of iterations = 46


In [11]:
# SGD with L1 test RMSE (the score is computed on the test split, not train)
SGD_test_pred_l1 = sgd_reg_l1.predict(X_test)
SGD_test_rmse_l1 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l1))

# Record the score. Any existing "SGD L1" row is dropped first so re-running
# this cell does not append a duplicate, and ignore_index=True gives every row
# a unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "SGD L1"],
        pd.DataFrame({"model": ["SGD L1"], "rmse": [SGD_test_rmse_l1]}),
    ],
    ignore_index=True,
)

print(f"SGD with l1 Test RMSE: {SGD_test_rmse_l1:.3f}")

SGD with l1 Test RMSE: 65.730


### 3.5 Fit and test SGDRegression using ElasticNet Regularization

In [12]:
# Stochastic gradient descent with Elastic Net regularization.
#
# Hyperparameters used here:
#   eta0     - initial learning rate
#   penalty  - regularization term ('elasticnet')
#   l1_ratio - mixing weight between the L1 and L2 parts of the penalty
#   alpha    - regularization strength (lambda)
#   max_iter - maximum number of passes over the training data (i.e., epochs)

# The single-column y_train frame is flattened to 1-D for the fit.
sgd_reg_elastic = SGDRegressor(
    max_iter=100,
    penalty='elasticnet',
    l1_ratio=0.5,
    alpha=0.1,
    eta0=0.01,
).fit(X_train, np.ravel(y_train))

# n_iter_ reports how many epochs actually ran before the fit stopped.
print(f"Number of iterations = {sgd_reg_elastic.n_iter_}")

results["SGD_preds_elastic"] = sgd_reg_elastic.predict(X_train)

Number of iterations = 34


In [13]:
# SGD with ElasticNet test RMSE (computed on the test split, not train)
SGD_test_pred_elastic = sgd_reg_elastic.predict(X_test)
SGD_test_rmse_elastic = np.sqrt(mean_squared_error(y_test, SGD_test_pred_elastic))

# Record the score. Any existing "SGD Elastic" row is dropped first so
# re-running this cell does not append a duplicate, and ignore_index=True gives
# every row a unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "SGD Elastic"],
        pd.DataFrame({"model": ["SGD Elastic"], "rmse": [SGD_test_rmse_elastic]}),
    ],
    ignore_index=True,
)

print(f"SGD wt ElasticNet Test RMSE: {SGD_test_rmse_elastic:.3f}")

SGD wt ElasticNet Test RMSE: 67.579


### 3.6 Fit and test SGDRegression using Early Stopping 

In [14]:
# Stochastic gradient descent regularized through early stopping.
#
# Hyperparameters used here:
#   early_stopping      - turn on validation-based stopping
#   validation_fraction - share of the training data held out for validation
#   tol                 - the early stopping criterion (minimum improvement)
#   n_iter_no_change    - epochs without improvement tolerated before stopping
#   max_iter            - maximum number of passes over the training data
#   eta0                - initial learning rate

# The single-column y_train frame is flattened to 1-D for the fit.
sgd_reg_es = SGDRegressor(
    max_iter=500,
    early_stopping=True,
    n_iter_no_change=5,
    tol=0.0001,
    validation_fraction=0.2,
    eta0=0.01,
).fit(X_train, np.ravel(y_train))

# n_iter_ reports how many epochs actually ran before the fit stopped.
print(f"Number of iterations = {sgd_reg_es.n_iter_}")

results["SGD_preds_es"] = sgd_reg_es.predict(X_train)

Number of iterations = 8


In [15]:
# SGD with early stopping test RMSE (computed on the test split, not train)
SGD_test_pred_es = sgd_reg_es.predict(X_test)
SGD_test_rmse_es = np.sqrt(mean_squared_error(y_test, SGD_test_pred_es))

# Record the score. Any existing "SGD ES" row is dropped first so re-running
# this cell does not append a duplicate, and ignore_index=True gives every row
# a unique label instead of repeating 0.
rmses = pd.concat(
    [
        rmses[rmses["model"] != "SGD ES"],
        pd.DataFrame({"model": ["SGD ES"], "rmse": [SGD_test_rmse_es]}),
    ],
    ignore_index=True,
)

print(f"SGD wt Early Stopping Test RMSE: {SGD_test_rmse_es:.3f}")

SGD wt Early Stopping Test RMSE: 68.107


## 4. Summary

In [16]:
# Rank the models from lowest to highest test RMSE. The index is reset so the
# displayed row labels follow the ranking rather than whatever labels the rows
# carried when they were appended.
rmses.sort_values(by="rmse").reset_index(drop=True)

Unnamed: 0,model,rmse
0,SGD,65.459018
0,SGD L1,65.730268
0,SGD Elastic,67.578949
0,SGD ES,68.107018
0,SGD L2,68.369405
0,baseline,102.62523
