# Model Visualizations

### Notebook Summary:

#### Objective: provide visualizations that compare the outcomes of all modeling steps

* Please go to the "modeling" directory to locate the relevant notebooks
* Relevant notebook names: "baseline_regression", "new_baseline_regression", "poly_regressions", "regularization", "ensembles"
* Models compared are: 
    * The initial Linear Regression baseline model
    * The new Linear Regression baseline after feature extraction, transformation, binarization, and ETL
    * Decision Tree, Support Vector, and KNN regressions
    * Linear Regression after adding interactions
    * Linear Regressions after adding interactions and performing feature selection
    * Lasso, Ridge, and Elastic Net regressions
    * Random Forest, Bagging, AdaBoost, and Gradient Boosting ensemble regressions

#### Conclusions: 
* We choose the tuned Lasso model because it offers the best trade-off between bias and variance

### Plot for First Batch of Regressions

In [None]:
# Tick labels for the first-batch comparison plot, one per entry of the
# train/validation/test error lists and in the same order.
# The error lists hold eight models; the entry for the ratios model without
# feature selection (scores_lin_ratios / test_lin_ratios_nofs) previously had
# no label, which shifted every later label onto the wrong bar group.
X = [
    'Linear',
    'Decision Tree',
    'Support Vector',
    'KNN',
    'Linear w Interactions',
    'Linear w Ratios',
    'Linear w Feature Sel',
    'Quadratic',
]

In [None]:
# Training error for each first-batch model, in plotting order.
# Each entry negates the mean of a stored score array and takes the square
# root; the score arrays themselves are produced outside this notebook section.
# NOTE(review): three entries carry an extra +.001 that the validation and
# test lists do not have; presumably it keeps near-zero bars visible on the
# chart -- confirm that this offset is intended.
def _neg_score_to_rmse(neg_scores):
    """Return the square root of the negated mean of ``neg_scores``."""
    return np.sqrt(-neg_scores.mean())


_cv_train_key = 'train_neg_mean_squared_error'
_gs_train_key = 'mean_train_score'

train_errors = [
    _neg_score_to_rmse(scores_lin[_cv_train_key]),
    _neg_score_to_rmse(scores_tree[_cv_train_key]) + .001,
    _neg_score_to_rmse(gs_results_svreg[_gs_train_key]),
    _neg_score_to_rmse(gs_results_nn[_gs_train_key]),
    _neg_score_to_rmse(scores_lin_intonly[_cv_train_key]) + .001,
    _neg_score_to_rmse(scores_lin_ratios[_cv_train_key]),
    _neg_score_to_rmse(scores_lin_ratios_fsel[_cv_train_key]),
    _neg_score_to_rmse(scores_quad[_cv_train_key]) + .001,
]

In [None]:
# Validation error for each first-batch model, in the same order as the
# training list: the stored score array is averaged, negated and square-rooted.
# The 'test_*' / 'mean_test_score' keys are read from result objects that are
# created outside this notebook section.
val_errors = [
    np.sqrt(-results[score_key].mean())
    for results, score_key in (
        (scores_lin, 'test_neg_mean_squared_error'),
        (scores_tree, 'test_neg_mean_squared_error'),
        (gs_results_svreg, 'mean_test_score'),
        (gs_results_nn, 'mean_test_score'),
        (scores_lin_intonly, 'test_neg_mean_squared_error'),
        (scores_lin_ratios, 'test_neg_mean_squared_error'),
        (scores_lin_ratios_fsel, 'test_neg_mean_squared_error'),
        (scores_quad, 'test_neg_mean_squared_error'),
    )
]

In [None]:
# Hold-out error for each first-batch model, in the same order as the
# training and validation lists.
def _holdout_rmse(actual, predicted):
    """Return the square root of mean_squared_error(actual, predicted)."""
    return np.sqrt(mean_squared_error(actual, predicted))


test_errors = [
    lin_reg_rmse_test,  # used as-is; presumably already an RMSE -- confirm
    _holdout_rmse(y_test, test_predictions_tree_reg),
    _holdout_rmse(y_test, test_predictions_best_model_svr),
    _holdout_rmse(y_test, test_predictions_best_model_kneigh),
    _holdout_rmse(y_test_intonly, test_predictions_lin_reg_intonly),
    test_lin_ratios_nofs,  # used as-is; presumably already an RMSE -- confirm
    _holdout_rmse(y_test, test_predictions_ratios_lin_reg),
    _holdout_rmse(y_test_quad, test_predictions_quad_reg),
]

In [None]:
# Grouped bar chart for the first batch of regressions: one group per label
# in X, with one bar each from train_errors, val_errors and test_errors.
fig, ax = plt.subplots()

bar_width = 0.2
group_centres = list(range(len(train_errors)))
train_pos = [centre - bar_width for centre in group_centres]
test_pos = [centre + bar_width for centre in group_centres]

ax.bar(train_pos, train_errors, width=bar_width, color='b', align='center', label='Train')
ax.bar(group_centres, val_errors, width=bar_width, color='g', align='center', label='Validation')
ax.bar(test_pos, test_errors, width=bar_width, color='r', align='center', label='Test')

# Fix the tick positions first and only then attach the labels; this also
# avoids needing matplotlib.ticker, which is not imported before this cell.
# Ticks keep the original 0.05 shift to the left of each group's centre.
# set_xticklabels expects one label per tick, so X must list the models in the
# same number and order as the error lists.
ax.set_xticks([centre - 0.05 for centre in group_centres])
# The labels are long, so they are rotated to keep them from overlapping.
ax.set_xticklabels(X, rotation=45, ha='right')

ax.set_xlabel('Models')
ax.set_ylabel('RMSE')
ax.set_title('Regression Models Comparison')
# Without a legend the three bar colours cannot be told apart.
ax.legend()
plt.show()

### Plot for Regularized Regressions

In [None]:
# Displays the quadratic model's training error (square root of the negated
# mean score); the value is shown as cell output only and is not stored.
quad_train_scores = scores_quad['train_neg_mean_squared_error']
np.sqrt(-quad_train_scores.mean())

In [None]:
# Tick labels for the regularized-models plot, one per bar group.
X = [
    'Lasso',
    'Ridge',
    'Elastic Net',
]

In [None]:
# Train / validation / test errors for the regularized models, ordered
# Lasso, Ridge, Elastic Net.
# NOTE(review): the train and validation entries are plain means of stored
# score arrays (no sign flip, no square root) -- presumably those arrays
# already hold RMSE values; confirm against the notebook that creates them.
train_errors = [
    fold_scores.mean()
    for fold_scores in (train_scores_lasso, train_scores_ridge, train_scores_en)
]
val_errors = [
    fold_scores.mean()
    for fold_scores in (val_scores_lasso, val_scores_ridge, val_scores_en)
]
# Hold-out error: square root of the mean squared error against y_test.
test_errors = [
    np.sqrt(mean_squared_error(y_test, predicted))
    for predicted in (test_predictions_lasso, test_predictions_ridge, test_predictions_en)
]

In [None]:
# Displays the Lasso hold-out error (square root of the mean squared error
# against y_test); the value is shown as cell output only and is not stored.
lasso_test_mse = mean_squared_error(y_test, test_predictions_lasso)
np.sqrt(lasso_test_mse)

In [None]:
# Grouped bar chart for the regularized regressions: one group per label in
# X, with one bar each from train_errors, val_errors and test_errors.
fig, ax = plt.subplots()

bar_width = 0.2
group_centres = list(range(len(train_errors)))
train_pos = [centre - bar_width for centre in group_centres]
test_pos = [centre + bar_width for centre in group_centres]

ax.bar(train_pos, train_errors, width=bar_width, color='b', align='center', label='Train')
ax.bar(group_centres, val_errors, width=bar_width, color='g', align='center', label='Validation')
ax.bar(test_pos, test_errors, width=bar_width, color='r', align='center', label='Test')

# Fix the tick positions first and only then attach the labels; this also
# avoids needing matplotlib.ticker, which is not imported before this cell.
# Ticks keep the original 0.05 shift to the left of each group's centre.
ax.set_xticks([centre - 0.05 for centre in group_centres])
ax.set_xticklabels(X)

ax.set_xlabel('Models')
ax.set_ylabel('RMSE')
ax.set_title('Regularized Models Comparison')
# Without a legend the three bar colours cannot be told apart.
ax.legend()
plt.show()

### Plot for Ensembles

In [None]:
# Tick labels for the ensembles plot, one per bar group.
X = [
    'Random Forest',
    'Bagging',
    'AdaBoost',
    'Gradient Boosting',
]

In [None]:
# Displays the random forest's training error (square root of the negated
# mean score); the value is shown as cell output only and is not stored.
rfr_train_scores = best_rfr_scores['train_neg_mean_squared_error']
np.sqrt(-rfr_train_scores.mean())

In [None]:
# Train / validation / test errors for the ensemble models, ordered
# Random Forest, Bagging, AdaBoost, Gradient Boosting.
# The random forest entries are derived from negated scores (sign flip plus
# square root); the other three are plain means of stored score arrays.
# NOTE(review): presumably the *_br / *_abr / *_gbr arrays already hold RMSE
# values -- confirm against the notebook that creates them.
rfr_train_scores = best_rfr_scores['train_neg_mean_squared_error']
rfr_val_scores = best_rfr_scores['test_neg_mean_squared_error']

train_errors = [
    np.sqrt(-rfr_train_scores.mean()),
    *(fold_scores.mean() for fold_scores in (train_scores_br, train_scores_abr, train_scores_gbr)),
]

val_errors = [
    np.sqrt(-rfr_val_scores.mean()),
    *(fold_scores.mean() for fold_scores in (val_scores_br, val_scores_abr, val_scores_gbr)),
]

# Hold-out error: square root of the mean squared error against y_test.
test_errors = [
    np.sqrt(mean_squared_error(y_test, predicted))
    for predicted in (
        test_predictions_rf,
        test_predictions_br,
        test_predictions_abr,
        test_predictions_gbr,
    )
]

In [None]:
# Import kept in this cell so that `ticker` stays available to any other cell
# that references it; this cell itself no longer needs it.
import matplotlib.ticker as ticker

# Grouped bar chart for the ensemble regressions: one group per label in X,
# with one bar each from train_errors, val_errors and test_errors.
fig, ax = plt.subplots()

bar_width = 0.2
group_centres = list(range(len(train_errors)))
train_pos = [centre - bar_width for centre in group_centres]
test_pos = [centre + bar_width for centre in group_centres]

ax.bar(train_pos, train_errors, width=bar_width, color='b', align='center', label='Train')
ax.bar(group_centres, val_errors, width=bar_width, color='g', align='center', label='Validation')
ax.bar(test_pos, test_errors, width=bar_width, color='r', align='center', label='Test')

# Fix the tick positions first and only then attach the labels.
# Ticks keep the original 0.05 shift to the left of each group's centre.
ax.set_xticks([centre - 0.05 for centre in group_centres])
ax.set_xticklabels(X)

ax.set_xlabel('Models')
ax.set_ylabel('RMSE')
ax.set_title('Ensembles Comparison')
# Without a legend the three bar colours cannot be told apart.
ax.legend()
plt.show()

In [None]:
# Ensemble Regression Models