# Introduction

### Imports

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV, RidgeCV, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

# Raise pandas' display limits to 150 columns and 150 rows
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import Data</a></span></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Modeling</a></span></li><li><span><a href="#Evaluate-Best-Model" data-toc-modified-id="Evaluate-Best-Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Evaluate Best Model</a></span></li></ul></div>

# Import Data

# Modeling

In [None]:
# Baseline: ordinary least squares of data_y on data_x.
# add_constant appends the intercept column that sm.OLS does not add itself.
X = sm.add_constant(data_x)
ols_model = sm.OLS(list(data_y), X)
results = ols_model.fit()

# Last expression of the cell, so the regression summary is rendered.
results.summary()

In [None]:
def model_results(model_list, train_x, train_y, test_x, test_y,
                  alphas=(.01, .1, 1, 10, 100, 1000), cv=3):
    """Fit each model on the training data and tabulate hold-out metrics.

    Dispatch is by the *name* (dict key) of each model:

    * ``'elastic'`` and ``'lasso'`` are wrapped in a ``GridSearchCV`` over
      ``alpha`` (candidates from ``alphas``, ``cv`` folds); the selected
      alpha and the best estimator's coefficients are recorded.
    * every other model is fitted directly; its coefficients are recorded
      rounded to whole numbers, and, unless the name is ``'linear'``,
      its fitted ``alpha_`` attribute is recorded as ``best_alpha``.

    Parameters
    ----------
    model_list : dict
        Mapping of model name -> unfitted estimator exposing ``fit``,
        ``predict`` and ``score``.
    train_x, train_y
        Training features and target, passed straight to ``fit``.
    test_x, test_y
        Hold-out features and target used for every reported metric.
    alphas : sequence of float, optional
        Candidate ``alpha`` values for the grid search.
    cv : int, optional
        Number of cross-validation folds for the grid search.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``model_name``, with columns
        ``coef``, ``best_alpha`` (where available), ``r2`` (the estimator's
        own ``score``), ``root_MSE``, ``MAE`` and ``MAPE`` (in percent).
        An empty ``model_list`` yields an empty frame with that index name.

    Notes
    -----
    MAPE is undefined where the true target is 0; such rows produce
    ``inf``/``nan`` in the mean rather than raising.
    """
    # Loop-invariant: build the search grid once.
    param_grid = {'alpha': list(alphas)}

    # Flatten the target once so all error metrics use plain positional
    # arithmetic, regardless of whether test_y is a Series, a one-column
    # frame or an array.
    y_true = np.asarray(test_y, dtype=float).ravel()

    records = []
    for name, model in model_list.items():
        print(name)
        model_stats = {}

        if name in ['elastic', 'lasso']:
            model = GridSearchCV(model, param_grid, cv=cv)
            model.fit(train_x, train_y)
            model_stats['best_alpha'] = model.best_params_['alpha']
            model_stats['coef'] = model.best_estimator_.coef_
        else:
            model.fit(train_x, train_y)
            # NOTE(review): coefficients are rounded to integers here but
            # left unrounded in the grid-search branch above -- kept as-is
            # for backward compatibility; confirm which form is intended.
            model_stats['coef'] = np.around(model.coef_)
            if name != 'linear':
                model_stats['best_alpha'] = model.alpha_

        y_pred = np.asarray(model.predict(test_x), dtype=float).ravel()
        abs_err = np.abs(y_true - y_pred)

        model_stats['r2'] = model.score(test_x, test_y)
        model_stats['root_MSE'] = np.sqrt(np.mean(abs_err ** 2))
        model_stats['MAE'] = abs_err.mean()
        # Divide by |y|: dividing by the signed target made errors on
        # negative targets negative, so they cancelled positive ones.
        with np.errstate(divide='ignore', invalid='ignore'):
            model_stats['MAPE'] = (abs_err / np.abs(y_true)).mean() * 100
        model_stats['model_name'] = name

        records.append(model_stats)

    if not records:
        # set_index would raise KeyError on a frame with no columns.
        return pd.DataFrame(index=pd.Index([], name='model_name'))

    model_df = pd.DataFrame(records).set_index('model_name')

    return model_df

In [None]:
# Fixed seed so the train/test partition, and every metric derived from it,
# is the same on each Restart & Run All.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=.2, random_state=RANDOM_STATE
)

# Candidate models, keyed by the names model_results() dispatches on.
elastic = ElasticNet()
# RidgeCV picks its own alpha; the candidates now include 0.1 so ridge is
# tuned over the same values as the grid used for lasso and elastic net.
ridge = RidgeCV(alphas=(.01, .1, 1, 10, 100, 1000))
lasso = Lasso()
linear = LinearRegression()
model_list = {'elastic': elastic, 'ridge': ridge, 'lasso': lasso, 'linear': linear}

In [None]:
# Fit every candidate model and collect its hold-out metrics in one table.
model_df = model_results(
    model_list=model_list,
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
    test_y=test_y,
)

# Last expression of the cell, so the comparison table is rendered.
model_df

# Evaluate Best Model