# Modeling Exercises

Do your work for this exercise in a Jupyter notebook named `modeling` within the `regression-exercises` repo. Add, commit, and push your work.

## 1. Select a dataset with a continuous target variable.

In [22]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE, mutual_info_regression
from sklearn.linear_model import LinearRegression

import wrangle as wg
import evaluate as ev

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
# Load the `tips` dataset through pydataset's `data` loader.
tips = data('tips')

In [3]:
# Preview the first rows of the raw data before any preparation.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


## 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [4]:
# Columns handed to the project's wrangle helper for rescaling.
to_scale = ['total_bill']

# wg.scale is a project helper whose source is not shown here. Judging by the
# three-way unpacking and the scaled `total_bill` values displayed afterwards,
# it returns train/validate/test frames with the listed columns rescaled —
# presumably with the scaler fit on train only; TODO confirm in wrangle.py.
train, val, test = wg.scale(tips, scaled_cols=to_scale)

In [5]:
# Spot-check the training split: `total_bill` should now be on the scaled range.
train.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
190,0.419564,4.0,Male,Yes,Sun,Dinner,3
146,0.110599,1.5,Female,No,Thur,Lunch,2
75,0.24424,2.2,Female,No,Sat,Dinner,2


In [6]:
# One-hot encode the two-level categoricals. drop_first=True leaves a single
# indicator column per feature (sex_Male, smoker_Yes, time_Lunch).
train = pd.get_dummies(
    data=train,
    columns=['sex', 'smoker', 'time'],
    drop_first=True,
)

In [7]:
# Confirm the three indicator columns replaced the original text columns.
train.head()

Unnamed: 0,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch
190,0.419564,4.0,Sun,3,1,1,0
146,0.110599,1.5,Thur,2,0,0,1
75,0.24424,2.2,Sat,2,0,0,0
235,0.260997,3.0,Sat,2,1,1,0
199,0.208002,2.0,Thur,2,0,1,1


In [8]:
# Shorter, friendlier names for the indicator columns produced by get_dummies.
rename = dict(sex_Male='male', smoker_Yes='smoker', time_Lunch='Lunch')

train = train.rename(columns=rename)

In [9]:
# One-hot encode the day of the week. Every level is kept (drop_first=False),
# so the day_* indicators always sum to 1 within a row.
train = pd.get_dummies(
    data=train,
    columns=['day'],
    drop_first=False,
)

In [10]:
# Treat party size as a categorical feature: one indicator per observed size,
# with every level kept (drop_first=False).
train = pd.get_dummies(
    data=train,
    columns=['size'],
    drop_first=False,
)

In [11]:
# Final look at the fully encoded training frame (all columns numeric).
train.head()

Unnamed: 0,total_bill,tip,male,smoker,Lunch,day_Fri,day_Sat,day_Sun,day_Thur,size_1,size_2,size_3,size_4,size_5,size_6
190,0.419564,4.0,1,1,0,0,0,1,0,0,0,1,0,0,0
146,0.110599,1.5,0,0,1,0,0,0,1,0,1,0,0,0,0
75,0.24424,2.2,0,0,0,0,1,0,0,0,1,0,0,0,0
235,0.260997,3.0,1,1,0,0,1,0,0,0,1,0,0,0,0
199,0.208002,2.0,0,1,1,0,0,0,1,0,1,0,0,0,0


In [12]:
# Validate split before encoding; it still carries the text columns.
val.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
137,0.152074,2.0,Female,No,Thur,Lunch,2
129,0.174068,2.0,Female,No,Thur,Lunch,2
16,0.387725,3.92,Male,No,Sun,Dinner,2
140,0.211353,2.75,Female,No,Thur,Lunch,2
161,0.386049,3.5,Male,No,Sun,Dinner,4


In [13]:
# Encode validate and force it onto train's exact column layout.
# Encoding each split independently only yields matching columns when every
# category level happens to appear in both splits; a level missing from
# validate (e.g. a rare party size) would otherwise leave validate with a
# different set of columns than the model was fit on.
rename = {'sex_Male':'male', 'smoker_Yes': 'smoker', 'time_Lunch':'Lunch'}
# Encode every level here (no drop_first); the reindex below decides which
# indicators survive, so the result does not depend on which levels validate
# happens to contain.
val = pd.get_dummies(val, columns=['sex', 'smoker', 'time', 'day', 'size'], drop_first=False)
val = val.rename(columns=rename)
# Keep exactly the columns of the already-encoded train frame, in the same
# order: indicators absent from validate are added as all-zero columns, and
# indicators train does not have (its dropped reference levels) are discarded.
val = val.reindex(columns=train.columns, fill_value=0)

In [14]:
# Check that validate now has the same encoded columns as train.
val.head()

Unnamed: 0,total_bill,tip,male,smoker,Lunch,day_Fri,day_Sat,day_Sun,day_Thur,size_1,size_2,size_3,size_4,size_5,size_6
137,0.152074,2.0,0,0,1,0,0,0,1,0,1,0,0,0,0
129,0.174068,2.0,0,0,1,0,0,0,1,0,1,0,0,0,0
16,0.387725,3.92,1,0,0,0,0,1,0,0,1,0,0,0,0
140,0.211353,2.75,0,0,1,0,0,0,1,0,1,0,0,0,0
161,0.386049,3.5,1,0,0,0,0,1,0,0,0,0,1,0,0


## 3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [15]:
# Separate the predictors from the target column `tip` for each split.
# wg.split_xy is a project helper (source not shown); the two-name unpacking
# relies on it returning an (X, y) pair — TODO confirm in wrangle.py.
X_train, y_train = wg.split_xy(train, 'tip')
X_val, y_val = wg.split_xy(val, 'tip')

In [16]:
# Baseline candidates: predict the training mean or the training median of the
# target for every row, stored next to the actual values for comparison.
results = (
    pd.DataFrame({'actual': y_train})
    .assign(
        y_mean=y_train.mean(),
        y_median=y_train.median(),
    )
)

In [17]:
# Each baseline column holds one constant value repeated on every row.
results.head()

Unnamed: 0,actual,y_mean,y_median
190,4.0,2.873765,2.715
146,1.5,2.873765,2.715
75,2.2,2.873765,2.715
235,3.0,2.873765,2.715
199,2.0,2.873765,2.715


In [18]:
# Error of the mean baseline on train. ev.eval_model is a project helper;
# presumably it returns RMSE, matching the metric printed for the models in
# later cells — TODO confirm in evaluate.py.
ev.eval_model(results.actual, results.y_mean)

1.2763251258945032

In [19]:
# Error of the median baseline on train, computed with the same project helper
# so the two baselines can be compared directly.
ev.eval_model(results.actual, results.y_median)

1.2861617545330484

The mean baseline did ever so slightly better than the median baseline (an error of about 1.276 vs. 1.286), so we will use the mean as our baseline.

In [20]:
# Ordinary least squares on the encoded features. ev.train_model is a project
# helper; judging by the printed output it reports the train and validate RMSE
# for the estimator it is given (presumably fitting it on the train data first).
lm = LinearRegression()

ev.train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 0.9728795175346702.
The validate RMSE is 1.1462331975916555.


In [24]:
# LassoLars with no penalty. Per the scikit-learn documentation, alpha=0 is
# equivalent to ordinary least squares (and LinearRegression is the recommended
# way to get it), which is why the scores below match the LinearRegression run.
ll = LassoLars(alpha=0)

ev.train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 0.9728795175346702.
The validate RMSE is 1.146233197591656.


In [38]:
# LassoLars with a small L1 penalty (alpha=.03). Note that `ll` is rebound
# here, replacing the alpha=0 estimator from the previous cell.
ll = LassoLars(alpha=.03)

ev.train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 1.023337123748478.
The validate RMSE is 1.1343591021122756.


In [39]:
# Expand the features with all terms up to degree 2 (the PolynomialFeatures
# default, spelled out here). The transformer is fit on train only and then
# re-applied to validate, so both matrices share the same columns.
poly = PolynomialFeatures(degree=2)
X_train_s = poly.fit(X_train).transform(X_train)
X_val_s = poly.transform(X_val)

In [41]:
# Linear regression on the degree-2 feature expansion. `lm` is rebound to a
# fresh estimator, replacing the plain linear model from the earlier cell.
# NOTE(review): the train RMSE printed for this cell is worse than both the
# plain linear model and the mean baseline. The expanded features include the
# original columns, so a least-squares fit should not do worse on its own
# training data. This suggests a numerical problem or an issue in how
# ev.train_model handles the array inputs — investigate.
lm = LinearRegression()

ev.train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 1.6388030149139878.
The validate RMSE is 2.2931339661747465.


In [42]:
# Generalized linear model with scikit-learn's default TweedieRegressor
# settings, evaluated on the polynomial features.
tweedie = TweedieRegressor()

ev.train_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 1.1413984702818962.
The validate RMSE is 1.5485725355573339.


In [43]:
# Fix the seed so the forest's random sampling — and therefore the RMSEs
# reported below — are reproducible across kernel restarts. Without it every
# re-run prints slightly different scores.
rf = RandomForestRegressor(random_state=42)

# Score the same estimator on the encoded features first, then on the
# polynomial expansion; the blank print() separates the two reports.
ev.train_model(rf, X_train, y_train, X_val, y_val)
print()
ev.train_model(rf, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 0.4239838379966306.
The validate RMSE is 0.8736748261749856.

The train RMSE is 0.38359729358672345.
The validate RMSE is 0.8928673071112568.


In [45]:
# Gradient-boosted trees with default settings on the encoded features.
# NOTE(review): the printed train RMSE is close to zero while the validate RMSE
# is about 1.1, so the default model is heavily overfit to the training data.
xgbr = XGBRegressor()

ev.train_model(xgbr, X_train, y_train, X_val, y_val)

The train RMSE is 0.021288647421442104.
The validate RMSE is 1.0962602655852232.
