# Modeling

In [1]:
import sys

sys.path.append('/Users/nickolaspedrimiranda/CodeUp/Regression-Project/')

In [2]:
# Import basic DS libraries
import pandas as pd
import numpy as np

# Import libraries for visuals
import matplotlib.pyplot as plt
import seaborn as sns

# Import useful DS functions
import acquire as ac
import prepare as prep
import evaluate as ev
import explore as ex

# For modeling
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

seed = 100

In [3]:
wine = ac.get_wine()

In [4]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [5]:
wine = ex.reajust_range(wine)
# wine = wine.drop(columns='residual_sugar')

In [6]:
train, val, test = prep.train_val_test(wine, strat='quality', seed=seed)
train, val, test = prep.scale(wine, train=train, val=val, test=test, scaled_cols=(wine.select_dtypes(float).columns))
train, val, test = prep.dummies(train, val, test, drop_first=['wine_type'])

In [7]:
train.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_type_white
324,0.739726,0.573427,0.253165,0.514851,0.563636,0.152866,0.173913,0.928986,0.4,0.61039,0.137931,6,0
5697,0.273973,0.181818,0.43038,0.247525,0.336364,0.369427,0.529644,0.3796,0.457143,0.207792,0.448276,5,1
2405,0.246575,0.321678,0.329114,0.623762,0.336364,0.713376,0.786561,0.670755,0.409524,0.246753,0.206897,5,1
3107,0.30137,0.181818,0.620253,0.688119,0.463636,0.624204,0.648221,0.774048,0.771429,0.285714,0.241379,6,1
5166,0.315068,0.251748,0.341772,0.178218,0.2,0.242038,0.29249,0.244028,0.361905,0.220779,0.534483,6,1


In [8]:
x_train, y_train = prep.split_xy(train, 'quality')
x_val, y_val = prep.split_xy(val, 'quality')

## BASELINE

In [9]:
results = ev.baseline(train.quality)

In [10]:
results.head()

Unnamed: 0,quality,base_median,base_mean
324,6,6.0,5.83048
5697,5,6.0,5.83048
2405,5,6.0,5.83048
3107,6,6.0,5.83048
5166,6,6.0,5.83048


In [11]:
SSE, MSE, RMSE = ev.eval_model(results.quality, results.base_mean.round())
SSE, MSE, RMSE

(3193.0, 0.7591535901093676, 0.8712942041063785)

In [12]:
SSE, MSE, RMSE = ev.eval_model(results.quality, results.base_median)
SSE, MSE, RMSE

(3193.0, 0.7591535901093676, 0.8712942041063785)

## POLYNOMIAL FEATURES

In [13]:
poly = PolynomialFeatures()
x_train_s = poly.fit_transform(x_train)
x_val_s = poly.transform(x_val)

## Linear Regression

In [14]:
lm = LinearRegression()  # Create model object
ev.train_model_gen2(lm, x_train, y_train, x_val, y_val)  # Check train and validate

The train RMSE is 0.7795532573385248.
The validate RMSE is 0.7684107299774686.


In [15]:
lm = LinearRegression()  # Create model object
ev.train_model_gen2(lm, x_train_s, y_train, x_val_s, y_val)  # Check train and validate

The train RMSE is 0.7426930472507584.
The validate RMSE is 0.7479160837744295.


## XGBRegressor

In [16]:
xgbr = XGBRegressor(max_depth=8, n_estimators=8, random_state=100)  # Create best model

ev.train_model_gen2(xgbr, x_train, y_train, x_val, y_val) # Evaluate

The train RMSE is 0.6699869765449888.
The validate RMSE is 0.7988615651174962.


In [17]:
xgbr = XGBRegressor(max_depth=8, n_estimators=8, random_state=100)  # Create best model

ev.train_model_gen2(xgbr, x_train_s, y_train, x_val_s, y_val) # Evaluate

The train RMSE is 0.6251961175897992.
The validate RMSE is 0.7995559255833236.


## Random Forest Regressor

In [18]:
# Create best model
rf = RandomForestRegressor(max_depth=6, min_samples_split=8, random_state=100)

ev.train_model_gen2(rf, x_train, y_train, x_val, y_val)  # Evaluate model

The train RMSE is 0.6810735281429181.
The validate RMSE is 0.7351942218490143.


In [19]:
# Create best model
rf = RandomForestRegressor(max_depth=6, min_samples_split=8, random_state=100)

ev.train_model_gen2(rf, x_train_s, y_train, x_val_s, y_val)  # Evaluate model

The train RMSE is 0.6715819749108831.
The validate RMSE is 0.732168718388364.


## LASSO LARS

In [20]:
ll = LassoLars(alpha=0)
ev.train_model(ll, x_train, y_train, x_val, y_val)

The train RMSE is 0.716771885110661.
The validate RMSE is 0.7163999296023469.


## KNN

In [21]:
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')
ev.train_model(knn, x_train, y_train, x_val, y_val)

The train RMSE is 0.7062656880124635.
The validate RMSE is 0.784137975129349.


In [27]:
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')
ev.train_model(knn, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 0.7094565372322593.
The validate RMSE is 0.7791682864734351.


## Random Forest Classifier

In [26]:
rf1 = RandomForestClassifier(max_depth=5, random_state=seed)
ev.train_model(rf1, x_train, y_train, x_val, y_val)

The train RMSE is 0.7635291087787347.
The validate RMSE is 0.7684107299774686.


In [25]:
rf1 = RandomForestClassifier(max_depth=5, random_state=seed)
ev.train_model(rf1, x_train_s, y_train, x_val_s, y_val)

The train RMSE is 0.7328641563154681.
The validate RMSE is 0.7508781392624222.


## Decision Tree

In [23]:
clf = DecisionTreeClassifier(max_depth=3, random_state=seed)
ev.train_model(clf, x_train, y_train, x_val, y_val)

The train RMSE is 0.7977917692512981.
The validate RMSE is 0.8016353983424719.


## Logistic Regression

In [24]:
logreg = LogisticRegression()
ev.train_model(logreg, x_train, y_train, x_val, y_val)

The train RMSE is 0.7830527820868777.
The validate RMSE is 0.7974710304425588.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RESULTS WITH SUGAR: 

lm = 76, 74
xgb = 79, 79
random forest regressor = 73, 73
KNN = 78.5
rf1 = 76
decision tree = 80
logreg= 79

WITHOUT SUGAR

lm = 78, 76
xgb = 80, 83
random forest regressor = 73, 73
KNN = 78.5
rf1 = 76
decision tree = 80
logreg= 79

Without sugar is slightly worse