In [14]:
# To get rid of those blocks of red warnings
import warnings
warnings.filterwarnings("ignore")

# Standard Imports
import numpy as np
from scipy import stats
import pandas as pd
import os
from scipy.stats import spearmanr
from sklearn import metrics
from random import randint
from typing import Dict, List, Optional, Union, cast
from time import sleep
import pyspark
from pydataset import data
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Vis Imports
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Modeling Imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import sklearn.preprocessing
import statsmodels.api as sm
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# NLP Imports
import unicodedata
import re
import json
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Custom Module Imports
import env

## Use the cross validation techniques discussed in the lesson to figure out what kind of model works best with the cars dataset used in the lesson.

In [3]:
df = data('mpg')

In [4]:
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [6]:
df.trans = np.where(df.trans.str.startswith('auto'), 'auto', 'manual')

In [7]:
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


In [9]:
df['mpg_avg'] = (df.cty + df.hwy) / 2

In [11]:
X = df[['displ', 'year', 'cyl', 'cty', 'hwy', 'mpg_avg']]
y = df.trans

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=123, stratify=y)


In [13]:
tree = DecisionTreeClassifier()

In [88]:
params = {'max_depth': range(1,16),
          'max_features': [1, 2, 3, 4],
         'min_samples_leaf': range(1,16),
         'criterion': ['gini', 'entropy']}

grid = GridSearchCV(tree, params, cv=5)

grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(max_depth=3),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 16),
                         'max_features': [1, 2, 3, 4],
                         'min_samples_leaf': range(1, 16)})

In [89]:
results = grid.cv_results_


In [90]:
test_scores = results['mean_test_score']
test_scores

array([0.68435277, 0.69487909, 0.66273115, ..., 0.6200569 , 0.61465149,
       0.69445235])

In [91]:
params = results['params']


In [92]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score', ascending=False)

Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,score
504,gini,9,2,10,0.737696
8,gini,1,1,9,0.727027
132,gini,3,1,13,0.722048
933,entropy,1,3,4,0.721906
189,gini,4,1,10,0.721764
...,...,...,...,...,...
1186,entropy,5,4,2,0.566145
1340,entropy,8,2,6,0.566145
1006,entropy,2,4,2,0.561024
1096,entropy,4,2,2,0.561024


In [43]:
knn = KNeighborsClassifier()

In [44]:
list(range(1,16))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [56]:
params = {'n_neighbors': range(1,16),
         'weights': ['uniform', 'distance'],
         'leaf_size': range(1,51)}

grid = GridSearchCV(knn, params, cv=5)

grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': range(1, 51), 'n_neighbors': range(1, 16),
                         'weights': ['uniform', 'distance']})

In [57]:
results = grid.cv_results_
test_scores = results['mean_test_score']
params = results['params']

In [58]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score', ascending=False)

Unnamed: 0,leaf_size,n_neighbors,weights,score
902,31,2,uniform,0.695021
662,23,2,uniform,0.695021
722,25,2,uniform,0.695021
1082,37,2,uniform,0.695021
632,22,2,uniform,0.695021
...,...,...,...,...
792,27,7,uniform,0.598151
252,9,7,uniform,0.598151
1182,40,7,uniform,0.598151
222,8,7,uniform,0.598151


In [93]:
params = {'max_depth': range(1,16),
          'max_features': [1, 2, 3, 4],
         'min_samples_leaf': range(1,16),
         'criterion': ['gini', 'entropy']}

grid = GridSearchCV(tree, params, cv=5)

grid.fit(X_train, y_train)

results = grid.cv_results_


In [94]:
dt_model = grid.best_estimator_

In [95]:
dt_model.score(X_test, y_test)

0.6595744680851063