In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import evaluate as ev

# Cross Validation Practice

### Lesson Walkthrough

In [2]:
# Load the raw car listings; the path is relative to the working directory.
cars_df = pd.read_csv(filepath_or_buffer='cars.csv')

In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297899 entries, 0 to 297898
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Id       297899 non-null  int64 
 1   Price    297899 non-null  int64 
 2   Year     297899 non-null  int64 
 3   Mileage  297899 non-null  int64 
 4   City     297899 non-null  object
 5   State    297899 non-null  object
 6   Vin      297899 non-null  object
 7   Make     297899 non-null  object
 8   Model    297899 non-null  object
dtypes: int64(4), object(5)
memory usage: 20.5+ MB


In [4]:
# Preview the first rows of the raw listings as loaded from the CSV.
cars_df.head()

Unnamed: 0,Id,Price,Year,Mileage,City,State,Vin,Make,Model
0,1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
1,2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
2,3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
3,4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
4,5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [7]:
# Normalise column labels to lower case so later cells can use attribute
# access (cars_df.price, cars_df.gt_avg, ...).
cars_df = cars_df.rename(columns=str.lower)

# Promote the listing id to the index. The guard keeps this cell re-runnable:
# once 'id' is the index it is no longer a column, so an unconditional
# set_index('id') would raise KeyError on a second execution.
if cars_df.index.name != 'id':
    cars_df = cars_df.set_index('id')

print('{} rows x {} cols'.format(*cars_df.shape))
cars_df.head()

297899 rows x 8 cols


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [9]:
# Mean sale price of each (year, make, model) group, broadcast back onto every row.
group_mean_price = cars_df.groupby(['year', 'make', 'model'])['price'].transform('mean')
cars_df['avg_saleprice'] = group_mean_price

# Binary target: 1 when a listing is priced above its group mean, otherwise 0.
cars_df['gt_avg'] = cars_df['price'].gt(cars_df['avg_saleprice']).astype(int)

In [10]:
# Check the two new columns (avg_saleprice, gt_avg) on the first rows.
cars_df.head()

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0


In [11]:
# price and avg_saleprice are what gt_avg was computed from, so they must not
# remain among the features; city and vin are dropped as well.
unused_cols = ['price', 'city', 'vin', 'avg_saleprice']
cars_df = cars_df.drop(columns=unused_cols)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Replace each listed column with integer codes, one independent encoder per column.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent. Consider switching.
categorical_cols = ['state', 'make', 'model', 'year']
for col in categorical_cols:
    le = LabelEncoder()
    cars_df[col] = le.fit_transform(cars_df[col])

In [13]:
# Confirm that every remaining column is now numeric.
cars_df.head()

Unnamed: 0_level_0,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,18,18681,28,7,523,0
2,18,27592,19,7,525,0
3,18,13650,32,7,526,0
4,18,25195,22,7,525,0
5,18,22800,38,7,523,0


In [15]:
# Separate the feature matrix from the binary gt_avg target.
X = cars_df.drop(columns='gt_avg')
y = cars_df['gt_avg']

# Hold out 20% of the rows as the test set. random_state pins the shuffle so
# that a fresh Restart & Run All reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42)

In [16]:
# Carve a validation set (about one third) out of the training rows.
# random_state pins the shuffle so the split is reproducible across kernel restarts.
# NOTE(review): this cell rebinds X_train / y_train from themselves, so running
# it twice shrinks the training set again; run it exactly once per kernel.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=.3333, random_state=42)

### Cross Validation Portion

In [17]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

# Shallow decision tree as a baseline model.
tree = DecisionTreeClassifier(max_depth=2)

# 3-fold cross-validation on the training rows; no `scoring` is given, so the
# estimator's default scorer is used.
cross_val_score(estimator=tree, X=X_train, y=y_train, cv=3)

array([0.59281007, 0.59227371, 0.59593671])

In [18]:
# Same 3-fold cross-validation, scored by precision on the positive class.
cross_val_score(estimator=tree, X=X_train, y=y_train, scoring='precision', cv=3)

array([0.58200519, 0.58216252, 0.58581711])

In [19]:
# Same 3-fold cross-validation, scored by recall on the positive class.
cross_val_score(estimator=tree, X=X_train, y=y_train, scoring='recall', cv=3)

array([0.46112882, 0.45636826, 0.46627128])

In [24]:
# Same 3-fold cross-validation, scored by F1 on the positive class.
cross_val_score(estimator=tree, X=X_train, y=y_train, scoring='f1', cv=3)

array([0.51456354, 0.51164684, 0.51925237])

In [22]:
# cross_val_score only fits clones of the estimator, so fit `tree` itself
# on the full training split before asking it for predictions.
tree.fit(X=X_train, y=y_train)

# Print the project's metric report for the training data.
ev.run_metrics(X_train, y_train, tree, data_set='train')

train data set accuracy score: 59.37%
train data set precision score 58.34%
train data set recall score: 46.12%

-------------------------------
classification report
              precision    recall  f1-score   support

           0       0.60      0.71      0.65     84527
           1       0.58      0.46      0.52     74360

    accuracy                           0.59    158887
   macro avg       0.59      0.59      0.58    158887
weighted avg       0.59      0.59      0.59    158887

-------------------------------

confusion matrix
[[60037 24490]
 [40064 34296]]

train data set model metrics
---------------------------------
True positive rate for the model is 46.12%
False positive rate for the model is  28.97%
True negative rate for the model is 71.03%
False negative rate for the model is 53.88%



In [25]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 3 depths x 3 feature limits = 9 combinations.
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 1, 3]}

# max_features of 1 or 3 makes each split consider a random subset of the
# columns, so an unseeded tree gives a different ranking on every run.
# Fixing random_state makes the search results reproducible.
# Note: this rebinds `tree`, replacing the estimator fitted earlier.
tree = DecisionTreeClassifier(random_state=42)

# Exhaustive search, each candidate scored with 3-fold cross-validation.
grid = GridSearchCV(tree, params, cv=3)

grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4], 'max_features': [None, 1, 3]})

In [27]:
# cv_results_ is a dict keyed by metric name (fit/score timings, parameter
# values, per-split test scores, mean/std test score, rank).
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [30]:
# Tabulate the search results: one row per parameter combination tried.
results_df = pd.DataFrame(results)

In [32]:
# Best candidates first: order by mean cross-validated test score, descending.
results_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,0.092927,0.003818,0.005813,0.000441,4,,"{'max_depth': 4, 'max_features': None}",0.642203,0.642007,0.642933,0.642381,0.000398,1
3,0.069761,0.000601,0.004988,5.5e-05,3,,"{'max_depth': 3, 'max_features': None}",0.636407,0.632378,0.630622,0.633135,0.002422,2
8,0.057409,0.005992,0.005355,0.000212,4,3.0,"{'max_depth': 4, 'max_features': 3}",0.612182,0.643046,0.624523,0.626584,0.012684,3
0,0.062513,0.014708,0.004895,0.000407,2,,"{'max_depth': 2, 'max_features': None}",0.59281,0.592274,0.595937,0.593673,0.001615,4
5,0.041497,0.005167,0.004649,0.000301,3,3.0,"{'max_depth': 3, 'max_features': 3}",0.636407,0.540954,0.599807,0.592389,0.03932,5
1,0.018638,0.00362,0.004258,0.000348,2,1.0,"{'max_depth': 2, 'max_features': 1}",0.611332,0.537593,0.595937,0.581621,0.03176,6
2,0.032061,0.005661,0.004251,0.000223,2,3.0,"{'max_depth': 2, 'max_features': 3}",0.59281,0.540954,0.596654,0.576806,0.0254,7
7,0.028181,0.002391,0.004897,0.000201,4,1.0,"{'max_depth': 4, 'max_features': 1}",0.538508,0.552925,0.601507,0.564313,0.02695,8
4,0.021342,0.003808,0.004514,0.000305,3,1.0,"{'max_depth': 3, 'max_features': 1}",0.558428,0.531721,0.545391,0.54518,0.010904,9


### Now Let's Do Some Other Models

In [33]:
# K-nearest-neighbours classifier, constructed with its default hyperparameters.
knn = KNeighborsClassifier()

In [36]:
# 5-fold cross-validation of the default KNN model on the training rows.
cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5)

array([0.55846812, 0.55466046, 0.55008339, 0.55096453, 0.5529471 ])

In [None]:
# Hyperparameters that KNeighborsClassifier actually exposes. The previous grid
# reused the decision-tree keys (max_depth / max_features), which KNN does not
# have, so GridSearchCV.fit raised ValueError.
# Note: `params` and `grid` are rebound here, replacing the tree search objects.
params = {'n_neighbors': [5, 11, 21],
          'weights': ['uniform', 'distance']}

# Exhaustive search, each candidate scored with 5-fold cross-validation.
grid = GridSearchCV(knn, params, cv=5)

# NOTE(review): the features are unscaled, so large-valued columns such as
# mileage are likely to dominate the distance metric; consider scaling first.
grid.fit(X_train, y_train)