# Central NJ Data Science Meetup
Challenge 1 - Evaluate the Cars!

In [231]:
import pandas as pd
import numpy as np

# Problem statement
We need to predict a car's condition (unacceptable, acceptable, good, very good) based on six variables (buying price, maintenance price, number of doors, capacity, size of luggage boot and safety rating).

Reading the CSVs

In [232]:
df_sample_sub = pd.read_csv('cars-sample-submission.csv')

In [233]:
df_sample_sub.head()

Unnamed: 0,car.id,class
0,10,acc
1,13,unacc
2,24,good
3,26,vgood
4,27,acc


In [234]:
len(df_sample_sub)

434

In [235]:
df_final_pred = pd.read_csv('cars-final-prediction.csv')

In [236]:
df_final_pred.head()

Unnamed: 0,car.id,buying,maint,doors,persons,lug_boot,safety
0,1103,med,med,2,more,med,med
1,1340,low,vhigh,3,4,big,med
2,783,high,low,2,more,big,high
3,1694,low,low,4,more,small,med
4,312,vhigh,med,5more,4,med,high


In [237]:
df_training = pd.read_csv('cars-train.csv')

In [238]:
df_training.head()

Unnamed: 0,car.id,buying,maint,doors,persons,lug_boot,safety,class
0,1,vhigh,vhigh,2,2,small,low,unacc
1,3,vhigh,vhigh,2,2,small,high,unacc
2,4,vhigh,vhigh,2,2,med,low,unacc
3,5,vhigh,vhigh,2,2,med,med,unacc
4,7,vhigh,vhigh,2,2,big,low,unacc


In [239]:
df_test = pd.read_csv('cars-test.csv')

In [240]:
df_test.head()

Unnamed: 0,car.id,buying,maint,doors,persons,lug_boot,safety,class
0,891,med,vhigh,2,more,big,high,acc
1,861,high,low,5more,more,med,high,acc
2,1523,low,med,2,4,small,med,acc
3,1038,med,high,4,4,small,high,acc
4,932,med,vhigh,4,4,med,med,acc


## Checking types
Checking variable types in training set. They're all strings except for car.id.

In [241]:
#Checking variable types in training set. All strings except for car id
df_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970 entries, 0 to 969
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   car.id    970 non-null    int64 
 1   buying    970 non-null    object
 2   maint     970 non-null    object
 3   doors     970 non-null    object
 4   persons   970 non-null    object
 5   lug_boot  970 non-null    object
 6   safety    970 non-null    object
 7   class     970 non-null    object
dtypes: int64(1), object(7)
memory usage: 60.8+ KB


## Making car id the index so that all columns are strings.

In [242]:
list_df_names = [df_training, df_test, df_final_pred]

In [243]:
for i in range(0, len(list_df_names)):
    list_df_names[i].set_index("car.id", inplace = True)

In [244]:
df_training.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,vhigh,vhigh,2,2,small,low,unacc
3,vhigh,vhigh,2,2,small,high,unacc
4,vhigh,vhigh,2,2,med,low,unacc
5,vhigh,vhigh,2,2,med,med,unacc
7,vhigh,vhigh,2,2,big,low,unacc


In [245]:
df_final_pred.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1103,med,med,2,more,med,med
1340,low,vhigh,3,4,big,med
783,high,low,2,more,big,high
1694,low,low,4,more,small,med
312,vhigh,med,5more,4,med,high


## Checking number of values for each variable with nunique()

In [246]:
list_of_cols = list(df_training.columns) 
list_of_cols

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [247]:
for i in range(0, len(list_of_cols)):
    col_name = list_of_cols[i]
    vc = df_training[col_name].nunique()
    print(f"{col_name} has {vc} different values.")

buying has 4 different values.
maint has 4 different values.
doors has 4 different values.
persons has 3 different values.
lug_boot has 3 different values.
safety has 3 different values.
class has 4 different values.


## For loop to itemize the variables and get value counts

In [248]:
for c in df_training.columns:
    print(f'-----{c}------')
    print(df_training[c].value_counts())

-----buying------
med      255
vhigh    241
high     238
low      236
Name: buying, dtype: int64
-----maint------
low      253
med      246
high     239
vhigh    232
Name: maint, dtype: int64
-----doors------
4        257
2        246
3        234
5more    233
Name: doors, dtype: int64
-----persons------
2       333
4       319
more    318
Name: persons, dtype: int64
-----lug_boot------
med      333
big      325
small    312
Name: lug_boot, dtype: int64
-----safety------
low     325
high    323
med     322
Name: safety, dtype: int64
-----class------
unacc    680
acc      216
good      38
vgood     36
Name: class, dtype: int64


## Turning buying, maint, lug_boot, safety categorical variables into integers

In [249]:
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

In [250]:
cols_to_encode = ['buying', 'maint', 'lug_boot', 'safety']

In [251]:
encoding_dict_lists = [{'col':'buying', 'mapping':[('low', 1), ('med', 2), ('high', 3), ('vhigh', 4)]},\
                       {'col':'maint', 'mapping':[('low', 1), ('med', 2), ('high', 3), ('vhigh', 4)]},\
                       {'col':'lug_boot', 'mapping': [('small', 1), ('med', 2), ('big', 3)]},\
                       {'col':'safety', 'mapping': [('low', 1), ('med', 2), ('high', 3)]}]

In [252]:
ce_ord = ce.OrdinalEncoder(encoding_dict_lists)
df_training[cols_to_encode] = ce_ord.fit_transform(df_training[cols_to_encode])

## Doors and persons are still strings. Replacing 5more and more strings with the string '5'

In [253]:
df_training['doors'].value_counts()

4        257
2        246
3        234
5more    233
Name: doors, dtype: int64

In [254]:
df_training['persons'].value_counts()

2       333
4       319
more    318
Name: persons, dtype: int64

In [255]:
df_training = df_training.replace(['5more', 'more'], '5')

In [256]:
df_training['doors'].value_counts()

4    257
2    246
3    234
5    233
Name: doors, dtype: int64

In [257]:
df_training['persons'].value_counts()

2    333
4    319
5    318
Name: persons, dtype: int64

## Turning number strings into integers

In [258]:
df_training['doors'] = df_training['doors'].astype(int)
df_training['persons'] = df_training['persons'].astype(int)

In [259]:
df_training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 970 entries, 1 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    970 non-null    int32 
 1   maint     970 non-null    int32 
 2   doors     970 non-null    int32 
 3   persons   970 non-null    int32 
 4   lug_boot  970 non-null    int32 
 5   safety    970 non-null    int32 
 6   class     970 non-null    object
dtypes: int32(6), object(1)
memory usage: 37.9+ KB


## Applying the above steps to test set and final predictions

In [260]:
df_test[cols_to_encode] = ce_ord.fit_transform(df_test[cols_to_encode])
df_final_pred[cols_to_encode] = ce_ord.fit_transform(df_final_pred[cols_to_encode])

In [261]:
df_test.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
891,3,1,2,more,3,2,acc
861,2,4,5more,more,2,2,acc
1523,4,3,2,4,1,3,acc
1038,3,2,4,4,1,2,acc
932,3,1,4,4,2,3,acc


In [262]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 891 to 1125
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    324 non-null    int32 
 1   maint     324 non-null    int32 
 2   doors     324 non-null    object
 3   persons   324 non-null    object
 4   lug_boot  324 non-null    int32 
 5   safety    324 non-null    int32 
 6   class     324 non-null    object
dtypes: int32(4), object(3)
memory usage: 15.2+ KB


In [263]:
df_test = df_test.replace(['5more', 'more'], '5')
df_final_pred = df_final_pred.replace(['5more', 'more'], '5')

In [264]:
df_test['doors'] = df_test['doors'].astype(int)
df_test['persons'] = df_test['persons'].astype(int)
df_final_pred['doors'] = df_final_pred['doors'].astype(int)
df_final_pred['persons'] = df_final_pred['persons'].astype(int)

In [265]:
df_test.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
891,3,1,2,5,3,2,acc
861,2,4,5,5,2,2,acc
1523,4,3,2,4,1,3,acc
1038,3,2,4,4,1,2,acc
932,3,1,4,4,2,3,acc


In [266]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 891 to 1125
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    324 non-null    int32 
 1   maint     324 non-null    int32 
 2   doors     324 non-null    int32 
 3   persons   324 non-null    int32 
 4   lug_boot  324 non-null    int32 
 5   safety    324 non-null    int32 
 6   class     324 non-null    object
dtypes: int32(6), object(1)
memory usage: 12.7+ KB


In [267]:
df_final_pred.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1103,3,3,2,5,2,3
1340,4,1,3,4,3,3
783,2,4,2,5,3,2
1694,4,4,4,5,1,3
312,1,3,5,4,2,2


In [268]:
df_final_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 434 entries, 1103 to 1674
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   buying    434 non-null    int32
 1   maint     434 non-null    int32
 2   doors     434 non-null    int32
 3   persons   434 non-null    int32
 4   lug_boot  434 non-null    int32
 5   safety    434 non-null    int32
dtypes: int32(6)
memory usage: 13.6 KB


## Separating features from target variable in training and test sets

In [269]:
X_cols = list(df_training.columns[:-1])

In [270]:
X_cols

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

In [271]:
X_train = df_training[X_cols]
X_train.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,2,2,1,1
3,1,1,2,2,1,2
4,1,1,2,2,2,1
5,1,1,2,2,2,3
7,1,1,2,2,3,1


In [272]:
y_train = df_training['class']
y_train.head()

car.id
1    unacc
3    unacc
4    unacc
5    unacc
7    unacc
Name: class, dtype: object

In [273]:
X_test = df_test[X_cols]
y_test = df_test['class']

# Modeling

## First two submissions were Random Forest

In [274]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score,log_loss

In [275]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 42)
base_model = clf.fit(X_train, y_train)
y_pred = base_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {acc:.5f}")

Accuracy score: 0.97840


## First default RF, then hyperparameter tuning
Accuracy score 0.97840 for both default and tuned Random Forest

In [223]:
#Tuning random forest. Got same accuracy score
param_grid = {'n_estimators' : [25, 50, 75, 100], 'max_depth' : [3, 6, 9], 'max_features': ['auto', 'sqrt'],\
             'min_samples_split': [2, 4, 6], 'min_samples_leaf':[1, 2, 3, 4, 5]}
#Grid of 360 params

In [224]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
rand_grid = RandomizedSearchCV(clf, param_grid, n_iter = 120, cv = 3, n_jobs = 2, verbose = 3)
rand_grid.fit(X_train, y_train)
print(f"Best Accuracy: {rand_grid.best_score_:.5f}")
print(f"Best Params: {rand_grid.best_params_}")

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  32 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done 224 tasks      | elapsed:   16.7s
[Parallel(n_jobs=2)]: Done 360 out of 360 | elapsed:   29.2s finished


Best Accuracy: 0.84638
Best Params: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}


In [276]:
best_rf = RandomForestClassifier(n_estimators = 100, min_samples_split = 6, min_samples_leaf = 3,\
                                 max_features = 'sqrt', max_depth = 9, random_state = 42)
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
best_rf_acc = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {acc:.5f}")

Accuracy score: 0.97840


## First two submissions

In [277]:
final_pred = base_model.predict(df_final_pred)

In [278]:
df_final_pred['class'] = final_pred
df_final_pred.reset_index(inplace=True)
submission_df = df_final_pred[['car.id', 'class']]

In [279]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   car.id  434 non-null    int64 
 1   class   434 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [229]:
submission_1 = submission_df.to_csv('submission_1.csv', index=False)

In [280]:
#Resetting final prediction dataframe
df_final_pred.head()

Unnamed: 0,car.id,buying,maint,doors,persons,lug_boot,safety,class
0,1103,3,3,2,5,2,3,acc
1,1340,4,1,3,4,3,3,acc
2,783,2,4,2,5,3,2,acc
3,1694,4,4,4,5,1,3,acc
4,312,1,3,5,4,2,2,acc


In [281]:
df_final_pred.set_index("car.id", inplace = True)
df_final_pred = df_final_pred[X_cols]

In [282]:
df_final_pred.head()

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety
car.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1103,3,3,2,5,2,3
1340,4,1,3,4,3,3
783,2,4,2,5,3,2
1694,4,4,4,5,1,3
312,1,3,5,4,2,2


In [283]:
bestrf_final_pred = best_rf.predict(df_final_pred)
df_final_pred['class'] = bestrf_final_pred
df_final_pred.reset_index(inplace=True)
submission_df_2 = df_final_pred[['car.id', 'class']]

In [284]:
submission_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   car.id  434 non-null    int64 
 1   class   434 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [285]:
#submission_2 = submission_df_2.to_csv('batista_submission_2.csv', index=False)

# XGBoost

## Default
First one with 0.99339 public score

In [286]:
import xgboost as xgb
xgb_class = xgb.XGBClassifier(random_state = 42, objective = 'reg:logistic')
xgb_class.fit(X_train, y_train)
y_pred = xgb_class.predict(X_test)
print("Accuracy on training set:", xgb_class.score(X_train, y_train))
print("Accuracy on test set:", xgb_class.score(X_test, y_test))
print("Classification report:")
print(classification_report(y_test, y_pred))



Accuracy on training set: 1.0
Accuracy on test set: 0.9969135802469136
Classification report:
              precision    recall  f1-score   support

         acc       0.99      1.00      0.99        72
        good       1.00      0.92      0.96        13
       unacc       1.00      1.00      1.00       227
       vgood       1.00      1.00      1.00        12

    accuracy                           1.00       324
   macro avg       1.00      0.98      0.99       324
weighted avg       1.00      1.00      1.00       324



In [288]:
df_final_pred.set_index("car.id", inplace = True)
df_final_pred = df_final_pred[X_cols]

In [289]:
xgb_final_pred = xgb_class.predict(df_final_pred)
df_final_pred['class'] = xgb_final_pred
df_final_pred.reset_index(inplace=True)
submission_df_3 = df_final_pred[['car.id', 'class']]

In [290]:
submission_df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   car.id  434 non-null    int64 
 1   class   434 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.9+ KB


In [52]:
#submission_3 = submission_df_3.to_csv('batista_submission_3.csv', index=False)

## XG Boost hyperparameter tuning
This was Submission 4. Performed worse than tuned Random Forest

In [55]:
xgb_params = {'learning_rate' : [0.01, 0.1, 0.5], 'max_depth': [6, 7, 8, 9], 'colsample_bytree': [0.5, 1],\
             'n_estimators': [50, 75, 100, 125, 150, 175, 200]}

In [56]:
from sklearn.model_selection import GridSearchCV
xgb_randcv = GridSearchCV(xgb_class, xgb_params, cv = 3, n_jobs = 2, verbose = 3)
xgb_randcv.fit(X_train, y_train)
print(f"Best Accuracy: {xgb_randcv.best_score_:.5f}")
print(f"Best Params: {xgb_randcv.best_params_}")

Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    6.4s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   35.1s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done 504 out of 504 | elapsed:  2.3min finished


Best Accuracy: 0.84947
Best Params: {'colsample_bytree': 0.5, 'learning_rate': 0.5, 'max_depth': 6, 'n_estimators': 175}


In [291]:
best_xgb = xgb.XGBClassifier(n_estimators = 175, max_depth = 6, learning_rate = 0.5, colsample_bytree = 0.5)
best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)
print("Accuracy on training set:", best_xgb.score(X_train, y_train))
print("Accuracy on test set:", best_xgb.score(X_test, y_test))
print("Classification report:")
print(classification_report(y_test, y_pred))

Accuracy on training set: 1.0
Accuracy on test set: 0.9938271604938271
Classification report:
              precision    recall  f1-score   support

         acc       0.99      0.99      0.99        72
        good       1.00      0.92      0.96        13
       unacc       1.00      1.00      1.00       227
       vgood       1.00      1.00      1.00        12

    accuracy                           0.99       324
   macro avg       1.00      0.98      0.99       324
weighted avg       0.99      0.99      0.99       324



In [292]:
df_final_pred.set_index("car.id", inplace = True)
df_final_pred = df_final_pred[X_cols]

In [293]:
best_xgb_final_pred = best_xgb.predict(df_final_pred)
df_final_pred['class'] = best_xgb_final_pred
df_final_pred.reset_index(inplace=True)
submission_df_4 = df_final_pred[['car.id', 'class']]

In [60]:
#submission_4 = submission_df_4.to_csv('batista_submission_4.csv', index=False)

## Tuning n_estimators
Saw that 175 was best n_estimator param. Tuned only the estimators (from 175 to 200).

In [294]:
estimator_range = np.arange(175, 201)

In [295]:
estimator_range

array([175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200])

In [296]:
xgb_params = {'learning_rate' : [0.5], 'max_depth': [6], 'colsample_bytree': [0.5],\
             'n_estimators': estimator_range}

## Ran default XG Boost again
Not sure why

In [297]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
xgb_class = xgb.XGBClassifier(random_state = 42, objective = 'reg:logistic')
xgb_class.fit(X_train, y_train)
y_pred = xgb_class.predict(X_test)
print("Accuracy on training set:", xgb_class.score(X_train, y_train))
print("Accuracy on test set:", xgb_class.score(X_test, y_test))
print("Classification report:")
print(classification_report(y_test, y_pred))

Accuracy on training set: 1.0
Accuracy on test set: 0.9969135802469136
Classification report:
              precision    recall  f1-score   support

         acc       0.99      1.00      0.99        72
        good       1.00      0.92      0.96        13
       unacc       1.00      1.00      1.00       227
       vgood       1.00      1.00      1.00        12

    accuracy                           1.00       324
   macro avg       1.00      0.98      0.99       324
weighted avg       1.00      1.00      1.00       324



In [298]:
tuned_cv = GridSearchCV(xgb_class, xgb_params, cv = 3, n_jobs = 2, verbose = 3)
tuned_cv.fit(X_train, y_train)
print(f"Best Accuracy: {tuned_cv.best_score_:.5f}")
print(f"Best Params: {tuned_cv.best_params_}")

Fitting 3 folds for each of 26 candidates, totalling 78 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    6.8s
[Parallel(n_jobs=2)]: Done  78 out of  78 | elapsed:   21.6s finished


Best Accuracy: 0.85050
Best Params: {'colsample_bytree': 0.5, 'learning_rate': 0.5, 'max_depth': 6, 'n_estimators': 190}


## Best n_estimators turned out to be 190
Accuracy on test set the same<br>
'acc' precision improved from .99 to 1.00<br>
'good' recall improved from .92 to 1.00<br>
'good' f1 score improved from .96 to 1.00<br>
'acc' recall dropped from 1.00 to .99<br>

In [299]:
tuned_xgb = xgb.XGBClassifier(n_estimators = 190, max_depth = 6, learning_rate = 0.5, colsample_bytree = 0.5)
tuned_xgb.fit(X_train, y_train)
y_pred = tuned_xgb.predict(X_test)
print("Accuracy on training set:", tuned_xgb.score(X_train, y_train))
print("Accuracy on test set:", tuned_xgb.score(X_test, y_test))
print("Classification report:")
print(classification_report(y_test, y_pred))

Accuracy on training set: 1.0
Accuracy on test set: 0.9969135802469136
Classification report:
              precision    recall  f1-score   support

         acc       1.00      0.99      0.99        72
        good       1.00      1.00      1.00        13
       unacc       1.00      1.00      1.00       227
       vgood       1.00      1.00      1.00        12

    accuracy                           1.00       324
   macro avg       1.00      1.00      1.00       324
weighted avg       1.00      1.00      1.00       324



In [300]:
df_final_pred.set_index("car.id", inplace = True)
df_final_pred = df_final_pred[X_cols]

## Final submission

In [301]:
tuned_xgb_final_pred = xgb_class.predict(df_final_pred)
df_final_pred['class'] = tuned_xgb_final_pred
df_final_pred.reset_index(inplace=True)
submission_df_5 = df_final_pred[['car.id', 'class']]

In [59]:
#submission_5 = submission_df_5.to_csv('batista_submission_5.csv', index=False)