In [1]:
# Packages for the data
import pandas as pd
import numpy as np

# Package for visualization
import seaborn as sns

# Packages for machine learning classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

In [2]:
# Load the training and test sets from their CSV files
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('training.csv', 'test.csv'))

# Stack train and test rows into a single frame with a fresh 0..n-1 index,
# so both sets can be looked at together
all_data = pd.concat([train_df, test_df], axis=0, sort=True, ignore_index=True)

all_data.head()

Unnamed: 0,AUCGUART,Auction,BYRNO,Color,IsBadBuy,IsOnlineSale,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,...,Trim,VNST,VNZIP1,VehBCost,VehOdo,VehYear,VehicleAge,WarrantyCost,WheelType,WheelTypeID
0,,ADESA,21973,RED,0.0,0,8155.0,9829.0,11636.0,13600.0,...,i,FL,33619,7100.0,89046,2006,3,1113,Alloy,1.0
1,,ADESA,19638,WHITE,0.0,0,6854.0,8383.0,10897.0,12572.0,...,ST,FL,33619,7600.0,93593,2004,5,1053,Alloy,1.0
2,,ADESA,19638,MAROON,0.0,0,3202.0,4760.0,6943.0,8457.0,...,SXT,FL,33619,4900.0,73807,2005,4,1389,Covers,2.0
3,,ADESA,19638,SILVER,0.0,0,1893.0,2675.0,4658.0,5690.0,...,SXT,FL,33619,4100.0,65617,2004,5,630,Alloy,1.0
4,,ADESA,19638,SILVER,0.0,0,3913.0,5054.0,7723.0,8707.0,...,ZX3,FL,33619,4000.0,69367,2005,4,1020,Covers,2.0


### A. Drop the unimportant stuff

In [3]:
# Remove the AUCGUART and PRIMEUNIT columns from both frames, in place
for unwanted_col in ('AUCGUART', 'PRIMEUNIT'):
    for frame in (train_df, test_df):
        frame.drop(labels=unwanted_col, axis='columns', inplace=True)

### B. Observe missing values

<p>Let's take a look at what missing values we'll have to handle.</p>

In [4]:
# Count the missing (null) entries in each column of the training frame
train_df.isnull().sum()

RefId                                   0
IsBadBuy                                0
PurchDate                               0
Auction                                 0
VehYear                                 0
VehicleAge                              0
Make                                    0
Model                                   0
Trim                                 2360
SubModel                                8
Color                                   8
Transmission                            9
WheelTypeID                          3169
WheelType                            3174
VehOdo                                  0
Nationality                             5
Size                                    5
TopThreeAmericanName                    5
MMRAcquisitionAuctionAveragePrice      18
MMRAcquisitionAuctionCleanPrice        18
MMRAcquisitionRetailAveragePrice       18
MMRAcquisitonRetailCleanPrice          18
MMRCurrentAuctionAveragePrice         315
MMRCurrentAuctionCleanPrice       

In [5]:
# Show each training column's dtype to tell text (object) columns from numeric ones
train_df.dtypes

RefId                                  int64
IsBadBuy                               int64
PurchDate                             object
Auction                               object
VehYear                                int64
VehicleAge                             int64
Make                                  object
Model                                 object
Trim                                  object
SubModel                              object
Color                                 object
Transmission                          object
WheelTypeID                          float64
WheelType                             object
VehOdo                                 int64
Nationality                           object
Size                                  object
TopThreeAmericanName                  object
MMRAcquisitionAuctionAveragePrice    float64
MMRAcquisitionAuctionCleanPrice      float64
MMRAcquisitionRetailAveragePrice     float64
MMRAcquisitonRetailCleanPrice        float64
MMRCurrent

In [6]:
# Text columns: missing entries are replaced with the column's most frequent value
categorical_cols = ['Trim', 'SubModel', 'Color', 'Transmission', 'WheelType',
                    'Nationality', 'Size', 'TopThreeAmericanName']

# Numeric columns: missing entries are replaced with the column's median
numeric_cols = ['WheelTypeID',
                'MMRAcquisitionAuctionAveragePrice',
                'MMRAcquisitionAuctionCleanPrice',
                'MMRAcquisitionRetailAveragePrice',
                'MMRAcquisitonRetailCleanPrice',  # spelling matches the dataset's column name
                'MMRCurrentAuctionAveragePrice',
                'MMRCurrentAuctionCleanPrice',
                'MMRCurrentRetailAveragePrice',
                'MMRCurrentRetailCleanPrice']

for df in [train_df, test_df]:
    # Each frame is imputed with its own statistics.
    # NOTE(review): consider computing the fill values on train_df only and
    # reusing them for test_df -- confirm before changing, it alters predictions.
    fill_values = {col: df[col].mode()[0] for col in categorical_cols}
    fill_values.update({col: df[col].median() for col in numeric_cols})

    # Fill on the frame itself rather than on df[col]: an in-place fillna on a
    # selected column is a chained operation that newer pandas warns about and
    # that does not modify the frame under Copy-on-Write. The frame must be
    # mutated in place because rebinding `df` would not update train_df/test_df.
    df.fillna(value=fill_values, inplace=True)

train_df.isnull().sum()

RefId                                0
IsBadBuy                             0
PurchDate                            0
Auction                              0
VehYear                              0
VehicleAge                           0
Make                                 0
Model                                0
Trim                                 0
SubModel                             0
Color                                0
Transmission                         0
WheelTypeID                          0
WheelType                            0
VehOdo                               0
Nationality                          0
Size                                 0
TopThreeAmericanName                 0
MMRAcquisitionAuctionAveragePrice    0
MMRAcquisitionAuctionCleanPrice      0
MMRAcquisitionRetailAveragePrice     0
MMRAcquisitonRetailCleanPrice        0
MMRCurrentAuctionAveragePrice        0
MMRCurrentAuctionCleanPrice          0
MMRCurrentRetailAveragePrice         0
MMRCurrentRetailCleanPric

In [7]:
# Re-inspect the column dtypes; object columns are the non-numeric ones
train_df.dtypes

RefId                                  int64
IsBadBuy                               int64
PurchDate                             object
Auction                               object
VehYear                                int64
VehicleAge                             int64
Make                                  object
Model                                 object
Trim                                  object
SubModel                              object
Color                                 object
Transmission                          object
WheelTypeID                          float64
WheelType                             object
VehOdo                                 int64
Nationality                           object
Size                                  object
TopThreeAmericanName                  object
MMRAcquisitionAuctionAveragePrice    float64
MMRAcquisitionAuctionCleanPrice      float64
MMRAcquisitionRetailAveragePrice     float64
MMRAcquisitonRetailCleanPrice        float64
MMRCurrent

In [8]:
# Columns removed from both frames before modelling
nonList = [
    'PurchDate', 'Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color',
    'Transmission', 'WheelType', 'Nationality', 'Size',
    'TopThreeAmericanName', 'VNST',
]

# Drop the whole list from each frame in one in-place call
for frame in (train_df, test_df):
    frame.drop(columns=nonList, inplace=True)

In [9]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,RefId,IsBadBuy,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,BYRNO,VNZIP1,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,2006,3,1.0,89046,8155.0,9829.0,11636.0,13600.0,7451.0,8552.0,11597.0,12409.0,21973,33619,7100.0,0,1113
1,2,0,2004,5,1.0,93593,6854.0,8383.0,10897.0,12572.0,7456.0,9222.0,11374.0,12791.0,19638,33619,7600.0,0,1053
2,3,0,2005,4,2.0,73807,3202.0,4760.0,6943.0,8457.0,4035.0,5557.0,7146.0,8702.0,19638,33619,4900.0,0,1389
3,4,0,2004,5,1.0,65617,1893.0,2675.0,4658.0,5690.0,1844.0,2646.0,4375.0,5518.0,19638,33619,4100.0,0,630
4,5,0,2005,4,2.0,69367,3913.0,5054.0,7723.0,8707.0,3247.0,4384.0,6739.0,7911.0,19638,33619,4000.0,0,1020


## 4. Diving in (machine learning)

<p>Now that the data has been cleaned, we can try to find a model that works well for making our predictions. We'll first split our training set into the features (Xtrain) and the target labels (Ytrain). We'll also instantiate the classifiers that we will compare.</p>

In [10]:
# Fixed seed so the randomized ensembles give the same scores and predictions on every run
RANDOM_STATE = 42

# Separate the target column from the feature columns
Ytrain = train_df['IsBadBuy']
Xtrain = train_df.drop(columns=['IsBadBuy'])
# NOTE(review): RefId stays in Xtrain and is therefore used as a feature. If it
# is removed here, the frame passed to predict() later must drop it as well.

# Classifiers to compare; KNeighborsClassifier takes no random_state
RFC = RandomForestClassifier(random_state=RANDOM_STATE)
Ada = AdaBoostClassifier(random_state=RANDOM_STATE)
KNN = KNeighborsClassifier()
classifiers = [RFC, Ada, KNN]
clf_names = ['Random Forest', 'AdaBoost', 'K Nearest Neighbors']

<p>For this analysis, we'll only be comparing across three classifiers: Random Forest, AdaBoost, and K Nearest Neighbors. For more information on other potential (or more complicated) classifiers I invite you to check out the other kernels posted by those who top the leaderboards for this competition.</p>

<p>For each of these classifiers, we'll want to make sure we create the models with the optimal parameters. We can do this with a Grid Search. We define the set of parameters we want to scan for each type of classifier, and then run our grid searches.</p>

In [11]:
# Cross-validation scheme shared by every grid search: 4 stratified folds
kfold = StratifiedKFold(n_splits=4)

# Candidate hyper-parameter values for each classifier
rfc_param_grid = dict(
    max_depth=[None],
    max_features=[1],
    min_samples_split=[2],
    min_samples_leaf=[1],
    bootstrap=[False],
    n_estimators=[100],
    criterion=['gini'],
)
ada_param_grid = dict(
    n_estimators=[20],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)
knn_param_grid = dict(
    n_neighbors=[5],
    weights=['uniform', 'distance'],
    leaf_size=[5],
)
param_grids = [rfc_param_grid, ada_param_grid, knn_param_grid]

# Build one grid search per classifier, pairing each with its own grid.
# Nothing is fitted here; the searches are run when fit() is called on them.
grid_searches = [
    GridSearchCV(estimator=clf, param_grid=grid, n_jobs=4, cv=kfold, verbose=1)
    for clf, grid in zip(classifiers, param_grids)
]

<p>We'll now want to see how each of our models scores and determine which one works best. We'll fit each grid search to our training set and add its best mean cross-validation score to a list.</p>

In [12]:
# Train the models
# Run every grid search on the training data and record its best CV score
best_scores = []
for search in grid_searches:
    search.fit(Xtrain, Ytrain)
    best_scores.append(search.best_score_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   12.1s remaining:   12.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.3s finished




[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:    8.4s finished


Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:   10.3s finished


<p>Let's see the best scores for each classifier.</p>

In [13]:
# Best scores
# Report each classifier's name next to its best score
for clf_name, score in zip(clf_names, best_scores):
    print(clf_name + ": " + str(score))

Random Forest: 0.696573174574901
AdaBoost: 0.6886398202321089
K Nearest Neighbors: 0.685241768631051


<p>Based on these cross-validation scores, it makes the most sense to use the Random Forest Classifier to make the predictions. We'll predict on the test set, and then write the predictions to a csv file for submission.</p>

In [14]:
# Make predictions
# Look the Random Forest grid search up by name instead of a bare index;
# clf_names and grid_searches are parallel lists in the same order.
chosen_search = grid_searches[clf_names.index('Random Forest')]

# Give the model exactly the columns it was trained on, in the same order
Xtest = test_df[Xtrain.columns]
predictions = chosen_search.predict(Xtest)

# Write predictions to output csv, keyed by each test row's RefId
pred_df = pd.DataFrame({'RefId': test_df['RefId'],
                        'IsBadBuy': predictions})
pred_df.to_csv('predictions.csv', index=False)

print("Done writing to csv")

Done writing to csv
