###  Read the data


In [1]:
# Load the training and test CSV files into pandas dataframes
import pandas as pd

target_col = 'IsBadBuy'

train_df = pd.read_csv('training.csv')
print("training set has ", train_df.shape)
test_df = pd.read_csv('test.csv')
print("test set has ", test_df.shape)
ntest = len(test_df)  # number of test rows, used later to slice them back out

print("remove one column from the training set")
print("merge training set and test set to all_data")

# Separate the label from the training features
Ytrain = train_df[target_col]
train_df = train_df.drop(target_col, axis='columns')

# Stack train on top of test so both can be wrangled in a single pass
frames = [train_df, test_df]
all_data = pd.concat(frames, ignore_index=True)
print("all_data has ", all_data.shape)


training set has  (72983, 34)
test set has  (48707, 33)
remove one column from the training set
merge training set and test set to all_data
all_data has  (121690, 33)


In [2]:
# Number of missing values in each column of the combined frame
missing_per_column = all_data.isnull().sum()
missing_per_column

RefId                                     0
PurchDate                                 0
Auction                                   0
VehYear                                   0
VehicleAge                                0
Make                                      0
Model                                     0
Trim                                   3910
SubModel                                 13
Color                                    12
Transmission                             12
WheelTypeID                            5357
WheelType                              5362
VehOdo                                    0
Nationality                              12
Size                                     12
TopThreeAmericanName                     12
MMRAcquisitionAuctionAveragePrice        28
MMRAcquisitionAuctionCleanPrice          28
MMRAcquisitionRetailAveragePrice         28
MMRAcquisitonRetailCleanPrice            28
MMRCurrentAuctionAveragePrice           458
MMRCurrentAuctionCleanPrice     

###  Drop the unimportant stuff

Too much data is missing in "PRIMEUNIT" and "AUCGUART", so we can simply remove these features.
"RefId" is only an ID number and carries no predictive information, so it can be removed as well.

In [3]:
# Remove the two sparsely populated features and the identifier column in one call
all_data.drop(columns=['AUCGUART', 'PRIMEUNIT', 'RefId'], inplace=True)

### Deal with missing values

<p>Let's take a look at what missing values we'll have to handle.</p> For simplicity, just remove all columns that have null values.

In [4]:
# Names of the columns that contain at least one null value
has_null = all_data.isnull().any()
all_isnull = all_data.columns[has_null]

# Drop them all at once, then confirm that nothing is missing any more
all_data.drop(columns=all_isnull, inplace=True)
all_data.isnull().sum()

PurchDate       0
Auction         0
VehYear         0
VehicleAge      0
Make            0
Model           0
VehOdo          0
BYRNO           0
VNZIP1          0
VNST            0
VehBCost        0
IsOnlineSale    0
WarrantyCost    0
dtype: int64

### Deal with categorical variables


In [5]:
import numpy as np

# Collect the names of every object-dtype (non-numeric) column.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24,
# so the dtype is selected with the string 'object' instead.
nonList = list(all_data.select_dtypes(include=['object']))
print(nonList)

# Drop all of those columns in a single call; an empty list is a no-op,
# so re-running this cell is harmless.
all_data.drop(columns=nonList, inplace=True)

# Independent copy of the cleaned frame; the modelling cells read from it.
all_data2 = all_data.copy()


['PurchDate', 'Auction', 'Make', 'Model', 'VNST']


KeyError: "['MMRCurrentRetailCleanPrice'] not found in axis"

###  Diving in (machine learning)

<p>Now that the data has been cleaned, we can try to find a model that works well for making our predictions. We'll also load in some classifiers which we will compare.</p>

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Fixed seed so the stochastic ensembles (and the score comparison built on
# them) give the same result on every Restart & Run All.
RANDOM_STATE = 42

# The training rows are the first len(Ytrain) rows of the combined frame.
Xtrain = all_data2.head(len(Ytrain))

RFC = RandomForestClassifier(random_state=RANDOM_STATE)
Ada = AdaBoostClassifier(random_state=RANDOM_STATE)
KNN = KNeighborsClassifier()  # has no random_state parameter

# classifiers and clf_names are parallel lists; keep their order in sync.
classifiers = [RFC, Ada, KNN]
clf_names = ['Random Forest', 'AdaBoost', 'K Nearest Neighbors']

<p>For this analysis, we'll only be comparing across three classifiers: Random Forest, AdaBoost, and K Nearest Neighbors. For more information on other potential (or more complicated) classifiers I invite you to check out the other kernels posted by those who top the leaderboards for this competition.</p>

<p>For each of these classifiers, we'll want to make sure we create the models with the optimal parameters. We can do this with a Grid Search. We define the set of parameters we want to scan for each type of classifier, and then run our grid searches.</p>

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

# Stratified 4-fold cross validation, shared by every grid search
kfold = StratifiedKFold(n_splits=4)

# Candidate hyper-parameters, one grid per classifier
rfc_param_grid = dict(
    max_depth=[None],
    max_features=[1],
    min_samples_split=[2],
    min_samples_leaf=[1],
    bootstrap=[False],
    n_estimators=[100],
    criterion=['gini'],
)
ada_param_grid = dict(
    n_estimators=[20],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)
knn_param_grid = dict(
    n_neighbors=[5],
    weights=['uniform', 'distance'],
    leaf_size=[5],
)
param_grids = [rfc_param_grid, ada_param_grid, knn_param_grid]

# Pair each classifier with its grid and wrap the pair in a (still unfitted) grid search
grid_searches = [
    GridSearchCV(estimator=clf, param_grid=grid, n_jobs=4, cv=kfold, verbose=1)
    for clf, grid in zip(classifiers, param_grids)
]

<p>We'll now want to see the cross-validation scores for each of our models and determine which one works best. We'll fit each grid search to our training set and add the best score from each to a list.</p>

In [None]:
# Fit every grid search on the training data and record its best score
best_scores = []
for search in grid_searches:
    search.fit(Xtrain, Ytrain)
    best_scores.append(search.best_score_)

<p>Let's see the best scores for each classifier.</p>

In [None]:
# Report the best score next to the name of each classifier
for label, score in zip(clf_names, best_scores):
    print(f"{label}: {score!s}")

<p>Based on these cross-validation scores, it makes the most sense to use the Random Forest Classifier to make the predictions. We'll predict on the test set, and then write the predictions to a csv file for submission.</p>

In [None]:
# Score the test rows, i.e. the last `ntest` rows of the combined frame
Xtest = all_data2.tail(ntest)
chosen_search = grid_searches[0]
predictions = chosen_search.predict(Xtest)

# Save one prediction per RefId as the submission file
submission = {'RefId': test_df['RefId'], 'IsBadBuy': predictions}
pred_df = pd.DataFrame(submission)
pred_df.to_csv('predictions.csv', index=False)

print("Done writing to csv")