In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# The Description of the competition
One of the biggest challenges for an auto dealership purchasing a used car at an auto auction is the risk that the vehicle might have serious issues that prevent it from being sold to customers. The auto community calls these unfortunate purchases "kicks".

Kicked cars often result when there are tampered odometers, mechanical issues the dealer is not able to address, issues with getting the vehicle title from the seller, or some other unforeseen problem. Kick cars can be very costly to dealers after transportation cost, throw-away repair work, and market losses in reselling the vehicle.

Modelers who can figure out which cars have a higher risk of being kicks can provide real value to dealerships trying to provide the best inventory selection possible to their customers.

The challenge of this competition is to predict if the car purchased at the Auction is a Kick (bad buy).

Reference - faysal, Will Adams, Will Cukierski. (2011). Don't Get Kicked!. Kaggle. https://kaggle.com/competitions/DontGetKicked

# Dataset Description

- The challenge of this competition is to predict if the car purchased at the Auction is a good / bad buy.

- All the variables in the data set are defined in the file Carvana_Data_Dictionary.txt

- The data contains missing values

- The dependent variable (IsBadBuy) is binary (C2)

- There are 32 independent variables (C3-C34)

- The data set is split into 60% training and 40% testing.

# Import necessary datasets

In [None]:
# Allow pandas to render up to 99 columns before it starts eliding any.
pd.set_option("display.max_columns", 99)

In [None]:
# Read the competition training file into a DataFrame and display it.
training_csv = "/kaggle/input/DontGetKicked/training.csv"
train = pd.read_csv(training_csv)
train

In [None]:
# Read the competition test file into a DataFrame and display it.
test_csv = "/kaggle/input/DontGetKicked/test.csv"
test = pd.read_csv(test_csv)
test

# Data Pre-processing

In [None]:
# Stack the train and test rows into a single frame so that every
# pre-processing step is applied identically to both.
#
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the test rows repeat the row labels of the train rows, so label-based
# lookups return two rows and index-aligned assignments become ambiguous.
# Row order is unchanged (train rows first, then test rows), so splitting the
# frame back by position with len(train) still works.
all_data = pd.concat([train, test], ignore_index=True)
all_data

In [None]:
# Separate PurchDate into month / day-of-month / weekday features.
all_data["PurchDate"] = pd.to_datetime(all_data["PurchDate"])
purchase_date = all_data["PurchDate"].dt
all_data["Months"] = purchase_date.month
all_data["Day"] = purchase_date.day
all_data["Weekday"] = purchase_date.weekday  # Monday=0 ... Sunday=6

# Build the feature frame: drop the outcome variable (IsBadBuy), the raw date
# that the three derived columns replace, and the RefId column.
all_data2 = all_data.drop(columns=["IsBadBuy", "PurchDate", "RefId"])

# Manual encoding of Auction.
# Series.map is used instead of Series.replace: replace() on an object column
# relies on pandas silently downcasting the result to integers, which is
# deprecated (FutureWarning since pandas 2.2). map() yields the numeric codes
# directly. Missing values stay missing here and are handled later.
auction_codes = {"ADESA": 0, "OTHER": 1, "MANHEIM": 2}
auction_raw = all_data2["Auction"]
auction_encoded = auction_raw.map(auction_codes)

# map() turns any category that is not in the dictionary into NaN; fail loudly
# rather than letting an unknown auction house be treated as a missing value.
unmapped_mask = auction_encoded.isna().to_numpy() & auction_raw.notna().to_numpy()
if unmapped_mask.any():
    unexpected_values = auction_raw[unmapped_mask].unique().tolist()
    raise ValueError(f"Unexpected Auction values: {unexpected_values}")
all_data2["Auction"] = auction_encoded.to_numpy()

# Automatic encoding of Make with scikit-learn's LabelEncoder.
# `le` is kept as a notebook-level name because later cells reuse it.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

all_data2["Make"] = le.fit_transform(all_data2["Make"])
all_data2

In [None]:
# Collect the names of the columns that are still stored with object dtype,
# i.e. the categorical (text) columns that have not been encoded yet.
column_dtypes = all_data2.dtypes
cat_features = column_dtypes[column_dtypes == object].index
cat_features

In [None]:
# Label-encode each remaining categorical column in place.
# The shared encoder is re-fitted on every column, so after the loop it only
# remembers the classes of the last column processed.
for column_name in cat_features:
    encoded_values = le.fit_transform(all_data2[column_name])
    all_data2[column_name] = encoded_values

In [None]:
# Display the feature frame after encoding for a visual check.
all_data2

In [None]:
# Impute every remaining missing value in the feature frame with 0.
all_data2 = all_data2.fillna(value=0)

In [None]:
# Split the processed frame back into its train and test parts by position:
# the first len(train) rows are the training rows, the remainder the test rows.
n_train_rows = len(train)
train_2 = all_data2.iloc[:n_train_rows]
test_2 = all_data2.iloc[n_train_rows:]
train_2

In [None]:
# Display the processed test features.
test_2

# Modelling

In [None]:
# Random Forest classifier trained on the processed training features, with
# the IsBadBuy column of the original training frame as the target.
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap sampling and feature sub-sampling, so a
# Restart & Run All reproduces the same forest, probabilities and feature
# importances. Without it every run produces a slightly different model.
# n_jobs=4 trains the trees on four parallel workers.
rf = RandomForestClassifier(n_jobs=4, random_state=42)
rf.fit(train_2, train["IsBadBuy"])

In [None]:
# Predict class probabilities (rather than hard labels) for the test rows.
# The result has one row per test row and one column per class.
result = rf.predict_proba(X=test_2)
result

## Feature Importance

In [None]:
# Show how much each independent variable (feature) contributes to the
# forest's prediction of the dependent (outcome) variable.
rf.feature_importances_


In [None]:
# Same importance scores wrapped in a Series (positional index only).
pd.Series(data=rf.feature_importances_)

In [None]:
# Label each importance score with the training column it belongs to.
feature_names = train_2.columns
pd.Series(data=rf.feature_importances_, index=feature_names)

In [None]:
# Rank the features from most to least important.
# Author's observation from the original run: WheelType and WheelTypeID are
# the variables that affect the outcome the most.
importance_by_feature = pd.Series(rf.feature_importances_, index=train_2.columns)
importance_by_feature.sort_values(ascending=False)

# Submission

In [None]:
# Load the example submission file to reuse its layout, and display it.
example_entry_csv = "/kaggle/input/DontGetKicked/example_entry.csv"
sub = pd.read_csv(example_entry_csv)
sub

In [None]:
# Take column index 1 of the predicted probabilities, i.e. the probability
# that the purchase is a bad buy.
# NOTE(review): assumes the classifier's classes are ordered [0, 1] - confirm via rf.classes_.
bad_buy_probability = result[:, 1]
sub["IsBadBuy"] = bad_buy_probability
sub

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("sub_2.csv", index=False)

Private Score = 0.23735

Public Score= 0.23167

Approximately Top 20% for Private Score Standing (Late Submission)