# 1. Read and check data

In [1]:
# import libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

#from sklearn.utils import resample
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split

In [2]:
def get_info_df(df, print_null_info):
    df_shape = df.shape
    print ("\nThe dataframe has %s data and %s features" %(df_shape[0], df_shape[1]))
    #
    features_obj = list(df.select_dtypes(include=['object']).columns)
    print ("\nThe number of object features: %s" %(len(features_obj)))
    for feature in features_obj:
        if (print_null_info):
            num_null = df[feature].isnull().sum()
            if (num_null > 0):
                print ("%50s has %8d null values (%7.2f percent)" %(feature, num_null, num_null/df_shape[0]*100))
    #
    features_num = list(df.select_dtypes(exclude=['object']).columns)
    print ("\nThe number of numeric features: %s" %(len(features_num)))
    for feature in features_num:
        if (print_null_info):
            num_null = df[feature].isnull().sum()
            if (num_null > 0):
                print ("%50s has %8d null values (%7.2f percent)" %(feature, num_null, num_null/df_shape[0]*100))

In [3]:
dataset = pd.read_csv('training.csv')
get_info_df(dataset, print_null_info= True)


The dataframe has 72983 data and 34 features

The number of object features: 15
                                              Trim has     2360 null values (   3.23 percent)
                                          SubModel has        8 null values (   0.01 percent)
                                             Color has        8 null values (   0.01 percent)
                                      Transmission has        9 null values (   0.01 percent)
                                         WheelType has     3174 null values (   4.35 percent)
                                       Nationality has        5 null values (   0.01 percent)
                                              Size has        5 null values (   0.01 percent)
                              TopThreeAmericanName has        5 null values (   0.01 percent)
                                         PRIMEUNIT has    69564 null values (  95.32 percent)
                                          AUCGUART has    69564 null valu

In [4]:
# Droping ID
feature = "RefId"
dataset.drop(labels=feature, axis='columns', inplace=True)

# Check for duplicated data
dataset[dataset.duplicated(keep=False)].index

Int64Index([], dtype='int64')

In [5]:
def check_target_bibary(df, target_feature):
    unique_target_vals = df[target_feature].unique()
    print ("\nUnique values of the target:", unique_target_vals)

In [6]:
check_target_bibary(df=dataset, target_feature='IsBadBuy')


Unique values of the target: [0 1]


In [7]:
def check_target_balance(df, target_feature):
    target_vals = df[target_feature]
    len_target_vals = len(target_vals)
    sum_target_vals = sum(target_vals)
    frac1 = sum_target_vals/ len_target_vals
    frac0 = 1-frac1
    print ("fraction of (%s==0): %6.2f" %(target_feature, frac0*100))
    print ("fraction of (%s==1): %6.2f" %(target_feature, frac1*100))

In [8]:
check_target_balance(df=dataset, target_feature='IsBadBuy')

fraction of (IsBadBuy==0):  87.70
fraction of (IsBadBuy==1):  12.30


In [9]:
def select_imbalanced_data(df, target_feature):
    target_vals = df[target_feature]
    len_target_vals = len(target_vals)
    n_target_1 = sum(target_vals)
    n_target_0 = len_target_vals-n_target_1
    #
    if (n_target_1 < n_target_0):
        df_minor = df[df[target_feature]==1]
        df_major = df[df[target_feature]==0]
    else:
        df_major = df[df[target_feature]==1]
        df_minor = df[df[target_feature]==0]
    return [df_minor, df_major]

In [10]:
downsample, upsample = select_imbalanced_data(df=dataset, target_feature='IsBadBuy')

###  Split 

In [11]:
from sklearn.model_selection import train_test_split

test_size = 0.3
train_up, valid_up = train_test_split(upsample, test_size=test_size)
train_down, valid_down = train_test_split(downsample, test_size=test_size)

#get_info_df(train_up, print_null_info=False)
#get_info_df(valid_up, print_null_info=True)
#get_info_df(train_down, print_null_info=False)
#get_info_df(valid_down, print_null_info=True)


The dataframe has 19203 data and 33 features

The number of object features: 15
                                              Trim has      598 null values (   3.11 percent)
                                          SubModel has        3 null values (   0.02 percent)
                                             Color has        3 null values (   0.02 percent)
                                      Transmission has        3 null values (   0.02 percent)
                                         WheelType has      306 null values (   1.59 percent)
                                       Nationality has        3 null values (   0.02 percent)
                                              Size has        3 null values (   0.02 percent)
                              TopThreeAmericanName has        3 null values (   0.02 percent)
                                         PRIMEUNIT has    18191 null values (  94.73 percent)
                                          AUCGUART has    18191 null valu

In [12]:
huy = aa

NameError: name 'aa' is not defined

In [None]:
train_df_major_downsampled = resample(train_df_major, replace=False, n_samples=n_minor, random_state=123)
train_data = pd.concat([train_df_major_downsampled, train_df_minor])

Ytrain = train_data.IsBadBuy

train_data.IsBadBuy.value_counts()

### 1. Numeric variables

In [None]:
col_num = list(train_data.select_dtypes(include=['float64','int64']).columns)
print (col_num)

In [None]:
# Create the function to plot figures
def his_plot(df, features_plot, feature_target, ncolplot, rotang):
    nfig = len(features_plot)
    nrowplot = np.int(np.ceil(nfig/ncolplot))
    fig, axs = plt.subplots(nrowplot, ncolplot, figsize=(16,16))
    labels = [feature_target + " = 1", feature_target + " = 0"]
    num = 0
    for feature in features_plot:
        data_hist = [df[df[feature_target]==1][feature].dropna(), df[df[feature_target]==0][feature].dropna()]
        id_row = int(num/ncolplot)
        id_col = num-id_row*ncolplot
        try:
            axs[id_row,id_col].hist(data_hist, label=labels)
            axs[id_row,id_col].legend(prop={'size': 10})
            axs[id_row,id_col].set_title(feature)
            axs[id_row,id_col].xaxis.set_tick_params(rotation=rotang)
        except:
            axs[id_col].hist(data_hist, label=labels)
            axs[id_col].legend(prop={'size': 10})
            axs[id_col].set_title(feature)
            axs[id_col].xaxis.set_tick_params(rotation=rotang)
        num += 1

In [None]:
his_plot(df=train_data, features_plot=col_num, feature_target='IsBadBuy', ncolplot=4, rotang = 0)

In [None]:
can_drop = []

feature = "IsOnlineSale"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
col_num = list(train_data.select_dtypes(include=['float64','int64']).columns)

corr = train_data[col_num].corr()

plt.figure(figsize = (10,8))

sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, cmap="YlGnBu", annot=True, fmt=".1f")

plt.show()

In [None]:
feature = "VehYear"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
feature = "MMRAcquisitionAuctionAveragePrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRAcquisitionAuctionCleanPrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRAcquisitionRetailAveragePrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRAcquisitonRetailCleanPrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRCurrentAuctionAveragePrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRCurrentAuctionCleanPrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRCurrentRetailAveragePrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "MMRCurrentRetailCleanPrice"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
col_num = list(train_data.select_dtypes(include=['float64','int64']).columns)

corr = train_data[col_num].corr()

plt.figure(figsize = (10,8))

sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, cmap="YlGnBu", annot=True, fmt=".1f")

plt.show()

### 2. Object variables

In [None]:
col_object = list(train_data.select_dtypes(include=['object']).columns)
print (col_object)

In [None]:
for col in col_object:
    print (col, ":  the number of unique data: ", len(train_data[col].unique()))
    if (len(train_data[col].unique()) < 20):
        print (train_data[col].unique())
    print ()

In [None]:
his_plot(df=train_data, features_plot=col_object, feature_target='IsBadBuy', ncolplot=4, rotang = 0)

In [None]:
feature = "Transmission"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "PRIMEUNIT"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "AUCGUART"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
col_object = list(train_data.select_dtypes(include=['object']).columns)
print (col_object)

df = train_data[col_object]
corr = df.apply(lambda x: x.factorize()[0]).corr()

plt.figure(figsize = (10,8))

sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, cmap="YlGnBu", annot=True, fmt=".1f")

plt.show()

In [None]:
feature = "Nationality"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
train_data.info()

In [None]:
col_plot = ['WheelType','WheelTypeID']
his_plot(df=train_data, features_plot=col_plot, feature_target='IsBadBuy', ncolplot=2, rotang = 0)

In [None]:
feature = "WheelTypeID"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

## 2. Deal with missing values

<p>Let's take a look at what missing values we'll have to handle.</p> For the numeric variable, replace null by median and for the category variable, most popular value is used to fill null

In [None]:
all_null = train_data.columns[train_data.isnull().any()]
for col in all_null:
    try:
        train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    except:
        train_data[col].fillna(train_data[col].median(), inplace=True)
train_data.info()

### Deal with date

In [None]:
date_feature = 'PurchDate'
train_data[date_feature] = pd.to_datetime(train_data[date_feature])
train_data[date_feature].head()

In [None]:
print ("data from:", train_data[date_feature].min(), " to:", train_data[date_feature].max() )
print ("which has total", train_data[date_feature].max() - train_data[date_feature].min())

In [None]:
date_feature = 'PurchDate'

train_data['Year'] = train_data[date_feature].dt.year
train_data['Month'] = train_data[date_feature].dt.month
train_data['Day'] = train_data[date_feature].dt.day
train_data['Day_Name'] = train_data[date_feature].dt.day_name()
train_data['Day_Name_Num'] = train_data[date_feature].dt.dayofweek

feature = "PurchDate"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
col_plot = ['Year', 'Month', 'Day', 'Day_Name', 'Day_Name_Num']
his_plot(df=train_data, features_plot=col_plot, feature_target='IsBadBuy', ncolplot=5, rotang = 90)

In [None]:
train_data['sinMonth'] = np.sin(train_data['Month'] * 2.0 *np.pi/12.0)
train_data['cosMonth'] = np.cos(train_data['Month'] * 2.0 *np.pi/12.0)
#train_data[['sinMonth','cosMonth','Month']].head()

train_data['sinDay'] = np.sin(train_data['Day'] * 2.0 *np.pi/30.0)
train_data['cosDay'] = np.cos(train_data['Day'] * 2.0 *np.pi/30.0)
#train_data[['sinDay','cosDay','Day']].head()

train_data['sinDay_Name_Num'] = np.sin(train_data['Day_Name_Num'] * 2.0 *np.pi/30.0)
train_data['cosDay_Name_Num'] = np.cos(train_data['Day_Name_Num'] * 2.0 *np.pi/30.0)
#train_data[['sinDay_Name_Num','cosDay_Name_Num','Day_Name_Num']].head()


feature = "Month"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "Day"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "Day_Name_Num"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

feature = "Day_Name"
can_drop.append(feature)
train_data.drop(labels=feature, axis='columns', inplace=True)

In [None]:
drop_X_train = train_data.select_dtypes(exclude=['object'])
y = drop_X_train.IsBadBuy
X = drop_X_train.drop('IsBadBuy', axis=1)

In [None]:
col_object = list(train_data.select_dtypes(include=['object']).columns)

from sklearn.preprocessing import LabelEncoder

# Make copy to avoid changing original data 
label_X_train = train_data.copy()
#label_X_valid = X_valid.copy()

# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in col_object:
    label_X_train[col] = label_encoder.fit_transform(train_data[col])
    #label_X_valid[col] = label_encoder.transform(X_valid[col])

y = label_X_train.IsBadBuy
X = label_X_train.drop('IsBadBuy', axis=1)

In [None]:
for col in col_object:
    print (col, ":  the number of unique data: ", len(train_data[col].unique()))
    if (len(train_data[col].unique()) < 20):
        print (train_data[col].unique())
    print ()

In [None]:
col_object = list(train_data.select_dtypes(include=['object']).columns)

from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_data[col_object]))
#OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = train_data.index
#OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_data.drop(col_object, axis=1)
#num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
#OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

y = OH_X_train.IsBadBuy
X = OH_X_train.drop('IsBadBuy', axis=1)

In [None]:
train_data.info()

In [None]:
drop_X_train.info()

In [None]:
label_X_train.info()

In [None]:
OH_X_train.info()

###  Diving in (machine learning)

<p>Now that the data has been cleaned, we can try to find a model that works well for making our predictions. We'll also load in some classifiers which we will compare.</p>

### Model 1: Evaluate using train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1234567)

#Create a Gaussian Classifier
rfc = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
rfc.fit(X_train,y_train)

# prediction on test set
y_pred = rfc.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Using "cross_val_score"

In [None]:
rfc2 = RandomForestClassifier(n_estimators=100)
rfc_cv_score = cross_val_score(rfc, X, y, cv=5, scoring='accuracy')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

### Model 2: Pipeline and cross_val_score

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()), ('model', RandomForestClassifier(n_estimators=100, random_state=1234567))])

from sklearn.model_selection import cross_val_score

scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')

print("MAE scores:\n", scores)
print("Average MAE score (across experiments):")
print(scores.mean())

### Model 3: GridSearchCV

In [None]:
RFC = RandomForestClassifier()

# Use kfold as our cross validation
kfold = StratifiedKFold(n_splits=5)

# Set grid search parameter settings
#n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 3)]
n_estimators = [100]

# max depth
#max_depth = [int(x) for x in np.linspace(5, 50, num = 2)]
#max_depth.append(None)
max_depth = [None]

rfc_param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators}

#rfc_param_grid = {'n_estimators': [100]}

# Perform grid searches to get estimators with the optimal settings
grid_search = GridSearchCV(estimator=RFC, param_grid=rfc_param_grid, n_jobs=1, cv=kfold, verbose=1)

grid_search.fit(X, y)

print (grid_search.best_score_)