In [39]:
%matplotlib inline
import math
import random
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.stats.stats import pearsonr

from sklearn import tree
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
from sklearn import neighbors
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

In [40]:
def clean_data(df):
    """Return a cleaned copy of ``df`` with integer-coded ``<column>Val`` columns added.

    Steps, in order:

    1. Missing values in the columns listed in ``fill_columns`` are replaced
       by the placeholder string ``'NS'``.
    2. ``Transmission`` is upper-cased so spellings that differ only in case
       collapse into a single value.
    3. The columns listed in ``dropped_columns`` are removed.
    4. For every column in ``encoded_columns`` a new ``<column>Val`` column is
       added holding the position of each value in the sorted list of that
       column's distinct values (0 for the smallest, 1 for the next, ...).
       The original column is kept alongside the coded one.

    NOTE: the codes are derived only from the values present in the frame
    passed in.  Two frames with different sets of values (for example a
    training and a test set cleaned by separate calls) will therefore not
    necessarily assign the same code to the same value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named in
        ``fill_columns``' Transmission entry, ``dropped_columns`` and
        ``encoded_columns``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.

    Raises
    ------
    KeyError
        If ``Transmission``, a dropped column or an encoded column is missing.
    """
    # Columns whose NaN entries become the placeholder 'NS'.  The MMR* price
    # columns are numeric in the raw data, so they turn into mixed
    # float/str columns after this step.
    fill_columns = [
        'Trim', 'SubModel', 'Color', 'Transmission', 'WheelTypeID',
        'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName',
        'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
        'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
        'PRIMEUNIT', 'AUCGUART',
    ]
    dropped_columns = ['PurchDate', 'VehYear', 'WheelTypeID',
                       'TopThreeAmericanName', 'BYRNO']
    # Order matters: it fixes the order in which the *Val columns are appended.
    # VehBCost, VNZIP1 and WarrantyCost are numeric but are rank-coded as well.
    encoded_columns = [
        'Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color',
        'Transmission', 'WheelType', 'Nationality', 'Size', 'PRIMEUNIT',
        'AUCGUART', 'VehBCost', 'VNZIP1', 'VNST', 'WarrantyCost',
    ]

    # A non-inplace replace returns a new frame, so every later assignment
    # works on our own copy instead of the caller's object.
    df = df.replace({column: {np.nan: 'NS'} for column in fill_columns})

    df['Transmission'] = df['Transmission'].str.upper()

    df = df.drop(dropped_columns, axis=1)

    for column in encoded_columns:
        levels = sorted(df[column].unique())
        codes = dict(zip(levels, range(len(levels))))
        df[column + 'Val'] = df[column].map(codes)

    return df

In [54]:
# Load the raw training and test sets.
df = pd.read_csv("training.csv")
df2 = pd.read_csv("test.csv")

# Keep the test identifiers aside so they can be written out next to the
# predictions later on.
test_id = df2['RefId']

df = clean_data(df)
df2 = clean_data(df2)

# Split the training rows by class.
df_goodbuy = df[df['IsBadBuy'] == 0]
df_badbuy = df[df['IsBadBuy'] == 1]

# Undersample the good buys: shuffle them and keep only as many rows as there
# are bad buys, so both classes are equally represented.  The shuffle is
# seeded so that re-running the notebook yields the same training set.
df_goodbuy = df_goodbuy.sample(frac=1, random_state=0).iloc[:len(df_badbuy)]

frames = [df_goodbuy, df_badbuy]
df = pd.concat(frames)

Unnamed: 0,RefId,IsBadBuy,Auction,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,...,TransmissionVal,WheelTypeVal,NationalityVal,SizeVal,PRIMEUNITVal,AUCGUARTVal,VehBCostVal,VNZIP1Val,VNSTVal,WarrantyCostVal
11458,11467,0,MANHEIM,8,MERCURY,SABLE,GS,4D SEDAN GS,SILVER,AUTO,...,0,1,0,5,1,1,379,21,19,110
35931,35954,0,MANHEIM,2,DODGE,CARAVAN GRAND FWD V6,SE,MINIVAN 3.3L FFV SE,SILVER,AUTO,...,0,1,0,12,1,1,1168,13,34,179
63274,63305,0,MANHEIM,7,ACURA,3.2 CL,S,2D COUPE TYPE S NAVIGATION AUTO,SILVER,AUTO,...,0,0,3,10,1,1,1070,23,19,212
46971,46995,0,MANHEIM,3,DODGE,MAGNUM V6 2.7L V6 MP,SE,WAGON 2.7L,WHITE,AUTO,...,0,1,0,10,1,1,1188,92,32,114
9817,9826,0,MANHEIM,7,HONDA,CIVIC,LX,4D SEDAN LX AUTO,MAROON,AUTO,...,0,1,4,0,1,1,912,12,14,44


In [42]:
# Feature columns shared by the training and the test frame, in the order in
# which they end up in the feature arrays.
feature_columns = [
    'VehicleAge', 'VehOdo', 'IsOnlineSale',
    'AuctionVal', 'MakeVal', 'ModelVal', 'TrimVal', 'SubModelVal',
    'ColorVal', 'TransmissionVal', 'WheelTypeVal', 'NationalityVal',
    'SizeVal', 'PRIMEUNITVal', 'AUCGUARTVal', 'VehBCostVal',
    'VNZIP1Val', 'VNSTVal', 'WarrantyCostVal',
]

# IsBadBuy is the target; it is placed first so it can later be separated from
# the features by position.
df_train = df[['IsBadBuy'] + feature_columns]

# The test frame is built from the feature columns only.
df_test = df2[feature_columns]

In [43]:
# Convert the training DataFrame to a plain numpy array (one row per sample,
# columns in the same order as df_train).
train_data = df_train.values

In [44]:
# Training features: every column except the first one, which holds the
# 'IsBadBuy' target.
train_features = train_data[:, 1:]

# Training target: the first column ('IsBadBuy').
train_target = train_data[:, 0]

# The test frame has no target column, so all of its columns are features.
test_features = df_test.values

In [45]:
# Sanity check: show the feature vector of the first training row.
train_features[0]

array([    3, 89046,     0,     0,    17,   586,   133,   221,    13,
           0,     0,     3,     5,     1,     1,  1019,    47,     5,    70])

In [46]:
# Sanity check: show the target value of the training row at index 1.
train_target[1]

0

In [47]:
# Baseline model: a single decision tree with no depth limit, fitted on the
# training arrays.
baseline_params = {
    'criterion': 'gini',
    'splitter': 'best',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
}
clf = tree.DecisionTreeClassifier(**baseline_params).fit(train_features, train_target)

In [48]:
# Show the fitted tree's importance score for each feature, in the column
# order of train_features.
clf.feature_importances_

array([  1.33316066e-02,   4.35522554e-02,   3.32985930e-03,
         1.01405368e-01,   8.92028203e-03,   2.15009294e-02,
         1.40849889e-02,   1.56878465e-02,   1.39215454e-02,
         3.06124581e-04,   1.30552988e-01,   8.49781273e-04,
         4.38255140e-03,   2.08463393e-03,   0.00000000e+00,
         3.30356162e-02,   4.47874471e-01,   1.24582610e-01,
         2.05965417e-02])

In [49]:
# Score the fitted model on the very data it was trained on.  These figures
# measure how well the tree memorised the training set, not how well it
# generalises to unseen rows.
pred_target = clf.predict(train_features)

# Single-argument print(...) produces the same output under Python 2 and is
# also valid Python 3 syntax.
print(metrics.precision_score(train_target, pred_target, average='weighted'))
print(metrics.recall_score(train_target, pred_target, average='weighted'))
print(metrics.f1_score(train_target, pred_target, average='weighted'))
print(metrics.accuracy_score(train_target, pred_target))
# Per-class (precision, recall, f-score, support) arrays.
print(metrics.precision_recall_fscore_support(train_target, pred_target))

0.978597036691
0.97816399287
0.978159052342
0.97816399287
(array([ 0.96420074,  0.99299334]), array([ 0.9932041 ,  0.96312389]), array([ 0.97848754,  0.97783056]), array([8976, 8976]))


In [50]:
# Number of rows in the training array.
len(train_data)

17952

In [51]:
# Final model: a random forest fitted on the training arrays.
#
# This cell used to build two single decision trees first
# (max_depth=7 / min_samples_leaf=10, and max_depth=6 / max_features='auto' /
# random_state=288500278).  Both were overwritten before ever being fitted, so
# they had no effect and have been removed.
clf = RandomForestClassifier(n_estimators=30,
                             criterion='gini',
                             max_features=None,
                             max_depth=None,
                             min_samples_split=2,
                             min_samples_leaf=5,
                             bootstrap=True,
                             oob_score=False,
                             random_state=None,   # unseeded: the fitted forest differs between runs
                             class_weight=None)   # alternative noted by the author: 'balanced'

clf = clf.fit(train_features, train_target)

In [52]:
# Predict with the fitted model on both the training and the test features.
train_pred = clf.predict(train_features)
test_pred = clf.predict(test_features)

# Integer range 73015..121721.  Not used by the rest of this cell.
# The builtin `int` replaces `np.int`, which was only an alias for it and has
# been removed from recent NumPy releases.
test = np.arange(73015, 121722, dtype=int)

# Put the test RefIds and the predictions side by side and write them out.
# NOTE(review): the prediction column is left unnamed, so its CSV header is
# the default label `0` -- confirm that this is the intended header.
tmp = pd.concat([
    pd.DataFrame(test_id),
    pd.DataFrame(test_pred)
    ],
    axis=1)

tmp.to_csv("foo.csv", sep=",", index=False)