# Neural networks and deep learning
# Supervised neural nets
# Challenge: Make Your Network

__Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing.__ Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

Data: [Vehicles info](https://www.kaggle.com/epa/vehicle-fuel-economy#vehicles.csv)

In [97]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import Image
from IPython.display import display

import warnings
# Suppress annoying harmless error.
warnings.simplefilter('ignore')

## Clean Data

In [25]:
# Open file.
df1 = pd.read_csv('vehicles.csv')
df2 = pd.read_csv('emissions.csv')
df3 = pd.read_csv('vehicle_fields.csv')
df4 = pd.read_csv('emissions_fields.csv')

### df1 or Vehicles data

In [26]:
display(df1.head())

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [30]:
df1.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn', 'modifiedOn

In [31]:
df1.dtypes

barrels08          float64
barrelsA08         float64
charge120          float64
charge240          float64
city08               int64
city08U            float64
cityA08              int64
cityA08U           float64
cityCD             float64
cityE              float64
cityUF             float64
co2                  int64
co2A                 int64
co2TailpipeAGpm    float64
co2TailpipeGpm     float64
comb08               int64
comb08U            float64
combA08              int64
combA08U           float64
combE              float64
combinedCD         float64
combinedUF         float64
cylinders          float64
displ              float64
drive               object
engId                int64
eng_dscr            object
feScore              int64
fuelCost08           int64
fuelCostA08          int64
                    ...   
rangeCity          float64
rangeCityA         float64
rangeHwy           float64
rangeHwyA          float64
trany               object
UCity              float64
U

In [32]:
## Select all that are not object and are object   
df1_str = df1.select_dtypes(include=['object'])
df1_num = df1.select_dtypes(exclude=['object'])

In [33]:
display(df1_num.head())

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,23.3333,0.0,35.0,0.0,1985,-1750,0.0,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,11.0,0.0,19.0,0.0,1985,-10500,0.0,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,29.0,0.0,47.0,0.0,1985,250,0.0,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,12.2222,0.0,16.6667,0.0,1985,-10500,0.0,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,21.0,0.0,32.0,0.0,1993,-4750,0.0,0,0,0


In [34]:
print(df1_num.shape)

(39101, 60)


In [42]:
df1_num.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'engId', 'feScore', 'fuelCost08', 'fuelCostA08', 'ghgScore',
       'ghgScoreA', 'highway08', 'highway08U', 'highwayA08', 'highwayA08U',
       'highwayCD', 'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4',
       'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity', 'rangeCityA',
       'rangeHwy', 'rangeHwyA', 'UCity', 'UCityA', 'UHighway', 'UHighwayA',
       'year', 'youSaveSpend', 'charge240b', 'phevCity', 'phevHwy',
       'phevComb'],
      dtype='object')

In [43]:
# Count nulls 
null_count = df1_num.isnull().sum()
null_count[null_count>0]

cylinders    145
displ        143
dtype: int64

In [53]:
#df1_num['id']

### df2 or Emissions data

In [47]:
display(df2.head())

Unnamed: 0,efid,id,salesArea,score,scoreAlt,smartwayScore,standard,stdText
0,4HNXV03.2MJE,19332,3,1.0,-1.0,-1,B9,BIN 9
1,4HNXV03.2MJE,19332,7,1.0,-1.0,-1,L1,LEV
2,4HNXV03.2MJE,19333,3,1.0,-1.0,-1,B9,BIN 9
3,4HNXV03.2MJE,19333,7,1.0,-1.0,-1,L1,LEV
4,4ADXV01.8346,19334,3,1.0,-1.0,-1,B9,BIN 9


In [48]:
df2.columns

Index(['efid', 'id', 'salesArea', 'score', 'scoreAlt', 'smartwayScore',
       'standard', 'stdText'],
      dtype='object')

In [49]:
df2.dtypes

efid              object
id                 int64
salesArea          int64
score            float64
scoreAlt         float64
smartwayScore      int64
standard          object
stdText           object
dtype: object

In [29]:
print(df2.shape)

(33534, 8)


In [52]:
#df2['id']

### df4 or Emissions Feilds

In [74]:
display(df4)

Unnamed: 0,field,description
0,efid,engine family ID
1,id,vehicle record ID (links emission data to the ...
2,salesArea,EPA sales area code
3,score,EPA 1-10 smog rating for fuelType1
4,scoreAlt,EPA 1-10 smog rating for fuelType2
5,smartwayScore,SmartWay Code
6,standard,Vehicle Emission Standard Code
7,stdText,Vehicle Emission Standard


In [75]:
df4.dtypes

field          object
description    object
dtype: object

## Join Data

In [54]:
df = df1_num.set_index('id').join(df2.set_index('id'), lsuffix='_vehicles', rsuffix='_emissions', sort=False)

In [55]:
display(df.head())

Unnamed: 0_level_0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,phevCity,phevHwy,phevComb,efid,salesArea,score,scoreAlt,smartwayScore,standard,stdText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,0,0,0,,,,,,,
2,14.982273,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,...,0,0,0,,,,,,,
3,19.388824,0.0,0.0,0.0,15,0.0,0,0.0,0.0,0.0,...,0,0,0,,,,,,,
4,19.388824,0.0,0.0,0.0,15,0.0,0,0.0,0.0,0.0,...,0,0,0,,,,,,,
5,20.600625,0.0,0.0,0.0,15,0.0,0,0.0,0.0,0.0,...,0,0,0,,,,,,,


In [56]:
df.dtypes

barrels08          float64
barrelsA08         float64
charge120          float64
charge240          float64
city08               int64
city08U            float64
cityA08              int64
cityA08U           float64
cityCD             float64
cityE              float64
cityUF             float64
co2                  int64
co2A                 int64
co2TailpipeAGpm    float64
co2TailpipeGpm     float64
comb08               int64
comb08U            float64
combA08              int64
combA08U           float64
combE              float64
combinedCD         float64
combinedUF         float64
cylinders          float64
displ              float64
engId                int64
feScore              int64
fuelCost08           int64
fuelCostA08          int64
ghgScore             int64
ghgScoreA            int64
                    ...   
highwayUF          float64
hlv                  int64
hpv                  int64
lv2                  int64
lv4                  int64
phevBlended           bool
p

In [57]:
print(df.shape)

(56163, 66)


In [58]:
# Count nulls 
null_count = df.isnull().sum()
null_count[null_count>0]

cylinders          233
displ              230
efid             22708
salesArea        22708
score            22708
scoreAlt         22708
smartwayScore    22708
standard         22708
stdText          22708
dtype: int64

In [63]:
## Assumption: the Median is a good enough representative of the missing data 

## Change objects to numeric 
fill_na = (['cylinders','displ','salesArea','score','scoreAlt','smartwayScore'])
for col in fill_na:
    df[col].fillna(value = df[col].median(), inplace=True)

In [64]:
# Count nulls 
null_count = df.isnull().sum()
null_count[null_count>0]

efid        22708
standard    22708
stdText     22708
dtype: int64

Those are objects so will drop them from the data

In [70]:
df['smartwayScore'].describe()

count    56163.000000
mean        -0.734469
std          0.693095
min         -1.000000
25%         -1.000000
50%         -1.000000
75%         -1.000000
max          2.000000
Name: smartwayScore, dtype: float64

In [71]:
df['score'].describe()

count    56163.000000
mean         5.056977
std          1.416363
min        -12.000000
25%          5.000000
50%          5.000000
75%          5.000000
max         10.000000
Name: score, dtype: float64

## Create Training and Test Set 

In [76]:
X = df.drop(['efid','standard', 'stdText', 'score'], axis = 1)
Y = df['score']

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  

# Split the data 
X_train, X_test, Y_train, Y_test = train_test_split(
    X,Y,
    test_size=0.5,
    random_state=None)

## Multi Layer Perceptron

### MLP with 60 and 40 hidden layers

In [87]:
# Start time for execution speed.
import time
start_time = time.clock()

# Fit the model.
from sklearn.neural_network import MLPClassifier
mlp1 = MLPClassifier(hidden_layer_sizes=(60,40,), alpha=0.05)
mlp1.fit(X_train, Y_train)

# Accuracy.
print('Accuracy:' )
print('\nScore on Training set: ', mlp1.score(X_train, Y_train))
print('\nScore on Test set: ', mlp1.score(X_test, Y_test))

# Cross validation scores.
from sklearn.model_selection import cross_val_score
print('\nCV Scores: ', cross_val_score(mlp1, X, Y, cv=5))

# End time for execution speed.
print('\nRuntime for MLP with 60 and 40 hidden layers: '+'%s seconds'% (time.clock() - start_time))

Accuracy:

Score on Training set:  0.690894198925

Score on Test set:  0.695285236094

CV Scores:  [ 0.67921338  0.67998576  0.21600926  0.6463621   0.17643392]

Runtime for MLP with 60 and 40 hidden layers: 59.388524000000075 seconds


It is not overfitting but not perfoming well on itself

In [94]:
y_mlp1 = mlp1.predict(X_test)

In [95]:
pd.crosstab(Y_test, y_mlp1, margins=True)

col_0,-2.0,1.0,2.0,3.0,5.0,6.0,7.0,8.0,9.0,10.0,All
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-12.0,0,0,0,0,8,0,0,0,0,0,8
-2.0,0,0,2,0,1,0,0,0,0,0,3
1.0,0,0,2,0,1401,9,0,5,0,0,1417
2.0,0,0,38,0,621,25,0,2,0,0,686
3.0,0,0,0,68,255,68,3,69,0,0,463
4.0,0,0,0,0,1,0,0,0,0,0,1
5.0,0,8,20,15,18013,282,3,370,0,3,18714
6.0,2,0,4,38,3445,1178,41,407,0,0,5115
7.0,0,0,0,13,316,80,69,124,2,0,604
8.0,0,0,0,8,150,38,27,69,1,0,293


In [106]:
mlp1.predict_proba(X_test)

array([[  1.00478415e-041,   2.76530103e-022,   1.27119054e-004, ...,
          2.37819944e-007,   1.03705811e-007,   2.44365227e-074],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       ..., 
       [  2.68214397e-120,   1.14949536e-121,   1.23407486e-091, ...,
          3.29837584e-133,   2.47278671e-122,   4.41709590e-079],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000]])

### MLP with 100 and 150 hidden layers

In [88]:
# Start time for execution speed.
import time
start_time = time.clock()

# Fit the model.
from sklearn.neural_network import MLPClassifier
mlp2 = MLPClassifier(hidden_layer_sizes=(100,150,), alpha=0.05)
mlp2.fit(X_train, Y_train)

# Accuracy.
print('Accuracy:' )
print('\nScore on Training set: ', mlp2.score(X_train, Y_train))
print('\nScore on Test set: ', mlp2.score(X_test, Y_test))

# Cross validation scores.
from sklearn.model_selection import cross_val_score
print('\nCV Scores: ', cross_val_score(mlp2, X, Y, cv=5))

# End time for execution speed.
print('\nRuntime for MLP with 100 and 150 hidden layers: '+'%s seconds'% (time.clock() - start_time))

Accuracy:

Score on Training set:  0.674441793383

Score on Test set:  0.674738266505

CV Scores:  [ 0.69033636  0.70232268  0.47912029  0.60245792  0.26416103]

Runtime for MLP with 100 and 150 hidden layers: 265.68328800000006 seconds


It is not overfitting but not perfoming well on itself

In [93]:
y_mlp2 = mlp2.predict(X_test)

In [99]:
pd.crosstab(Y_test, y_mlp2, margins=True)

col_0,1.0,2.0,3.0,5.0,6.0,7.0,8.0,9.0,10.0,All
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-12.0,0,0,0,8,0,0,0,0,0,8
-2.0,3,0,0,0,0,0,0,0,0,3
1.0,238,348,4,737,87,2,0,1,0,1417
2.0,97,263,2,272,51,0,0,1,0,686
3.0,8,14,91,109,231,0,0,10,0,463
4.0,0,0,0,1,0,0,0,0,0,1
5.0,818,1603,68,14407,1730,8,0,74,6,18714
6.0,118,231,52,935,3690,4,0,85,0,5115
7.0,4,0,17,172,299,9,8,95,0,604
8.0,3,0,11,79,126,7,4,63,0,293


In [105]:
mlp2.predict_proba(X_test)

array([[  2.18403702e-011,   1.75405501e-028,   1.47698405e-001, ...,
          2.00551956e-005,   5.84379173e-005,   1.53725764e-047],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       ..., 
       [  1.09551932e-109,   1.44287166e-108,   6.39837300e-063, ...,
          2.62915174e-060,   8.32987747e-067,   3.90493409e-131],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000],
       [  0.00000000e+000,   0.00000000e+000,   0.00000000e+000, ...,
          0.00000000e+000,   0.00000000e+000,   0.00000000e+000]])

## Random Forest 

In [85]:
# Start time for execution speed.
start_time = time.clock()

# Fit the model.
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier(max_depth=10,max_features='auto', n_estimators=40)

rfc.fit(X_train, Y_train)

# Accuracy.
print('Accuracy:' )
print('\nScore on Training set: ', rfc.score(X_train, Y_train))
print('\nScore on Test set: ', rfc.score(X_test, Y_test))

# Cross validation scores.
print('\nCV Scores: ', cross_val_score(rfc, X, Y, cv=5))

# End time for execution speed.
print('\nRuntime for Random Forest Classifier: '+'%s seconds'% (time.clock() - start_time))

Accuracy:

Score on Training set:  0.840924468502

Score on Test set:  0.827896873442

CV Scores:  [ 0.68081509  0.70178873  0.72477963  0.64440289  0.09921625]

Runtime for Random Forest Classifier: 10.702404000000001 seconds


It is not overfitting but not perfoming well on itself

In [100]:
y_rfc = rfc.predict(X_test)

In [107]:
pd.crosstab(Y_test, y_rfc, margins=True)

col_0,-2.0,1.0,2.0,3.0,5.0,6.0,7.0,8.0,9.0,10.0,All
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-12.0,0,0,8,0,0,0,0,0,0,0,8
-2.0,1,0,0,0,2,0,0,0,0,0,3
1.0,0,576,24,18,797,0,2,0,0,0,1417
2.0,0,95,249,0,330,12,0,0,0,0,686
3.0,0,3,0,247,147,51,15,0,0,0,463
4.0,0,0,0,0,1,0,0,0,0,0,1
5.0,0,18,1,46,18164,459,23,0,0,3,18714
6.0,0,4,7,0,1476,3613,6,4,5,0,5115
7.0,0,0,0,31,217,192,152,1,11,0,604
8.0,0,0,0,0,102,108,23,43,17,0,293


In [102]:
rfc.predict_proba(X_test)

array([[  2.49352228e-05,   0.00000000e+00,   3.24940330e-02, ...,
          2.16087875e-04,   1.47806063e-03,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   1.48296820e-03, ...,
          0.00000000e+00,   6.97544643e-06,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   6.00075978e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   1.90074400e-03, ...,
          0.00000000e+00,   6.97544643e-06,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.59125804e-05,   0.00000000e+00,   3.75487747e-03, ...,
          1.96309384e-05,   4.44370330e-04,   0.00000000e+00]])

## Gradient Boosting

In [90]:
# Start time for execution speed.
start_time = time.clock()

# Fit the model.
from sklearn import ensemble
params = {'n_estimators': 50,
          'max_depth': 5,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, Y_train)

# Accuracy.
print('Accuracy:' )
print('\nScore on Training set: ', clf.score(X_train, Y_train))
print('\nScore on Test set: ', clf.score(X_test, Y_test))

# Cross validation scores.
print('\nCV Scores: ', cross_val_score(clf, X, Y, cv=5))

# End time for execution speed.
print('\nRuntime: '+'%s seconds'% (time.clock() - start_time))

Accuracy:

Score on Training set:  0.897724440013

Score on Test set:  0.875115732498

CV Scores:  [ 0.70679836  0.73863131  0.63814442  0.55597115  0.11319914]

Runtime: 571.281356 seconds


It is not overfitting but not perfoming well on itself

In [103]:
y_clf = clf.predict(X_test)

In [108]:
pd.crosstab(Y_test, y_clf, margins=True)

col_0,-12.0,-2.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,All
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-12.0,8,0,0,0,0,0,0,0,0,0,0,0,8
-2.0,0,1,2,0,0,0,0,0,0,0,0,0,3
1.0,1,0,1151,48,16,0,197,2,2,0,0,0,1417
2.0,0,0,100,482,0,0,88,16,0,0,0,0,686
3.0,0,0,14,4,390,0,41,7,7,0,0,0,463
4.0,0,0,0,0,0,1,0,0,0,0,0,0,1
5.0,0,0,148,31,75,1,17759,625,28,10,37,0,18714
6.0,0,0,18,7,3,0,843,4160,30,11,43,0,5115
7.0,0,0,0,0,46,0,109,185,244,5,15,0,604
8.0,0,0,0,0,0,0,54,119,29,65,26,0,293


In [104]:
clf.predict_proba(X_test)

array([[  8.20227949e-05,   8.20234258e-05,   1.18997878e-03, ...,
          4.34155854e-04,   9.34589525e-04,   8.25391471e-05],
       [  8.29245377e-05,   8.29251756e-05,   4.88690954e-04, ...,
          4.96124316e-04,   8.78733667e-04,   8.34465667e-05],
       [  8.17482011e-05,   8.17488300e-05,   5.58089475e-04, ...,
          4.89086481e-04,   8.66743591e-04,   8.22628248e-05],
       ..., 
       [  8.29257573e-05,   8.29263952e-05,   5.27942658e-04, ...,
          4.96131612e-04,   8.69049099e-04,   8.34477939e-05],
       [  8.39679286e-05,   8.39685745e-05,   3.83721858e-04, ...,
          5.94626997e-04,   8.79970893e-04,   8.44965259e-05],
       [  8.28935861e-05,   8.28942237e-05,   5.75646871e-04, ...,
          5.87258407e-04,   8.95394383e-04,   8.34154202e-05]])