In [48]:
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


In [4]:
X = pd.read_pickle('data/new_X.p')
X.shape

(8124, 117)

In [5]:
y = pd.read_pickle('data/new_y.p')
y.shape

(8124,)

In [7]:
columns_name = pd.read_pickle('data/columns.p')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

### Scoring for train set

In [21]:
cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')

array([1., 1., 1.])

In [25]:
y_train_pred = cross_val_predict(model, X_train, y_train, cv=3)
y_train_pred

array([1, 1, 1, ..., 1, 0, 1])

In [28]:
confusion_matrix(y_train, y_train_pred)

array([[3176,    0],
       [   0, 2917]])

In [31]:
precision_score(y_train, y_train_pred)

1.0

In [32]:
recall_score(y_train, y_train_pred)

1.0

In [37]:
f1_score(y_train, y_train_pred)

1.0

### Scoring for test set

In [34]:
y_test_pred = cross_val_predict(model, X_test, y_test, cv=3)
y_test_pred

array([0, 1, 0, ..., 1, 0, 1])

In [35]:
confusion_matrix(y_test, y_test_pred)

array([[1032,    0],
       [   1,  998]])

In [38]:
precision_score(y_test, y_test_pred)

1.0

In [39]:
recall_score(y_test, y_test_pred)

0.998998998998999

In [40]:
f1_score(y_test, y_test_pred)

0.99949924887331

### Features importance

In [47]:
feature_importance = dict(zip(columns_name, model.feature_importances_))

In [50]:
sorted_feature_importance = OrderedDict(sorted(
    feature_importance.items(), 
    key=lambda kv: kv[1],
    reverse=True
))
sorted_feature_importance

OrderedDict([('odor_c', 0.11797534151175547),
             ('gill-size_n', 0.07999529153508685),
             ('odor_l', 0.07390960178868597),
             ('stalk-surface-above-ring_f', 0.0637861139300347),
             ('gill-size_b', 0.0636917405176454),
             ('spore-print-color_n', 0.042725015345862406),
             ('gill-color_k', 0.03351220125808997),
             ('ring-type_n', 0.03337968206208832),
             ('stalk-surface-below-ring_f', 0.030635498809359698),
             ('bruises_f', 0.025898363598337755),
             ('stalk-surface-above-ring_k', 0.02234077022848767),
             ('ring-type_l', 0.02178072838662492),
             ('bruises_t', 0.018976445656140456),
             ('gill-spacing_w', 0.0181886475074377),
             ('stalk-shape_e', 0.01784855556712227),
             ('population_y', 0.0156571675911134),
             ('gill-spacing_c', 0.015507297769742898),
             ('stalk-root_c', 0.015485573800078442),
             ('spore-print-col