REF: https://chrisalbon.com/machine-learning/random_forest_classifier_example_scikit.html

# Preliminaries

In [1]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Seed NumPy's global random number generator so the random draws made
# later in the notebook are reproducible from run to run
np.random.seed(seed=0)

# Load Data & Pre-process

In [3]:
# Read the raw exported data from the CSV file in the working directory
# NOTE(review): the column names displayed for this frame differ from the
# ones later cells rely on (e.g. `deck_rating`) -- presumably a rename or
# column-selection step is missing from the notebook; confirm before
# relying on Restart & Run All.
data = pd.read_csv(filepath_or_buffer='./nation_data.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,1 - State Name,8 - Structure Number,22 - Owner,27 - Year Built,29 - Average Daily Traffic,34 - Skew,43A - Main Span Materials,45 - Number Of Main Spans,48 - Length Of Largest Span(m),52 - Deck Width(m),58 - Deck
0,1 - Alabama,203,2 - County Highway Agency,1922,50,0,3 - Steel,1,9.8,4.1,5 - FAIR CONDITION
1,1 - Alabama,206,2 - County Highway Agency,1922,70,0,3 - Steel,1,11.9,5.5,5 - FAIR CONDITION
2,1 - Alabama,233,2 - County Highway Agency,1924,505,0,3 - Steel,1,7.0,7.0,5 - FAIR CONDITION
3,1 - Alabama,258,2 - County Highway Agency,1925,45,0,3 - Steel,1,12.8,4.5,6 - SATISFACTORY CONDITION
4,1 - Alabama,317,2 - County Highway Agency,1925,115,0,3 - Steel,1,18.3,4.0,6 - SATISFACTORY CONDITION


In [3]:
# Encode every distinct deck rating label as an integer code (codes are
# handed out in order of first appearance) and keep them in a new column
rating_codes = pd.factorize(data['deck_rating'])[0]
data['deck_rating2'] = rating_codes

# Preview the first five rows
data.head(n=5)

Unnamed: 0,year_built,adt,skew,span_length,deck_width,deck_rating,deck_rating2
0,1970,2081,15,39.6,18.1,6 - SATISFACTORY CONDITION,0
1,1941,11268,12,11.6,15.4,5 - FAIR CONDITION,1
2,1930,8834,0,9.4,13.4,5 - FAIR CONDITION,1
3,1929,15197,0,9.8,10.2,5 - FAIR CONDITION,1
4,1934,5861,0,11.3,10.3,6 - SATISFACTORY CONDITION,0


# Create Training And Test Data

In [4]:
# Quick-and-dirty random split: draw one uniform number between 0 and 1 for
# each row and flag the row as training data when the draw is <= 0.75
# (roughly three quarters of the rows). Rows flagged False become test data.
random_draws = np.random.uniform(low=0, high=1, size=len(data))
data['is_train'] = random_draws <= 0.75

# Preview the first five rows
data.head(n=5)

Unnamed: 0,year_built,adt,skew,span_length,deck_width,deck_rating,deck_rating2,is_train
0,1970,2081,15,39.6,18.1,6 - SATISFACTORY CONDITION,0,True
1,1941,11268,12,11.6,15.4,5 - FAIR CONDITION,1,True
2,1930,8834,0,9.4,13.4,5 - FAIR CONDITION,1,True
3,1929,15197,0,9.8,10.2,5 - FAIR CONDITION,1,True
4,1934,5861,0,11.3,10.3,6 - SATISFACTORY CONDITION,0,True


In [5]:
# Use the boolean flag column as a mask: flagged rows form the training
# frame, the unflagged rows form the test frame
train_mask = data['is_train']
train = data[train_mask]
test = data[~train_mask]

In [6]:
# Report how many rows ended up in each of the two splits
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 919
Number of observations in the test data: 303


# Train the Random Forest Classifier

In [46]:
# Feature columns: every column except the target and the helper columns
# added in earlier cells. Dropping them by name (instead of slicing off the
# last three positions) keeps the selection correct if columns are reordered,
# and raises a KeyError if an expected column is missing.
features = data.columns.drop(['deck_rating', 'deck_rating2', 'is_train'])

# Training targets: reuse the integer codes that were computed on the FULL
# dataset. Re-factorizing only the training rows numbers the classes by their
# order of first appearance in `train`, which is not guaranteed to match the
# order of `data.deck_rating.unique()` that is used later to translate the
# predicted codes back into rating labels.
y = train['deck_rating2'].values

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    max_features='sqrt',
    n_estimators=20,
    min_samples_leaf=10,
    max_depth=25,
)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the integer-coded deck ratings)
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [47]:
# Display the column names selected as model features
features

Index(['year_built', 'adt', 'skew', 'span_length', 'deck_width'], dtype='object')

# Apply Classifier To Test Data

In [48]:
# Predict an integer class code for every row of the held-out test frame
# (rows that were not used to fit the classifier)
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 2, 0, 2, 1, 0, 2,
       0, 2, 2, 2, 2, 2, 1, 0, 2, 1, 1, 0, 2, 0, 0, 0, 6, 0, 0, 0, 1, 0, 1,
       2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 3,
       2, 0, 2, 2, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 6, 0, 0, 0,
       0, 1, 0, 1, 2, 3, 1, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 1, 2,
       1, 2, 2, 2, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2, 0, 0, 0, 2, 2,
       1, 2, 2, 0, 1, 0, 0, 0, 1, 0, 2, 2, 2, 0, 1, 2, 2, 0, 2, 2, 1, 0, 0,
       0, 2, 0, 1, 2, 2, 2, 2, 2, 6, 0, 2, 2, 1, 3, 2, 0, 0, 2, 2, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 0, 2, 1, 0, 2, 0, 1, 0, 0, 0,
       0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 2, 2, 3, 3, 0, 2, 3,
       3, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 0, 2, 6, 0, 3, 3, 6, 0, 6, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0,
       0, 0,

In [49]:
# View the predicted class probabilities of the first 5 test observations
clf.predict_proba(X=test[features])[:5]

array([[ 0.64569686,  0.15825511,  0.10684228,  0.03446882,  0.05473692,
         0.        ,  0.        ],
       [ 0.45379151,  0.17792219,  0.26109465,  0.05807995,  0.04725985,
         0.        ,  0.00185185],
       [ 0.61997176,  0.20298896,  0.14527697,  0.01350909,  0.01825322,
         0.        ,  0.        ],
       [ 0.36529414,  0.24672214,  0.28996243,  0.06056075,  0.03746053,
         0.        ,  0.        ],
       [ 0.59318924,  0.1648513 ,  0.18046881,  0.03262533,  0.02886533,
         0.        ,  0.        ]])

# Evaluate Classifier

In [50]:
# Translate each predicted integer code back into its deck rating label.
# The lookup array holds the distinct labels in order of first appearance in
# the full dataset.
# NOTE(review): this assumes the classifier's integer codes were assigned in
# that same order -- confirm against how the training targets were encoded.
target_names = data['deck_rating'].unique()
predicted_codes = clf.predict(test[features])
preds = target_names[predicted_codes]

In [51]:
# View the PREDICTED deck ratings for the first ten test observations
preds[:10]

array(['6 - SATISFACTORY CONDITION', '6 - SATISFACTORY CONDITION',
       '6 - SATISFACTORY CONDITION', '6 - SATISFACTORY CONDITION',
       '6 - SATISFACTORY CONDITION', '5 - FAIR CONDITION',
       '6 - SATISFACTORY CONDITION', '7 - GOOD CONDITION',
       '5 - FAIR CONDITION', '5 - FAIR CONDITION'], dtype=object)

In [52]:
# View the ACTUAL deck ratings for the first ten test observations
test['deck_rating'].iloc[:10]

7             5 - FAIR CONDITION
8             7 - GOOD CONDITION
10            7 - GOOD CONDITION
13    6 - SATISFACTORY CONDITION
17            7 - GOOD CONDITION
18            5 - FAIR CONDITION
19    6 - SATISFACTORY CONDITION
20    6 - SATISFACTORY CONDITION
21            5 - FAIR CONDITION
23            5 - FAIR CONDITION
Name: deck_rating, dtype: object

# Create a confusion matrix

In [53]:
# Cross-tabulate the actual ratings (rows) against the predicted ratings
# (columns) for the test frame
pd.crosstab(
    index=test['deck_rating'],
    columns=preds,
    rownames=['Actual Ratings'],
    colnames=['Predicted Ratings'],
)

Predicted Ratings,5 - FAIR CONDITION,6 - SATISFACTORY CONDITION,7 - GOOD CONDITION,8 - VERY GOOD CONDITION,9 - EXCELLENT CONDITION
Actual Ratings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3 - SERIOUS CONDITION,0,3,0,0,0
4 - POOR CONDITION,4,7,1,0,0
5 - FAIR CONDITION,19,24,6,0,0
6 - SATISFACTORY CONDITION,15,57,27,0,0
7 - GOOD CONDITION,8,32,46,2,0
8 - VERY GOOD CONDITION,2,7,20,5,3
9 - EXCELLENT CONDITION,0,3,6,3,3


In [54]:
# Count how many test rows carry each actual deck rating
test['deck_rating'].value_counts()

6 - SATISFACTORY CONDITION    99
7 - GOOD CONDITION            88
5 - FAIR CONDITION            49
8 - VERY GOOD CONDITION       37
9 - EXCELLENT CONDITION       15
4 - POOR CONDITION            12
3 - SERIOUS CONDITION          3
Name: deck_rating, dtype: int64

# View Feature Importance

In [55]:
# Pair each feature column name with the importance score of the fitted forest
list(zip(train[features].columns, clf.feature_importances_))

[('year_built', 0.42213028582280465),
 ('adt', 0.15177086614971821),
 ('skew', 0.064741922211611319),
 ('span_length', 0.21412423293689908),
 ('deck_width', 0.14723269287896676)]

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
# Accuracy on the rows the classifier was fitted on (integer-coded targets)
accuracy_score(y_true=y, y_pred=clf.predict(train[features]))

0.65070729053318821

In [58]:
# Accuracy on the held-out test rows, comparing rating label strings
accuracy_score(y_true=test['deck_rating'], y_pred=preds)

0.42904290429042902