## RANDOM FOREST USING IRIS DATASET

This tutorial comes from the excellent yhat blog (http://blog.yhat.com/posts/random-forests-in-python.html).
I just added comments to help newcomers understand the author's code.

Essentially, what we want to do is use the flower characters of the plant genus Iris (i.e., sepal length, sepal width, petal length & petal width) to predict which Iris species a flower belongs to.

In [12]:
# Import necessary packages
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [13]:
# Fetch the classic Iris measurements via scikit-learn's load_iris
# (background: https://en.wikipedia.org/wiki/Iris_flower_data_set)
iris = load_iris()
# Wrap the measurement matrix in a DataFrame whose columns carry the feature names
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
# Flag every row as training (True) or testing (False): draw one uniform random
# number between 0 and 1 per row and mark the row True when the draw is <= 0.75.
# Roughly (not exactly) 75% of the rows therefore end up True, and because no
# random seed is set the split differs on every run.
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75
df['is_train'].head()

0     True
1    False
2     True
3    False
4    False
Name: is_train, dtype: bool

In [15]:
# Add a 'species' column holding the species name (specific epithet) of each flower.
# iris.target stores the species as integer codes; pd.Categorical.from_codes maps
# each code to the name at that position in iris.target_names (the 2nd argument).
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# print() with a single parenthesised argument gives the same output on Python 2 and 3
print(df['species'].head())

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]


In [16]:
# Split into two dataframes using the boolean 'is_train' column directly as a mask:
# rows flagged True form the training set, the remaining rows the testing set.
train, test = df[df['is_train']], df[~df['is_train']]

# Show the first two rows of each subset (print() form works on Python 2 and 3)
print(train.head(n=2))
print(test.head(n=2))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
2                4.7               3.2                1.3               0.2   

  is_train species  
0     True  setosa  
2     True  setosa  
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
1                4.9               3.0                1.4               0.2   
3                4.6               3.1                1.5               0.2   

  is_train species  
1    False  setosa  
3    False  setosa  


In [17]:
# `features` is an Index holding just the names of the first four columns,
# i.e. the measurement columns that serve as predictors.
features = df.columns[0:4]
features

Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
       u'petal width (cm)'],
      dtype='object')

In [18]:
# Define the (not yet fitted) random forest classifier:
#   n_estimators - number of decision trees in the forest
#   n_jobs       - number of jobs to run in parallel (for multicore CPUs)
clf = RandomForestClassifier(n_estimators=100, n_jobs=2)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
# Factorize the training species, i.e. turn the category names into numbers.
# pd.factorize returns (1) an array with one integer code per training row and
# (2) the distinct species names, where position i holds the name behind code i.
# Codes are assigned in order of first appearance in the training rows.
y, epithet = pd.factorize(train['species'])
# print() form works on Python 2 and 3
print(y)
print(epithet)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Index([u'setosa', u'versicolor', u'virginica'], dtype='object')


### Time for the model. Always remember: (1) fit the model, (2) predict with the model, (3) score the model

In [20]:
# FIT THE MODEL

# X: the training dataframe restricted to the four predictor columns
# y: the integer species codes, i.e. the response we want to predict
clf.fit(X=train[features], y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
# PREDICT THE MODEL

# Predict a species code for every test row from its feature columns.
# The classifier was trained on the codes produced by pd.factorize, so the codes
# are translated back to names with `epithet` (position i is the name for code i).
# Indexing iris.target_names instead is only correct when the species happen to
# first appear in the training rows in the same order as in iris.target_names.
preds = np.asarray(epithet)[clf.predict(test[features])]
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'virginica',
       'virginica', 'virginica', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'virginica', 'virginica', 'virginica'], 
      dtype='|S10')

In [22]:
# SCORE THE MODEL

# Confusion matrix: actual species (rows) against predicted species (columns).
# Every run yields a slightly different table, but most test flowers fall on the
# diagonal, i.e. the prediction accuracy is high.
pd.crosstab(index=test['species'], columns=preds, rownames=['actual'], colnames=['preds'])

preds,setosa,versicolor,virginica
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,12,0
virginica,0,5,10
