In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),is_train,species
0,5.1,3.5,1.4,0.2,True,setosa
1,4.9,3.0,1.4,0.2,False,setosa
2,4.7,3.2,1.3,0.2,True,setosa
3,4.6,3.1,1.5,0.2,True,setosa
4,5.0,3.6,1.4,0.2,False,setosa


In [15]:
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),is_train,species
145,6.7,3.0,5.2,2.3,True,virginica
146,6.3,2.5,5.0,1.9,True,virginica
147,6.5,3.0,5.2,2.0,False,virginica
148,6.2,3.4,5.4,2.3,True,virginica
149,5.9,3.0,5.1,1.8,True,virginica


In [16]:
train, test = df[df['is_train']==True], df[df['is_train']==False]

train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),is_train,species
0,5.1,3.5,1.4,0.2,True,setosa
2,4.7,3.2,1.3,0.2,True,setosa
3,4.6,3.1,1.5,0.2,True,setosa
5,5.4,3.9,1.7,0.4,True,setosa
8,4.4,2.9,1.4,0.2,True,setosa


In [17]:
test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),is_train,species
1,4.9,3.0,1.4,0.2,False,setosa
4,5.0,3.6,1.4,0.2,False,setosa
6,4.6,3.4,1.4,0.3,False,setosa
7,5.0,3.4,1.5,0.2,False,setosa
13,4.3,3.0,1.1,0.1,False,setosa


In [18]:
len(train)

116

In [19]:
len(test)

34

In [20]:
features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [21]:
rf = RandomForestClassifier(n_jobs=2, random_state=0)

X = train[features]
y = pd.factorize(train['species'])[0]
X_test = test[features]

rf.fit(X, y)

predictions = iris.target_names[rf.predict(X_test)]

predictions


array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'versicolor', 'virginica', 'virginica',
       'virginica', 'versicolor', 'virginica'],
      dtype='<U10')

In [22]:
print(pd.crosstab(test['species'], predictions, rownames=['actual'], colnames=['preds']))

preds       setosa  versicolor  virginica
actual                                   
setosa          14           0          0
versicolor       0           9          0
virginica        0           3          8


In [23]:
print(list(zip(train[features], rf.feature_importances_)))

[('sepal length (cm)', 0.096406174986892954), ('sepal width (cm)', 0.017860260872876198), ('petal length (cm)', 0.35893503862449189), ('petal width (cm)', 0.52679852551573902)]
