# Random Forest Classifier

In [7]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [8]:
# Load the feature table. `DataFrame.from_csv` was removed in pandas 1.0;
# `read_csv(..., index_col=0)` likewise uses the first CSV column as the index.
data = pd.read_csv('../../data/data.csv', index_col=0)
# Keep only the target year and the five feature columns used below.
# .copy() gives an independent frame so later cells can add columns safely.
data = data[['year','shannon_entropy','mean_color_r','luminance','contrast', 'contour']].copy()
data.head()

Unnamed: 0,year,shannon_entropy,mean_color_r,luminance,contrast,contour
0,1946,6.768127,195.046332,181.738624,0.827169,5.6e-05
1,1953,6.937948,228.983463,161.723046,0.587802,0.0
2,1944,6.519651,217.905564,214.564295,0.853499,0.003758
3,1951,6.451747,191.229599,184.080065,0.706196,6.3e-05
4,1949,5.738259,230.154804,224.799111,0.658131,0.0


In [9]:
# Bucket each year into unequal-width periods. pd.cut builds right-closed
# intervals by default, e.g. 1946 falls in "(1940, 1947]".
bins = [1935, 1940, 1947, 1950, 1968, 1971]
# .assign returns a new frame instead of writing a column into a frame that was
# derived by column selection, which avoids SettingWithCopyWarning.
# NOTE(review): years outside (1935, 1971] get a missing year_bin -- confirm
# the data has none.
data = data.assign(year_bin=pd.cut(data['year'], bins))
data.head()

Unnamed: 0,year,shannon_entropy,mean_color_r,luminance,contrast,contour,year_bin
0,1946,6.768127,195.046332,181.738624,0.827169,5.6e-05,"(1940, 1947]"
1,1953,6.937948,228.983463,161.723046,0.587802,0.0,"(1950, 1968]"
2,1944,6.519651,217.905564,214.564295,0.853499,0.003758,"(1940, 1947]"
3,1951,6.451747,191.229599,184.080065,0.706196,6.3e-05,"(1950, 1968]"
4,1949,5.738259,230.154804,224.799111,0.658131,0.0,"(1947, 1950]"


In [10]:
# Separate the predictors from the label, then hold out a test set.
features = data.loc[:, ['shannon_entropy','mean_color_r','luminance','contrast', 'contour']]
# The interval bins are converted to plain strings to serve as class labels.
# NOTE(review): a missing year_bin would become the literal class 'nan' here --
# confirm every year falls inside the bins.
target = data['year_bin'].astype(str)
# Fixed random_state keeps the train/test partition the same on every run.
(
    features_train,
    features_test,
    target_train,
    target_test,
) = train_test_split(features, target, random_state=41)

In [15]:
# Show the predictor names in column order.
list(features.columns)

['shannon_entropy', 'mean_color_r', 'luminance', 'contrast', 'contour']

In [11]:
# Create a random forest classifier with 200 trees.
# random_state pins the forest's internal randomness so the scores and feature
# importances are reproducible across runs (same seed as the train/test split).
rf = RandomForestClassifier(n_estimators=200, random_state=41)
rf = rf.fit(features_train, target_train)
# Accuracy on the training rows themselves; this is an optimistic figure, so
# judge the model by its score on the held-out test set instead.
rf.score(features_train, target_train)

1.0

In [12]:
# A fitted sklearn random forest exposes one importance score per feature,
# in the same order as the columns it was trained on.
importances = rf.feature_importances_
importances

array([0.27881975, 0.18635546, 0.19413249, 0.1695234 , 0.17116889])

In [16]:
# Rank the features from most to least important. Tuples compare on the
# importance score first, so a descending sort puts the top feature first.
sorted(
    zip(rf.feature_importances_, features.columns),
    reverse=True,
)

[(0.2788197508893278, 'shannon_entropy'),
 (0.19413248939046046, 'luminance'),
 (0.18635546259048671, 'mean_color_r'),
 (0.1711688923721332, 'contour'),
 (0.16952340475759164, 'contrast')]

In [17]:
# Predict a year-bin label for every held-out row.
predicted = rf.predict(X=features_test)

In [18]:
# Accuracy on the held-out test set.
rf.score(X=features_test, y=target_test)

0.7857142857142857

In [19]:
# Cross-check the held-out accuracy with the standalone metric function;
# it should match rf.score(features_test, target_test).
# accuracy_score is imported explicitly at the top of the notebook, because a
# bare `import sklearn` does not itself load the `sklearn.metrics` submodule.
accuracy_score(target_test, predicted)

0.7857142857142857