# Random Forest Classifier for Rothko Artwork

In [39]:
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Load the Rothko art metrics

In [40]:
# Read the pre-computed painting metrics. `DataFrame.from_csv` was deprecated and
# later removed from pandas; `read_csv(..., index_col=0)` is the supported
# equivalent and keeps the first CSV column as the row index.
data = pd.read_csv('../../data/data.csv', index_col=0)
# Keep only the year (binned into the target further down) and the five metrics
# that are used as model features.
data = data[['year','shannon_entropy','mean_color_r','luminance','contrast', 'contour']]
data.head()

Unnamed: 0,year,shannon_entropy,mean_color_r,luminance,contrast,contour
0,1946,6.768127,195.046332,181.738624,0.827169,5.6e-05
1,1953,6.937948,228.983463,161.723046,0.587802,0.0
2,1944,6.519651,217.905564,214.564295,0.853499,0.003758
3,1951,6.451747,191.229599,184.080065,0.706196,6.3e-05
4,1949,5.738259,230.154804,224.799111,0.658131,0.0


In [41]:
# Bucket every painting's year into an interval; the edges below define
# five consecutive bins and the interval label is stored in `year_bin`.
bins = [1935, 1940, 1947, 1950, 1968, 1971]
data = data.assign(year_bin=pd.cut(data['year'], bins=bins))
data.head()

Unnamed: 0,year,shannon_entropy,mean_color_r,luminance,contrast,contour,year_bin
0,1946,6.768127,195.046332,181.738624,0.827169,5.6e-05,"(1940, 1947]"
1,1953,6.937948,228.983463,161.723046,0.587802,0.0,"(1950, 1968]"
2,1944,6.519651,217.905564,214.564295,0.853499,0.003758,"(1940, 1947]"
3,1951,6.451747,191.229599,184.080065,0.706196,6.3e-05,"(1950, 1968]"
4,1949,5.738259,230.154804,224.799111,0.658131,0.0,"(1947, 1950]"


In [42]:
# Separate the label from the predictors, then partition both into train/test.
# The year interval is cast to a plain string so it serves as the class label.
target = data['year_bin'].astype(str)
features = data[['shannon_entropy','mean_color_r','luminance','contrast', 'contour']]
(features_train, features_test,
 target_train, target_test) = train_test_split(features, target, random_state=41)

In [43]:
# Show the feature names in the column order the model is trained on
list(features.columns)

['shannon_entropy', 'mean_color_r', 'luminance', 'contrast', 'contour']

## Create the Random Forest model and train it

In [44]:
# Create a random forest classifier with 200 trees.
# random_state is fixed so that the fitted trees, and therefore the scores and
# feature importances shown below, are reproducible on a fresh kernel
# (the train/test split is already seeded with the same value).
rf = RandomForestClassifier(n_estimators=200, random_state=41)
rf.fit(features_train, target_train)
# Accuracy on the *training* data; compare it with the test-set score further
# down before drawing conclusions about generalisation.
rf.score(features_train, target_train)

1.0

In [45]:
# The fitted forest exposes one importance weight per feature, in the same
# order as the training columns.
importances = rf.feature_importances_
importances

array([0.28051883, 0.1908065 , 0.19000446, 0.17473661, 0.1639336 ])

In [46]:
# Pair each importance with its feature name and list them from most to
# least important.
importance_name_pairs = zip(rf.feature_importances_, features.columns.tolist())
sorted(importance_name_pairs, reverse=True)

[(0.28051882772466064, 'shannon_entropy'),
 (0.19080650301747532, 'mean_color_r'),
 (0.19000445833529497, 'luminance'),
 (0.17473661214568456, 'contrast'),
 (0.16393359877688443, 'contour')]

## Predict using the model

In [47]:
# Predict a year-bin label for every row of the test features
predicted = rf.predict(X=features_test)

In [48]:
# Mean accuracy of the forest on the test features and labels
rf.score(X=features_test, y=target_test)

0.8333333333333334

In [49]:
# Cross-check the test score: compute accuracy directly from the predictions.
# accuracy_score is imported explicitly rather than reached through
# `sklearn.metrics`, which is only available as an attribute if some other
# import happened to load that submodule.
accuracy_score(target_test, predicted)

0.8333333333333334

In [50]:
# Save the model to a file.
# The context manager guarantees the file is flushed and closed once the dump
# finishes (a bare open() handle would stay open until garbage collection).
filename = "RothkoRandomForestModel.pkl"
with open(filename, "wb") as model_file:
    pickle.dump(rf, model_file)

In [51]:
# Load the model back from disk, closing the file handle when done.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
with open(filename, "rb") as model_file:
    loaded_rf = pickle.load(model_file)

In [52]:
# Score the reloaded model on the same test data as the original model
loaded_rf.score(X=features_test, y=target_test)

0.8333333333333334