# Decision Tree Model for Rothko Art

In [1]:
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import graphviz 
import pydotplus
import sklearn
import pickle

ModuleNotFoundError: No module named 'graphviz'

## Load art data

In [73]:
data = pd.DataFrame.from_csv('../../data/data.csv')

In [74]:
data = data[['year','shannon_entropy','mean_color_r','luminance','contrast', 'contour']]

In [103]:
# Bin the data into appropriate art timelines for the artist
bins = [1935, 1940, 1947, 1950, 1968, 1971]
data['year_bin']=pd.cut(data['year'], bins)

In [104]:
data.head()

Unnamed: 0,year,shannon_entropy,mean_color_r,luminance,contrast,contour,year_bin
0,1946,6.768127,195.046332,181.738624,0.827169,5.6e-05,"(1940, 1947]"
1,1953,6.937948,228.983463,161.723046,0.587802,0.0,"(1950, 1968]"
2,1944,6.519651,217.905564,214.564295,0.853499,0.003758,"(1940, 1947]"
3,1951,6.451747,191.229599,184.080065,0.706196,6.3e-05,"(1950, 1968]"
4,1949,5.738259,230.154804,224.799111,0.658131,0.0,"(1947, 1950]"


## Create the model

In [105]:
# Create the Decision Tree Classifier
clf = tree.DecisionTreeClassifier()

In [106]:
# Split the columns to separate the features from the result/target data
features = data[['shannon_entropy','mean_color_r','luminance','contrast', 'contour']]
target = data['year_bin'].astype(str)

# Split the data into training and testing sets
features_train, features_test, target_train, target_test = train_test_split(features, target,
                                                                            random_state=41)


In [107]:
# Train the model
clf.fit(features_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [108]:
# Plot the Decicion Tree
dot_data = tree.export_graphviz(
   clf, out_file=None,
   feature_names=['shannon_entropy','mean_color_r','luminance','contrast', 'contour'],  
   class_names=list(target.unique()),  
   filled=True, rounded=True,  
   special_characters=True)  

# Save the graphic representation of the tree into an image file
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("RothkoTree.png")

# Display the tree
graph = graphviz.Source(dot_data)  

In [109]:
# Score the model on the training data
clf.score(features_train, target_train)

1.0

In [110]:
features_test

Unnamed: 0,shannon_entropy,mean_color_r,luminance,contrast,contour
117,6.773625,137.823469,64.834755,0.706567,0.000392
23,7.241842,89.652005,60.1911,0.600991,0.002914
9,6.670086,103.76043,68.823123,0.604119,0.001395
83,7.539614,206.147402,180.653881,0.988787,0.001054
75,5.487007,55.405362,49.964642,0.732593,0.002435
142,6.94011,224.723732,121.017543,0.593649,1.1e-05
68,7.544367,206.556442,127.426412,0.689048,0.000143
55,6.901499,94.989966,90.130445,0.978777,0.001258
22,7.386364,187.60711,102.312891,0.690988,0.000301
99,6.050708,152.454348,55.466972,0.731065,0.001153


In [111]:
# Predict the bins for the test data
p = clf.predict(features_test)
for i in range(0, len(p)):
    print(p[i] == target_test.tolist()[i], p[i], target_test.tolist()[i])

True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
True (1947, 1950] (1947, 1950]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
False (1947, 1950] (1950, 1968]
True (1950, 1968] (1950, 1968]
False (1968, 1971] (1940, 1947]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
False (1968, 1971] (1950, 1968]
True (1950, 1968] (1950, 1968]
True (1940, 1947] (1940, 1947]
True (1947, 1950] (1947, 1950]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
False (1950, 1968] (1935, 1940]
False (1940, 1947] (1935, 1940]
False (1947, 1950] (1940, 1947]
True (1950, 1968] (1950, 1968]
True (1947, 1950] (1947, 1950]
False (1968, 1971] (1950, 1968]
True (1947, 1950] (1947, 1950]
True (1950, 1968] (1950, 1968]
True (1950, 1968] (1950, 1968]
True (1968, 1971] (1968, 1971]
True (1950, 1968] (1950, 1968]
False (1950, 1968] (1947, 1950]
True (1950, 1968] (1950, 1968]
True (1940, 1947] (1940, 1947]


In [112]:
# Score the model on the test data
clf.score(features_test, target_test)

0.7857142857142857

In [113]:
# Check the accuracy score for the predicted values
sklearn.metrics.accuracy_score(target_test,p)

0.7857142857142857

In [114]:
# Save the model to a file
filename = "RothkoDecisionTree.pkl"
pickle.dump(clf, open(filename, "wb"))

In [115]:
# load the model from disk
loaded_clf = pickle.load(open(filename, "rb"))

In [116]:
loaded_clf.score(features_test, target_test)

0.7857142857142857