In [1]:
%pylab inline
import pandas as pd
import graphviz

Populating the interactive namespace from numpy and matplotlib


In [2]:
x_cols = []
for i in range(14):
    x_cols.append('F' + str(i))
print(x_cols)

['F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13']


In [3]:
x_train = pd.read_csv("MultiFontCharInput.csv", names=x_cols, header=None)
x_test = pd.read_csv("MultiFontCharInputTestData.csv", names=x_cols, header=None)

In [4]:
import string
y_cols = list(string.ascii_uppercase)
print(y_cols)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [5]:
y_train = pd.read_csv("MultiFontCharOutput.csv", names=y_cols, header=None)
y_test = pd.read_csv("MultiFontCharOutputTestData.csv", names=y_cols, header=None)

In [7]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20 ,min_samples_leaf=3)
clf = clf.fit(x_train,y_train)
dp = clf.decision_path(x_train)
#print(dp)

In [36]:
gdata = tree.export_graphviz(clf, out_file="dt.png", filled=True, rounded=True)
graph = graphviz.Source(gdata)

In [8]:
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred=clf.predict(X)   
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n")

    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y,y_pred),"\n")
        
    if show_confusion_matrix:
        print ("Confusion matrix")
        print (metrics.confusion_matrix(y,y_pred),"\n")
        
measure_performance(x_test,y_test,clf, show_classification_report=True, show_confusion_matrix=False)

Accuracy:0.718 

Classification report
             precision    recall  f1-score   support

          0       0.60      1.00      0.75         3
          1       0.75      1.00      0.86         3
          2       0.50      0.67      0.57         3
          3       0.25      0.33      0.29         3
          4       1.00      0.67      0.80         3
          5       1.00      1.00      1.00         3
          6       0.00      0.00      0.00         3
          7       1.00      1.00      1.00         3
          8       0.75      1.00      0.86         3
          9       0.33      0.33      0.33         3
         10       1.00      0.67      0.80         3
         11       1.00      0.67      0.80         3
         12       0.75      1.00      0.86         3
         13       1.00      1.00      1.00         3
         14       0.60      1.00      0.75         3
         15       0.67      0.67      0.67         3
         16       1.00      0.67      0.80         3
      

  'precision', 'predicted', average, warn_for)


In [9]:
from sklearn.model_selection import cross_val_score, LeaveOneOut
from scipy.stats import sem

def loo_cv(X_train,y_train,clf):
    # Perform Leave-One-Out cross validation
    # We are preforming 1313 classifications!
    loo = LeaveOneOut()
    scores=np.zeros(X_train[:].shape[0])
    for train_index,test_index in loo.split(X_train):
        X_train_cv, X_test_cv= X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_cv, y_test_cv= y_train.iloc[train_index], y_train.iloc[test_index]
        clf = clf.fit(X_train_cv,y_train_cv)
        y_pred=clf.predict(X_test_cv)
        scores[test_index]=metrics.accuracy_score(y_test_cv.astype(int), y_pred.astype(int))
    print (("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores)))

In [12]:
x_data = x_train.append(x_test, ignore_index=True)
y_data = y_train.append(y_test, ignore_index=True)
loo_cv (x_data, y_data, clf)

Mean score: 0.808 (+/-0.032)


In [13]:
for feature in zip(y_cols, clf.feature_importances_):
    print(feature)

('A', 0.28841504978004556)
('B', 0.1490237067841796)
('C', 0.1680994927103273)
('D', 0.08656464120637661)
('E', 0.0)
('F', 0.1627906040730638)
('G', 0.025529237127498997)
('H', 0.02878773295460032)
('I', 0.05444562672883674)
('J', 0.0)
('K', 0.0)
('L', 0.017127128192529054)
('M', 0.019216780442542054)
('N', 0.0)


In [24]:
from sklearn.model_selection import train_test_split
x_tr, x_te, y_tr, y_te = train_test_split(x_data, y_data, test_size=0.40, random_state=33)

In [25]:
clf = clf.fit(x_tr,y_tr)
measure_performance(x_te,y_te,clf, show_classification_report=True, show_confusion_matrix=False)

Accuracy:0.730 

Classification report
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      1.00      1.00         1
          2       0.43      1.00      0.60         3
          3       1.00      0.33      0.50         3
          4       1.00      1.00      1.00         3
          5       1.00      1.00      1.00         3
          6       1.00      0.67      0.80         3
          7       0.67      1.00      0.80         2
          8       0.75      1.00      0.86         3
          9       0.40      0.67      0.50         3
         10       1.00      1.00      1.00         1
         11       1.00      1.00      1.00         3
         12       1.00      1.00      1.00         1
         13       0.67      1.00      0.80         2
         14       0.00      0.00      0.00         1
         15       0.50      1.00      0.67         2
         16       0.00      0.00      0.00         4
      

  'precision', 'predicted', average, warn_for)
