### Imports needed for decision trees

In [81]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#imports for DT
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

#remove warnings
import warnings
warnings.filterwarnings("ignore")

## Base-DT 

From the results of the decision tree models created, the second model (DS2) makes much better predictions and performs better overall in terms of recall, precision, and F1-measure. This is most likely due to the high number of classes in DS1: it has 26 classes, whereas DS2 has only 10. The decision tree grows with much higher complexity for DS1 because of the larger number of possible leaf-node labels. Another thing to note is that the training set for DS2 (7800) has many more instances than the one for DS1 (1197), which gives the model more examples per class to learn from and so lets it make better predictions overall.

Data Set 1: 
Based on the confusion matrix, the model never confused the same pair of classes more than twice (no off-diagonal count is greater than 2). However, the average recall and average precision are fairly low (about 50%), so only about half of the model's positive predictions are correct and about half of the actual positives are never predicted.

Data Set 2:
The model predicted most of the labels correctly, except for labels 5 & 9, which had the most misclassifications. Both the average recall and the average precision were fairly high (~80%).

## Data Set 1 

In [82]:
# --- Data set 1: test split ---
# The CSV has no header row, so pandas labels the columns with integers.
test_set1 = pd.read_csv("Assig1-Dataset/test_with_label_1.csv", header=None)
# (Sanity check if needed: test_set1.shape shows (rows, columns).)

# Column 1024 holds the class label; all remaining columns are used as features.
x_test1 = test_set1.drop(columns=1024)  # test features
y_test1 = test_set1[1024]  # true labels matching the rows of x_test1

# --- Data set 1: training split ---
train_set1 = pd.read_csv("Assig1-Dataset/train_1.csv", header=None)

# Same split as for the test file: label in column 1024, features elsewhere.
x_train1 = train_set1.drop(columns=1024)  # training features
target_train1 = train_set1[1024]  # training labels (y_train)

## Training / Prediction

In [83]:
#create model for ds1
# Entropy is the split criterion. random_state is pinned so that re-running the
# notebook builds the same tree: scikit-learn permutes the features at each split,
# so ties between equally good splits are otherwise broken differently on each run.
# All other hyper-parameters keep their default values.
dtree1 = DecisionTreeClassifier(criterion="entropy", random_state=0)
dtree1 = dtree1.fit(x_train1, target_train1) #train the tree on the DS1 training data
dt_pred1 = dtree1.predict(x_test1) #predicted labels for the DS1 test data

## Plot confusion matrix

In [84]:
# Show the DS1 confusion matrix (rows: true labels, columns: predicted labels)
print(confusion_matrix(y_true=y_test1, y_pred=dt_pred1))

[[3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0]
 [0 0 1 0 

## Print Classification Report: Precision, Recall, F-1 Measure for each class

In [85]:
# Show per-class precision, recall and F1 score for the DS1 predictions
print(classification_report(y_true=y_test1, y_pred=dt_pred1))

              precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.00      0.00      0.00         2
           2       0.60      0.75      0.67         4
           3       0.33      0.33      0.33         3
           4       0.50      0.50      0.50         2
           5       0.33      0.50      0.40         2
           6       0.25      0.25      0.25         4
           7       1.00      0.33      0.50         3
           8       1.00      0.33      0.50         3
           9       0.75      0.75      0.75         4
          10       0.00      0.00      0.00         3
          11       1.00      0.75      0.86         4
          12       0.00      0.00      0.00         3
          13       0.60      0.75      0.67         4
          14       0.40      0.67      0.50         3
          15       0.50      1.00      0.67         3
          16       0.50      0.33      0.40         3
          17       1.00    

## Data Set 2

In [86]:
# --- Data set 2: test split ---
# The CSV has no header row, so pandas labels the columns with integers.
test_set2 = pd.read_csv("Assig1-Dataset/test_with_label_2.csv", header=None)

# Column 1024 holds the class label; all remaining columns are used as features.
x_test2 = test_set2.drop(columns=1024)  # test features
y_test2 = test_set2[1024]  # true labels matching the rows of x_test2

# --- Data set 2: training split ---
train_set2 = pd.read_csv("Assig1-Dataset/train_2.csv", header=None)

# Separate features and labels of the *training* data (same column layout).
x_train2 = train_set2.drop(columns=1024)  # training features
target_train2 = train_set2[1024]  # training labels (y_train)


## Training / Prediction

In [87]:
#create model for ds2
# Entropy is the split criterion. random_state is pinned so that re-running the
# notebook builds the same tree: scikit-learn permutes the features at each split,
# so ties between equally good splits are otherwise broken differently on each run.
# All other hyper-parameters keep their default values.
dtree2 = DecisionTreeClassifier(criterion="entropy", random_state=0)
dtree2 = dtree2.fit(x_train2, target_train2) #train the tree on the DS2 training data
dt_pred2 = dtree2.predict(x_test2) #predicted labels for the DS2 test data

## Plot Confusion Matrix

In [88]:
# Show the DS2 confusion matrix (rows: true labels, columns: predicted labels)
print(confusion_matrix(y_true=y_test2, y_pred=dt_pred2))

[[ 43   0   1   0   7   0   0   0   1   3]
 [  1 114   0   5   0   2   1   0   2   0]
 [  0   0   7   0   1   2   0   0   3   2]
 [  0   3   0  10   0   1   0   1   0   0]
 [  3   1   0   2  27   2   1   0   1  13]
 [  0   0   1   0   5  42   0   0   0   7]
 [  0   1   2   0   1   0   7   0   2   2]
 [  0   1   0   0   0   0   0  13   1   0]
 [  4   1   0   0   1   0   0   0  43   1]
 [  1   2   1   0   9  14   0   0   1  97]]


## Print Classification Report: Precision, Recall, F-1 Measure for each class

In [89]:
# Spacer lines, then per-class precision, recall and F1 score for the DS2 predictions
print("\n")
print(classification_report(y_true=y_test2, y_pred=dt_pred2))



              precision    recall  f1-score   support

           0       0.83      0.78      0.80        55
           1       0.93      0.91      0.92       125
           2       0.58      0.47      0.52        15
           3       0.59      0.67      0.62        15
           4       0.53      0.54      0.53        50
           5       0.67      0.76      0.71        55
           6       0.78      0.47      0.58        15
           7       0.93      0.87      0.90        15
           8       0.80      0.86      0.83        50
           9       0.78      0.78      0.78       125

    accuracy                           0.78       520
   macro avg       0.74      0.71      0.72       520
weighted avg       0.78      0.78      0.77       520



## Writing Results to CSV for both data sets

## Creating CSV file Base-DT-DS1

In [92]:
filename = './outputs/Base-DT-DS1.csv'

# Create the output directory if it is missing so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# First part of the file: one "row number,predicted label" line per test instance.
# Row numbers start at 1 and no header line is written.
Base_DT_DS1 = pd.DataFrame(data=dt_pred1)
Base_DT_DS1.index = Base_DT_DS1.index + 1
Base_DT_DS1.to_csv(filename, header=False)

# Append the evaluation summary below the predictions. The context manager closes
# the file even if a metric computation or a write raises.
with open(filename, 'a') as writer:
    writer.write("\nConfusion Matrix:\n\n")
    writer.write(str(confusion_matrix(y_test1, dt_pred1)))
    writer.write("\n\n\n")
    writer.write("\nClassification Report:\n\n")
    writer.write(str(classification_report(y_test1, dt_pred1)))

print("Output file Base-DT-DS1 created!")

Output file Base-DT-DS1 created!


## Creating CSV file Base-DT-DS2

In [91]:
filename = './outputs/Base-DT-DS2.csv'

# Create the output directory if it is missing so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# First part of the file: one "row number,predicted label" line per test instance.
# Row numbers start at 1 and no header line is written.
Base_DT_DS2 = pd.DataFrame(dt_pred2)
Base_DT_DS2.index = Base_DT_DS2.index + 1
Base_DT_DS2.to_csv(filename, header=False)

# Append the evaluation summary below the predictions. The context manager closes
# the file even if a metric computation or a write raises.
with open(filename, 'a') as writer:
    writer.write("\nConfusion Matrix:\n\n")
    writer.write(str(confusion_matrix(y_test2, dt_pred2)))
    writer.write("\n\n\n")
    writer.write("\nClassification Report:\n\n")
    writer.write(str(classification_report(y_test2, dt_pred2)))

print("Output file Base-DT-DS2 created!")

Output file Base-DT-DS2 created!
