## Imports needed for decision trees

In [41]:
import pandas as pd
import numpy as np
import csv
%matplotlib inline

# Imports for DT
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides any warnings emitted while the
# models below are being fitted (e.g. about rejected hyper-parameter values) —
# confirm that is intended, or narrow the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

## Data Set 1

In [42]:
# --- Load dataset 1 -------------------------------------------------------
# Both CSV files have no header row; column 1024 holds the label and every
# other column is treated as a feature.
test_set1 = pd.read_csv("Assig1-Dataset/test_with_label_1.csv", header=None)
#test_set1.shape  # uncomment to check (rows, columns) after loading

# Separate the labels from the features of the test data
y_test1 = test_set1[1024]               # labels for every row of x_test1
x_test1 = test_set1.drop(1024, axis=1)  # test features

train_set1 = pd.read_csv("Assig1-Dataset/train_1.csv", header=None)

# Separate the labels from the features of the training data
target_train1 = train_set1[1024]               # y_train: labels for x_train1
x_train1 = train_set1.drop(1024, axis=1)       # training features


# --- Hyper-parameter grid ---------------------------------------------------
# Every value listed here must be accepted by DecisionTreeClassifier:
#   * min_samples_split starts at 2, because an integer value of 1 is outside
#     the range the estimator allows, so candidates using it can never be
#     fitted.
#   * class_weight uses the Python object None (no re-weighting); the string
#     'none' is not a recognised option, so the "unweighted" setting was never
#     actually evaluated by the search.
parameters = {'criterion': ['gini', 'entropy'],
              'max_depth': [10, None],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
              'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4],
              'class_weight': [None, 'balanced']}

# --- Grid search over decision trees ---------------------------------------
# random_state is fixed so that re-running the notebook from a fresh kernel
# reproduces the same tree and the same reported metrics.
clf = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                   param_grid=parameters,
                   n_jobs=-1)

# fit() returns the fitted search object itself, so best_dt_1 and clf refer to
# the same object from here on.
best_dt_1 = clf.fit(x_train1, target_train1)
best_dt_pred1 = best_dt_1.predict(x_test1)  # predictions on the test features

# Print the best hyper-parameters found by the search
print("Best parameters: ")
print(best_dt_1.best_params_)

# Print the confusion matrix
print(confusion_matrix(y_test1, best_dt_pred1))
# Print the classification report
print(classification_report(y_test1, best_dt_pred1))

Best parameters: 
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 2}
[[3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0

## Creating CSV file Best-DT-DS1

In [44]:
# Write the dataset-1 predictions, followed by the evaluation summary, to one
# output file.
filename = './outputs/Best-DT-DS1.csv'

# One prediction per row; the index is shifted so that row numbers start at 1.
Best_DT_DS1 = pd.DataFrame(data=best_dt_pred1)
Best_DT_DS1.index = Best_DT_DS1.index + 1
# to_csv (re)creates the file, so re-running this cell does not keep
# appending to a previous run's output. header=False omits the column header.
Best_DT_DS1.to_csv(filename, header=False)

# Append the metrics below the predictions. The context manager guarantees
# the handle is closed even if one of the writes raises.
with open(filename, 'a') as writer:
    writer.write("\nConfusion Matrix:\n\n")
    writer.write(str(confusion_matrix(y_test1, best_dt_pred1)))
    writer.write("\n\n\n")
    writer.write("\nClassification Report:\n\n")
    writer.write(str(classification_report(y_test1, best_dt_pred1)))

print("Output file Best-DT-DS1 created!")

Output file Best-DT-DS1 created!


## Data Set 2

In [None]:
# --- Load dataset 2 -------------------------------------------------------
# Both CSV files have no header row; column 1024 holds the label and every
# other column is treated as a feature.
test_set2 = pd.read_csv("Assig1-Dataset/test_with_label_2.csv", header=None)
#test_set2.shape  # uncomment to check (rows, columns) after loading

# Separate the labels from the features of the test data.
# The features must come from test_set2 (not test_set1), otherwise the model
# is scored on dataset-1 features against dataset-2 labels.
y_test2 = test_set2[1024]               # labels for every row of x_test2
x_test2 = test_set2.drop(1024, axis=1)  # test features

train_set2 = pd.read_csv("Assig1-Dataset/train_2.csv", header=None)

# Separate the labels from the features of the training data
target_train2 = train_set2[1024]               # y_train: labels for x_train2
x_train2 = train_set2.drop(1024, axis=1)       # training features


# --- Grid search over decision trees ---------------------------------------
# A fresh search object is created for this dataset. fit() returns the search
# object itself, so refitting the dataset-1 `clf` would make best_dt_1 and
# best_dt_2 the very same object and silently replace the dataset-1 model.
# The `parameters` grid defined in the Data Set 1 cell is reused here.
# random_state is fixed so a fresh-kernel re-run reproduces the same metrics.
clf = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                   param_grid=parameters,
                   n_jobs=-1)

best_dt_2 = clf.fit(x_train2, target_train2)  # train on the dataset-2 training split
best_dt_pred2 = best_dt_2.predict(x_test2)    # predictions on the test features

# Print the best hyper-parameters found by the search
print("Best parameters: ")
print(best_dt_2.best_params_)

# Print the confusion matrix
print(confusion_matrix(y_test2, best_dt_pred2))
# Print the classification report
print(classification_report(y_test2, best_dt_pred2))

## Creating CSV file Best-DT-DS2

In [None]:
# Write the dataset-2 predictions, followed by the evaluation summary, to one
# output file.
filename = './outputs/Best-DT-DS2.csv'

# One prediction per row; the index is shifted so that row numbers start at 1.
Best_DT_DS2 = pd.DataFrame(data=best_dt_pred2)
Best_DT_DS2.index = Best_DT_DS2.index + 1
# to_csv (re)creates the file, so re-running this cell does not keep
# appending to a previous run's output. header=False omits the column header.
Best_DT_DS2.to_csv(filename, header=False)

# Append the metrics below the predictions. The context manager guarantees
# the handle is closed even if one of the writes raises.
with open(filename, 'a') as writer:
    writer.write("\nConfusion Matrix:\n\n")
    writer.write(str(confusion_matrix(y_test2, best_dt_pred2)))
    writer.write("\n\n\n")
    writer.write("\nClassification Report:\n\n")
    writer.write(str(classification_report(y_test2, best_dt_pred2)))

print("Output file Best-DT-DS2 created!")