## Classification (Target Label Generation) using Decision Tree

Handling Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Loading Data from CSV

In [2]:
cluster_data = pd.read_csv("exported.csv", index_col=0)
cluster_data.head()

Unnamed: 0,IMG_NUM,SOURCE_DATASET,AB,AC,AD,AE,BC,BD,BE,CD,CE,DE,TARGET
0,28,A,133.454112,7.071068,105.095195,10.0,154.003247,88.617154,5.830952,9.055385,7.071068,11.661904,dense
1,42,A,62.801274,84.504438,42.059482,31.622777,152.947703,339.676317,59.3043,57.45433,385.149322,336.154726,dense
2,14,A,41.629317,29.732137,31.622777,26.172505,40.607881,35.22783,287.14108,215.520301,33.136083,368.98916,dense
3,21,A,41.868843,63.600314,69.46222,285.91782,35.22783,293.586444,368.110038,42.544095,22.472205,38.013156,dense
4,19,A,330.15148,3.162278,6.0,164.003049,169.002959,510.553621,4.472136,187.024063,4.123106,349.322201,dense


Preparing X and Y slices of data, where X is our source data and Y contains the Target Class

In [12]:
X = cluster_data.values[:, 2:-1]
Y = cluster_data.values[:, -1]

Splitting data into Train and Test

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

Setting up functions to create decision tree

In [5]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=5, min_samples_leaf=5)
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to make predictions
def prediction(X_test, clf_object):  
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred

# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):      
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print ("Accuracy : ", accuracy_score(y_test,y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))

In [7]:
clf_gini = train_using_gini(X_train, X_test, y_train)

print(X_test)
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)

[[615.0292675962665 273.99452549275503 79.1012010022604 103.0194156457898
  197.2536438193221 423.70980635335786 773.1474632953276
  74.06078584514209 423.0011820314454]
 [324.2221460665511 51.0881590977792 649.8099722226491 50.93132631298737
  367.78390394360656 541.052677657176 122.06555615733704 486.231426380484
  242.26638231500465]
 [41.86884283091664 63.60031446463138 69.46221994724903 285.9178203610261
  35.22782990761707 293.58644382872995 368.11003789627904
  42.5440947723653 22.47220505424423]
 [330.15148038438355 3.1622776601683795 6.0 164.00304875214974
  169.00295855398508 510.5536210820564 4.47213595499958 187.0240626229684
  4.123105625617661]
 [411.8859065323794 899.4676203177077 620.0169352525784 67.4166151627327
  411.3404915638624 104.47966309287182 291.547594742265 514.7475109216168
  854.4430934825326]
 [186.14241859393576 71.19691004531025 160.0312469488381
  143.13629868066312 507.6662289339325 195.2434377898525
  102.59142264341595 203.5018427434995 487.06570398