In [446]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import time

In [35]:
# Load the iris dataset bundle, then unpack its feature matrix and class labels.
iris = load_iris()
iris_X, iris_y = iris['data'], iris['target']

In [34]:
# Read zoo.data with integer column labels 0..17. Because `names` is passed,
# pandas parses the first line of the file as data rather than as a header.
zoo = pd.read_csv('zoo.data', names=np.arange(18))
# Features: every column except 0 and 17 (column 0 is presumably a row
# identifier such as the animal name -- confirm against the dataset description).
zoo_X = zoo.drop(columns=[0, 17])
# Target: column 17.
zoo_y = zoo[17]

In [444]:
# Number of seeded train/test splits per experiment; print_outcomes reads it.
iters = 5000

def print_outcomes(string, score, time):
    """Print the summed score and the elapsed time of one experiment.

    Parameters
    ----------
    string : str
        Label placed in front of ' score:'.
    score
        Accumulated score, printed next to the module-level ``iters``.
    time
        Elapsed seconds to report. This parameter shadows the ``time``
        module, but only inside this function body.
    """
    score_label = string + ' score:'
    out_of = f'/ {iters}'
    print(score_label, score, out_of)
    print('time taken:', time, 'seconds\n')
    
# Compare the 'gini' and 'entropy' split criteria on the zoo and iris datasets.
# For every (criterion, dataset) pair the test accuracy is summed over `iters`
# train/test splits seeded 0..iters-1, so each configuration is scored on
# exactly the same 90/10 splits.
# NOTE: the estimator is created without random_state, so totals may still
# differ slightly between runs; pass random_state=... to
# DecisionTreeClassifier if exact reproducibility is required.
for criterion in ('gini', 'entropy'):
    # A fresh estimator per criterion keeps each run independent of earlier state.
    dtc = DecisionTreeClassifier(criterion=criterion)
    for dataset_label, dataset_X, dataset_y in (('ZOO', zoo_X, zoo_y),
                                                ('IRIS', iris_X, iris_y)):
        score_sum = 0
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals (unlike wall-clock time.time()).
        t1 = time.perf_counter()
        for i in range(iters):
            X_train, X_test, y_train, y_test = train_test_split(
                dataset_X, dataset_y, test_size=0.1, random_state=i)
            dtc = dtc.fit(X_train, y_train)
            score_sum += dtc.score(X_test, y_test)
        # Keep fractional seconds; truncating to int would hide short runs.
        t2 = round(time.perf_counter() - t1, 2)
        print_outcomes(f'{criterion} {dataset_label}', score_sum, t2)


gini ZOO score: 4744.545454545597 / 5000
time taken: 30 seconds

gini IRIS score: 4733.666666666804 / 5000
time taken: 7 seconds

entropy ZOO score: 4721.72727272742 / 5000
time taken: 30 seconds

entropy IRIS score: 4732.93333333347 / 5000
time taken: 7 seconds



### Outcome
After rerunning this test multiple times, Gini consistently gives a marginally higher summed accuracy on the zoo dataset (about 4745 vs. 4722 out of 5000 in the run above), with no meaningful difference on the iris dataset (about 4734 vs. 4733). 

### Conclusion
Two possible explanations (hypotheses only; neither is tested here):
1. Gini gives slightly better results on higher-dimensional datasets (zoo has 16 features, iris has 4).
2. Gini gives slightly better results on non-float data.

In [None]:
# Fewer splits for the (slower) random-forest run; print_outcomes reads this.
iters = 100

def print_outcomes(string, score, time):
    """Print the summed score and the elapsed time of one experiment.

    Parameters
    ----------
    string : str
        Label placed in front of ' score:'.
    score
        Accumulated score, printed next to the module-level ``iters``.
    time
        Elapsed seconds to report. This parameter shadows the ``time``
        module, but only inside this function body.
    """
    score_label = string + ' score:'
    out_of = f'/ {iters}'
    print(score_label, score, out_of)
    print('time taken:', time, 'seconds\n')
    
# Score a default random forest on the zoo and iris datasets: for each dataset,
# sum the test accuracy over `iters` 90/10 train/test splits seeded 0..iters-1.
# NOTE: the forest is created without random_state, so totals may differ
# between runs; pass random_state=... to RandomForestClassifier if exact
# reproducibility is required.
rfc = RandomForestClassifier()

for dataset_label, dataset_X, dataset_y in (('ZOO', zoo_X, zoo_y),
                                            ('IRIS', iris_X, iris_y)):
    score_sum = 0
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals (unlike wall-clock time.time()).
    t1 = time.perf_counter()
    for i in range(iters):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset_X, dataset_y, test_size=0.1, random_state=i)
        rfc = rfc.fit(X_train, y_train)
        score_sum += rfc.score(X_test, y_test)
    # Keep fractional seconds; truncating to int would hide short runs.
    t2 = round(time.perf_counter() - t1, 2)
    print_outcomes('RFC ' + dataset_label, score_sum, t2)


RFC ZOO score: 96.3636363636363 / 100
time taken: 19 seconds

