# Decision Trees

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot

### Read Data

In [2]:
# Sensor data averaged per kid.
# NOTE(review): this file is read from the working directory, while every
# other input lives under ../spsdata/ -- confirm the path is intentional.

data = pd.read_csv("new.csv")

In [3]:
# Sensor data, age and gender averaged per kid.
# ../spsdata/test.csv is the file written by the "Create File" cell further
# down, so that cell must have been run at least once before this one works.

data2 = pd.read_csv("../spsdata/test.csv")

In [4]:
# Sensor data for all rows (not averaged per kid).

data3 = pd.read_csv("../spsdata/new_all.csv")

In [5]:
# All rows: sensor data together with age and gender.
# `df` is the raw file, `clean` is the same frame without the columns listed
# below, and `data4` is `clean` with age_precise rounded to 0 decimals.

df = pd.read_csv("../spsdata/roadrunner.csv")
clean = df.drop(
    columns=[
        "index_runner",
        "side_runner",
        "square_runner",
        "python_tijd",
        "spel",
        "permutation",
        "round",
        "level",
        "times_level_played_before",
        "snelheid",
        "mabc_percentile_score",
    ]
)
data4 = clean.round({"age_precise": 0})

### Create File

In [53]:
# Collapse data4 to one row per kid (the mean of every column over that kid's
# rows) and save the result as ../spsdata/test.csv, the file that the
# "means per kid: sensordata, age, gender" cell reads into data2.
#
# A single groupby replaces growing a frame with DataFrame.append inside a
# loop: append copied the whole frame on every iteration and no longer exists
# in pandas >= 2.0. ID keeps its original dtype instead of becoming a float.
means = data4.groupby("ID", sort=False).mean().reset_index()

# Make the column order explicit instead of relying on dict ordering:
# ID first, label last, every other column alphabetically in between.
# decision_tree() slices list(data)[1:-1] and therefore relies on that layout.
middle = sorted(col for col in means.columns if col not in ("ID", "label"))
df = means.reindex(columns=["ID"] + middle + ["label"])

df.to_csv("../spsdata/test.csv", header=True, index=False)

## Decision Tree Means

In [7]:
def decision_tree(data, pdf_path="test.pdf", random_state=None):
    """Fit a decision tree on one random split of `data`, draw it, and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "ID" and a "label" column. Every other column is used
        as a feature; "label" is the prediction target.
    pdf_path : str, optional
        File the rendered tree is written to. Defaults to "test.pdf", and the
        file is overwritten on every call.
    random_state : int or None, optional
        Forwarded to both train_test_split and DecisionTreeClassifier. The
        default None keeps the split and the tree random on every call.

    Returns
    -------
    float
        Accuracy on the held-out part of the split, as a percentage.
    """
    features = data.drop(columns=["ID", "label"])
    xtrain, xtest, ytrain, ytest = train_test_split(
        features, data["label"], random_state=random_state
    )

    dtree = DecisionTreeClassifier(random_state=random_state)
    dtree.fit(xtrain, ytrain)
    acc = accuracy_score(ytest, dtree.predict(xtest))

    print("drawing tree...\n")
    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no StringIO buffer is needed.
    dot_source = tree.export_graphviz(
        dtree,
        out_file=None,
        # Take the names from the frame the tree was actually trained on.
        # Slicing list(data)[1:-1] is only right when ID happens to be the
        # first column and label the last one.
        feature_names=list(xtrain.columns),
        # NOTE(review): assumes exactly two classes that sort as 0, 1 -- confirm.
        class_names=["0", "1"],
        label="root",
    )
    graph = pydot.graph_from_dot_data(dot_source)
    graph[0].write_pdf(pdf_path)
    return acc * 100

In [42]:
# Single run on data2; the returned accuracy (in percent) is the cell output.
decision_tree(data2)

drawing tree...



58.333333333333336

In [48]:
def run(n, data):
    """Call decision_tree(data) n times and print the mean of the returned scores.

    Nothing is returned; the average is only printed.
    """
    scores = [decision_tree(data) for _ in range(n)]
    average = sum(scores) / float(n)
    print("Average accuracy is {} %".format(average))

## Decision Tree All Rows

In [22]:
def _rows_for_ids(data, ids):
    """Stack the rows of `data` for each ID in `ids`, in that order, reindexed from 0."""
    if not ids:
        # pd.concat refuses an empty list; return an empty frame with the
        # same columns instead.
        return data.iloc[0:0]
    return pd.concat([data[data["ID"] == i] for i in ids], ignore_index=True)


def split(data, ratio):
    """Split `data` into train and test parts without splitting any ID.

    The unique IDs are shuffled with np.random.shuffle, the first
    int(n_ids * ratio) of them form the test set and the rest the training
    set. All rows of one ID therefore end up on the same side.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "ID" and a "label" column.
    ratio : float
        Fraction of the IDs (not of the rows) that goes to the test set.

    Returns
    -------
    xtrain, xtest : pandas.DataFrame
        `data` without the "ID" and "label" columns.
    ytrain : pandas.Series
        "label" of the training rows.
    ytest : pandas.Series
        "label" of the test rows, indexed by "ID" so that predictions can be
        grouped per ID afterwards.
    """
    ids = list(data["ID"].unique())
    np.random.shuffle(ids)
    n_test = int(len(ids) * ratio)

    # One concat per side replaces repeated DataFrame.append calls, which
    # copied the growing frame each time and were removed in pandas 2.0.
    train = _rows_for_ids(data, ids[n_test:])
    test = _rows_for_ids(data, ids[:n_test])

    xtrain = train.drop(columns=["ID", "label"])
    ytrain = train["label"]
    xtest = test.drop(columns=["ID", "label"])
    ytest = test.set_index("ID")["label"]
    return xtrain, xtest, ytrain, ytest

#print(split(data4, 0.3))

In [51]:
def decision_tree_complex(data, ratio=0.1, out_path="../spsdata/hype.csv"):
    """Train on individual rows, then score one majority-vote prediction per ID.

    `data` is split by ID with split(data, ratio), a decision tree is fitted
    on the training rows and predicts every test row. The row predictions are
    averaged per ID and rounded, and that value is compared with the ID's
    (averaged) label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an "ID" and a "label" column; the rest are features.
    ratio : float, optional
        Fraction of the IDs used for testing. Defaults to 0.1.
    out_path : str, optional
        CSV file that receives one line per test row with the columns
        ID, label and predict. Overwritten on every call.

    Returns
    -------
    float
        Accuracy over the test IDs (not over the test rows), as a percentage.
    """
    xtrain, xtest, ytrain, ytest = split(data, ratio)

    dtree = DecisionTreeClassifier()
    dtree.fit(xtrain, ytrain)

    # ytest is indexed by ID, so reset_index() yields the ID/label table
    # directly; no need to write it to disk and read it back.
    foo = ytest.reset_index()
    foo["predict"] = dtree.predict(xtest)
    foo.to_csv(out_path, header=True, index=False)

    per_kid = foo.groupby("ID", sort=False)[["label", "predict"]].mean()
    # Round half up. The built-in round() sends an exact 0.5 tie to 1 on
    # Python 2 but to 0 on Python 3; floor(x + 0.5) gives 1 on both.
    voted = np.floor(per_kid["predict"] + 0.5)
    acc = accuracy_score(per_kid["label"], voted)
    return acc * 100

In [47]:
# Single run on data3 with the default ratio; the returned accuracy (in percent) is the cell output.
decision_tree_complex(data3)

splitting...
making tree...
training tree...
getting predictions...
calculating accuracy...



55.55555555555556

In [49]:
def run_complex(n, data):
    """Call decision_tree_complex(data) n times, printing each score and their mean.

    Each score is printed rounded to two decimals right after its run.
    Nothing is returned.
    """
    scores = []
    for _ in range(n):
        scores.append(decision_tree_complex(data))
        print("{} %".format(round(scores[-1], 2)))
    average = sum(scores) / float(n)
    print("Average accuracy is {} %".format(average))

In [54]:
# Ten independent split/train/score runs on data3; each run draws a new random split.
run_complex(10, data3)

22.22 %
66.67 %
44.44 %
66.67 %
66.67 %
55.56 %
22.22 %
33.33 %
66.67 %
44.44 %
Average accuracy is 48.8888888889 %
