<br>
# Assignment 5: Decision Trees & Ensembles
<br>

**Team:**
- Kevin Huber
- Markus Kaufmann
- Luca Rava
<br>

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import arff

from sklearn import tree
from itertools import product
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

<br>
## Exercise 2 - Decision Trees & Random Forests 

*A competition on Kaggle is concerned about the alcohol consumption of secondary school
students. Based on various information the task is simply to predict whether a student is a
frequent drinker or not. In a publication available here, the authors used decision trees in Weka
and KNIME for their analysis and reported an accuracy of 92%. However, there is no indication
in the paper that random forests have been evaluated as well. The following data description
table is borrowed from the paper cited above:*

![table1](table1.jpg)

<br>
### Exercise
<br>
***Implement your own decision tree predictor in Python and investigate the potential to
improve accuracy using random forests.***

<br>
### Import training data

In [33]:
dataset = arff.load(open('train_data.arff', 'r'))
attributes = []
for attribute in dataset['attributes']:
    attributes.append(attribute[0])
data = dataset['data']
train = pd.DataFrame(data=data, columns=attributes)
print("rows %d" % train.shape[0])
print("cols %d" % train.shape[1])
print(train.columns)
train.head()

rows 844
cols 32
Index(['school', 'sex', 'age', 'address', 'famsize', 'pstatus', 'medu', 'fedu',
       'mjob', 'fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout',
       'health', 'absences', 'g1', 'g2', 'g3', 'drinker'],
      dtype='object')


Unnamed: 0,school,sex,age,address,famsize,pstatus,medu,fedu,mjob,fjob,...,romantic,famrel,freetime,goout,health,absences,g1,g2,g3,drinker
0,GP,M,16.0,U,GT3,T,3.0,1.0,other,other,...,no,4.0,3.0,2.0,5.0,2.0,13.0,11.0,11.0,no
1,GP,F,16.0,U,GT3,A,3.0,4.0,services,other,...,no,3.0,2.0,1.0,5.0,16.0,12.0,11.0,11.0,no
2,MS,M,20.0,R,GT3,T,1.0,1.0,other,other,...,yes,4.0,4.0,3.0,4.0,12.0,8.0,11.0,10.0,no
3,GP,M,15.0,U,LE3,T,4.0,4.0,health,services,...,no,4.0,3.0,3.0,5.0,0.0,12.0,13.0,12.0,no
4,GP,M,15.0,U,LE3,T,1.0,2.0,other,at_home,...,no,4.0,3.0,2.0,5.0,0.0,14.0,13.0,14.0,no


<br>
### Import test data

In [34]:
dataset = arff.load(open('test_data.arff', 'r'))
attributes = []
for attribute in dataset['attributes']:
    attributes.append(attribute[0])
data = dataset['data']
test = pd.DataFrame(data=data, columns=attributes)
print("rows %d" % test.shape[0])
print("cols %d" % test.shape[1])
print(test.columns)
test.head()

rows 200
cols 32
Index(['school', 'sex', 'age', 'address', 'famsize', 'pstatus', 'medu', 'fedu',
       'mjob', 'fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout',
       'health', 'absences', 'g1', 'g2', 'g3', 'drinker'],
      dtype='object')


Unnamed: 0,school,sex,age,address,famsize,pstatus,medu,fedu,mjob,fjob,...,romantic,famrel,freetime,goout,health,absences,g1,g2,g3,drinker
0,GP,F,19.0,U,LE3,T,1.0,2.0,services,services,...,yes,4.0,2.0,4.0,3.0,0.0,9.0,9.0,0.0,
1,GP,M,16.0,U,GT3,T,3.0,3.0,services,other,...,yes,4.0,2.0,3.0,3.0,2.0,12.0,13.0,12.0,
2,GP,M,16.0,U,GT3,T,4.0,4.0,teacher,teacher,...,no,5.0,4.0,4.0,5.0,2.0,15.0,15.0,16.0,
3,GP,F,17.0,U,GT3,T,1.0,1.0,other,services,...,no,4.0,3.0,4.0,5.0,4.0,11.0,10.0,11.0,
4,GP,F,17.0,U,GT3,T,1.0,1.0,at_home,other,...,no,5.0,3.0,3.0,3.0,4.0,5.0,5.0,6.0,


<br>
### Normalize data

In [35]:
train = pd.get_dummies(train)
train = train.drop(['schoolsup_no', 'famsup_no', 'paid_no', 'activities_no', 'nursery_no', 'higher_no', 'internet_no', 
                    'romantic_no', 'drinker_no'], axis=1)
test = pd.get_dummies(test)
test = test.drop(['schoolsup_no', 'famsup_no', 'paid_no', 'activities_no', 'nursery_no', 'higher_no', 'internet_no', 
                    'romantic_no'], axis=1)

<br>
### Separate target and features

In [40]:
X_train = train.drop('drinker_yes', axis=1)
# X_test = test.drop('drinker_yes', axis=1)

y_train = train.drinker_yes
# y_test = test.drinker_yes

<br>
### Realise decision tree classifier

In [41]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# acc = accuracy_score(y_test, y_pred)
# acc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')