In [177]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [270]:
df = pd.read_csv('survey_responses.csv')

### 1. Load and prepare the data

In [271]:
df.columns = ['gender','household_type','state','favorite_food','exercise_regime','mode_of_travel','arrival_time','music_choice','purchase_history','age']
df.head()

Unnamed: 0,gender,household_type,state,favorite_food,exercise_regime,mode_of_travel,arrival_time,music_choice,purchase_history,age
0,Male,I live with my family.,DC,Steak,Keep fit through yoga and jogging.,Walk or bike,9:00:00 AM,Heavy metal / rock,,
1,Female,I live by myself or with housemates.,DC,Mexican,Keep fit through jogging or gym.,Walk or bike,7:30:00 AM,Classical music,Theater tickets,35-44
2,Male,I live by myself or with housemates.,DC,Home cooking,Keep fit through jogging or gym.,Train,9:30:00 AM,Classical music,Theater tickets,25-34
3,Male,I live with my family.,DC,Home cooking,Keep fit through jogging or gym.,Walk or bike,9:00:00 AM,Classical music,"Gardening or home improvement gear, Bluetooth ...",35-44
4,Female,I live by myself or with housemates.,DC,Home cooking,Keep fit through jogging or gym.,Walk or bike,8:30:00 AM,Classical music,"Gardening or home improvement gear, Bluetooth ...",25-34


In [272]:
df.purchase_history = df.purchase_history.astype(str)
df.purchase_history = [cell.split(',') for cell in df.purchase_history]

In [273]:
items = ['Theater tickets','Yoga mat','Bluetooth speaker / headphones',
             'Gardening or home improvement gear',' Presents for my kids']

In [274]:
for item in items:
    df[item] = [item in cell for cell in df.purchase_history]
    df[item] = df[item].map({False:0,True:1})

### 2. Create a machine-readable set of features
This includes transforming categorical variables into dummies and taking the mid-point of the age ranges.

In [275]:
X = pd.DataFrame({'age': df.age.map({'25-34':29.5,'35-44':39})})

In [276]:
X.age.fillna(X.age.mean(),inplace=True)

In [277]:
# Transform the following to dummies: gender, household_type, state, food, exercise
for feature in ['gender','household_type','state','favorite_food','exercise_regime']:
    a = pd.get_dummies(df[feature])
    X = pd.concat([X,a], axis=1)

In [278]:
X = pd.concat([X,df[items]],axis=1)
X.head()

Unnamed: 0,age,Female,Male,I live by myself or with housemates.,I live with my family.,DC,Maryland,Virginia,Home cooking,Mexican,...,Vegetarian,I play team sports / run races.,Keep fit through jogging or gym.,Keep fit through yoga and jogging.,Netflix is my most strenuous exercise.,Theater tickets,Yoga mat,Bluetooth speaker / headphones,Gardening or home improvement gear,Presents for my kids
0,34.25,0,1,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,39.0,1,0,1,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
2,29.5,0,1,1,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
3,39.0,0,1,0,1,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,29.5,1,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [279]:
y_transport = df.mode_of_travel.map({'Train':0,'Car':0,'Walk or bike':1})
pd.DataFrame({'Train or car':0,'Walk or bike':1},index=["we'll denote it like this:"])

Unnamed: 0,Train or car,Walk or bike
we'll denote it like this:,0,1


In [281]:
y_music = df.music_choice.map({'Classical music':0,'Heavy metal / rock':1})
pd.DataFrame({'Classical':0,'Heavy metal / rock':1},index=["we'll denote it like this:"])

Unnamed: 0,Classical,Heavy metal / rock
we'll denote it like this:,0,1


### 3. Divide into training and test sets

In [309]:
target_variable = y_music
X_train, X_test, y_train, y_test = train_test_split(X, target_variable, test_size=0.4)

In [310]:
print("rows and columns in X_train: {}".format(X_train.shape))
print("rows and columns in y_train: {}".format(y_train.shape))
print("rows and columns in X_test: {}".format(X_test.shape))
print("rows and columns in y_test: {}".format(y_test.shape))

rows and columns in X_train: (16, 22)
rows and columns in y_train: (16,)
rows and columns in X_test: (12, 22)
rows and columns in y_test: (12,)


### 4. Build decision tree, predict for the test set

In [311]:
tree = DecisionTreeClassifier(max_depth=6)

In [312]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [318]:
# build random forest for comparison

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train,y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [319]:
# make predictions

y_pred_tree = tree.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [320]:
n_samples = len(y_pred_tree)

In [321]:
print("Decision tree predicted {} zeros and {} ones".format(n_samples-sum(y_pred_tree),sum(y_pred_tree)))
print("RF predicted {} zeros and {} ones".format(n_samples-sum(y_pred_rf),sum(y_pred_rf)))

Decision tree predicted 9 zeros and 3 ones
RF predicted 8 zeros and 4 ones


### 5. Evaluate model performance

In [322]:
n_correct_tree = sum(y_pred_tree == y_test)
accuracy_tree = n_correct_tree / n_samples * 100

n_correct_rf = sum(y_pred_rf == y_test)
accuracy_rf = n_correct_rf / n_samples * 100

print("DECISION TREE - We predicted {} right out of {} examples. That's a {:.1f} % accuracy rate.".format(
    n_correct_tree,n_samples,accuracy_tree))
print("RANDOM FOREST - We predicted {} right out of {} examples. That's a {:.1f} % accuracy rate.".format(
    n_correct_rf,n_samples,accuracy_rf))

DECISION TREE - We predicted 7 right out of 12 examples. That's a 58.3 % accuracy rate.
RANDOM FOREST - We predicted 8 right out of 12 examples. That's a 66.7 % accuracy rate.


In [299]:
pd.DataFrame(confusion_matrix(y_test, y_pred_rf), columns=['predicted 0','predicted 1'],index=['actual 0','actual 1'])

Unnamed: 0,predicted 0,predicted 1
actual 0,3,3
actual 1,0,6


### 6. Visualize graphs

In [184]:
export_graphviz(tree,'tree.dot',feature_names=X.columns,class_names=['classical','metal'],rounded=True,filled=True)

In [180]:
visualize_tree(tree,X.columns)

Index(['age', 'Female', 'Male', 'I live by myself or with housemates.',
       'I live with my family.', 'DC', 'Maryland', 'Virginia', 'Home cooking',
       'Mexican', 'Steak', 'Sushi', 'Vegetarian',
       'I play team sports / run races.', 'Keep fit through jogging or gym.',
       'Keep fit through yoga and jogging.',
       'Netflix is my most strenuous exercise.'],
      dtype='object')