# Decision Tree

This notebook tries different models on the preprocessed airbnb dataset using Decision Tree Classifier.

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import tree
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

data = pd.read_csv('preprocessed_airbnb_train.csv')
labels = data.loc[:,'country_destination']
data = data.drop(['country_destination'], axis=1)

def folds_to_split(data,targets,train,test):
    data_tr = pd.DataFrame(data).iloc[train]
    data_te = pd.DataFrame(data).iloc[test]
    labels_tr = pd.DataFrame(targets).iloc[train]
    labels_te = pd.DataFrame(targets).iloc[test]
    return [data_tr, data_te, labels_tr, labels_te]

def dcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(te_labels, predict, k):
    lb = LabelBinarizer()
    lb.fit(range(len(predict) + 1))
    T = lb.transform(te_labels)
    scores = []

    for y_true, y_score in zip(T, predict):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)  
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)


### Varying the quality of split

In [2]:
data_nodfb = data.drop(['dfb_year', 'dfb_month', 'dfb_day'], axis=1)
data_nodfb = pd.DataFrame(preprocessing.StandardScaler().fit_transform(data_nodfb))
foldnum=0
results = pd.DataFrame()
for train, test in cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                                           random_state=20160217):
    foldnum+=1
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(data_nodfb, labels, train, test)
    for criterion in ['entropy', 'gini']:
        dtree = tree.DecisionTreeClassifier(random_state=20160121, criterion=criterion)
        dtree = dtree.fit(tr_data, tr_labels.values.ravel())
        dtree_predict = dtree.predict_proba(te_data) 
        score = ndcg_score(te_labels.as_matrix(), dtree_predict, k=5)
        print 'Fold : {}, Criterion : {}, Score : {}'.format(foldnum, criterion, score )
        results.loc[foldnum, 'criterion=%s'%criterion ] = score

Fold : 1, Criterion : entropy, Score : 0.508978660528
Fold : 1, Criterion : gini, Score : 0.508286770335
Fold : 2, Criterion : entropy, Score : 0.509414322253
Fold : 2, Criterion : gini, Score : 0.511634670379
Fold : 3, Criterion : entropy, Score : 0.508166467162
Fold : 3, Criterion : gini, Score : 0.514350523465
Fold : 4, Criterion : entropy, Score : 0.510487152992
Fold : 4, Criterion : gini, Score : 0.507182109847
Fold : 5, Criterion : entropy, Score : 0.504497241285
Fold : 5, Criterion : gini, Score : 0.510397868822
Fold : 6, Criterion : entropy, Score : 0.506438441105
Fold : 6, Criterion : gini, Score : 0.507322444109
Fold : 7, Criterion : entropy, Score : 0.510874175103
Fold : 7, Criterion : gini, Score : 0.509888305434
Fold : 8, Criterion : entropy, Score : 0.515694869526
Fold : 8, Criterion : gini, Score : 0.51294124663
Fold : 9, Criterion : entropy, Score : 0.513282732152
Fold : 9, Criterion : gini, Score : 0.508767538266
Fold : 10, Criterion : entropy, Score : 0.510810460734
F

In [3]:
results.mean()

criterion=entropy    0.509864
criterion=gini       0.510245
dtype: float64

It can be seen that gini performs slightly better thatn entropy. 

### Varying the max-depth

In [4]:
results = pd.DataFrame()
foldnum=0
for train, test in cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                                           random_state=20160217):
    foldnum+=1
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(data_nodfb, labels, train, test)

    for max_depth in range(3, 8):
        dtree = tree.DecisionTreeClassifier(random_state=20160121, criterion='gini')
        dtree = dtree.fit(tr_data, tr_labels.values.ravel())
        dtree_predict = dtree.predict_proba(te_data) 
        score = ndcg_score(te_labels.as_matrix(), dtree_predict, k=5)
        print 'Fold : {}, Max-depth : {}, Score : {}'.format( foldnum, max_depth, score )
        results.loc[foldnum, 'max_depth=%d'%max_depth ] = score

Fold : 1, Max-depth : 3, Score : 0.508286770335
Fold : 1, Max-depth : 4, Score : 0.508286770335
Fold : 1, Max-depth : 5, Score : 0.508286770335
Fold : 1, Max-depth : 6, Score : 0.508286770335
Fold : 1, Max-depth : 7, Score : 0.508286770335
Fold : 2, Max-depth : 3, Score : 0.511634670379
Fold : 2, Max-depth : 4, Score : 0.511634670379
Fold : 2, Max-depth : 5, Score : 0.511634670379
Fold : 2, Max-depth : 6, Score : 0.511634670379
Fold : 2, Max-depth : 7, Score : 0.511634670379
Fold : 3, Max-depth : 3, Score : 0.514350523465
Fold : 3, Max-depth : 4, Score : 0.514350523465
Fold : 3, Max-depth : 5, Score : 0.514350523465
Fold : 3, Max-depth : 6, Score : 0.514350523465
Fold : 3, Max-depth : 7, Score : 0.514350523465
Fold : 4, Max-depth : 3, Score : 0.507182109847
Fold : 4, Max-depth : 4, Score : 0.507182109847
Fold : 4, Max-depth : 5, Score : 0.507182109847
Fold : 4, Max-depth : 6, Score : 0.507182109847
Fold : 4, Max-depth : 7, Score : 0.507182109847
Fold : 5, Max-depth : 3, Score : 0.51039

In [5]:
results.mean()

max_depth=3    0.510245
max_depth=4    0.510245
max_depth=5    0.510245
max_depth=6    0.510245
max_depth=7    0.510245
dtype: float64

It can be seen that there is no change in the nDCG score with the change in max_depth of the decision tree and also the nDCG score is not so good.

### Varying the min_samples_leaf

In [6]:
results = pd.DataFrame()
foldnum=0
for train, test in cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                                           random_state=20160217):
    foldnum+=1
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(data_nodfb, labels, train, test)

    for min_sample_leafs in [5, 20, 50]:
        dtree = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=min_sample_leafs)
        dtree = dtree.fit(tr_data, tr_labels.values.ravel())
        dtree_predict = dtree.predict_proba(te_data) 
        score = ndcg_score(te_labels.as_matrix(), dtree_predict, k=5)
        print 'Fold : {}, Min_S_Leafs : {}, Score : {}'.format( foldnum, min_sample_leafs, score )
        results.loc[foldnum, 'min_s_leafs=%d'%min_sample_leafs ] = score

Fold : 1, Min_S_Leafs : 5, Score : 0.693051160319
Fold : 1, Min_S_Leafs : 20, Score : 0.792821379991
Fold : 1, Min_S_Leafs : 50, Score : 0.812100920541
Fold : 2, Min_S_Leafs : 5, Score : 0.696067859086
Fold : 2, Min_S_Leafs : 20, Score : 0.7912779994
Fold : 2, Min_S_Leafs : 50, Score : 0.811791004661
Fold : 3, Min_S_Leafs : 5, Score : 0.699703513226
Fold : 3, Min_S_Leafs : 20, Score : 0.791866924402
Fold : 3, Min_S_Leafs : 50, Score : 0.809613566783
Fold : 4, Min_S_Leafs : 5, Score : 0.699046191381
Fold : 4, Min_S_Leafs : 20, Score : 0.792603471535
Fold : 4, Min_S_Leafs : 50, Score : 0.810104860706
Fold : 5, Min_S_Leafs : 5, Score : 0.695535938181
Fold : 5, Min_S_Leafs : 20, Score : 0.791506374242
Fold : 5, Min_S_Leafs : 50, Score : 0.811754519079
Fold : 6, Min_S_Leafs : 5, Score : 0.698089469128
Fold : 6, Min_S_Leafs : 20, Score : 0.792980252989
Fold : 6, Min_S_Leafs : 50, Score : 0.809155165248
Fold : 7, Min_S_Leafs : 5, Score : 0.699492869762
Fold : 7, Min_S_Leafs : 20, Score : 0.79

In [7]:
results.mean()

min_s_leafs=5     0.697631
min_s_leafs=20    0.792182
min_s_leafs=50    0.810603
dtype: float64

There is a drastic increase in the nDCG score with the increase in the min_samples_leaf and it performs the best when min_samples_leaf is 50

### Varying max_leaf_nodes

In [8]:
results = pd.DataFrame()
foldnum=0
for train, test in cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                                           random_state=20160217):
    foldnum+=1
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(data_nodfb, labels, train, test)

    for max_leaf_nodes in range(5, 12):
        dtree = tree.DecisionTreeClassifier(criterion='gini', max_leaf_nodes=max_leaf_nodes )
        dtree = dtree.fit(tr_data, tr_labels.values.ravel())
        dtree_predict = dtree.predict_proba(te_data) 
        score = ndcg_score(te_labels.as_matrix(), dtree_predict, k=5)
        print 'Fold : {}, max_leaf_nodes : {}, Score : {}'.format( foldnum, max_leaf_nodes, score )
        results.loc[foldnum, 'max_leaf_nodes=%d'%max_leaf_nodes ] = score

Fold : 1, max_leaf_nodes : 5, Score : 0.822145233814
Fold : 1, max_leaf_nodes : 6, Score : 0.822145233814
Fold : 1, max_leaf_nodes : 7, Score : 0.822145233814
Fold : 1, max_leaf_nodes : 8, Score : 0.822145233814
Fold : 1, max_leaf_nodes : 9, Score : 0.822163356779
Fold : 1, max_leaf_nodes : 10, Score : 0.822199602709
Fold : 1, max_leaf_nodes : 11, Score : 0.822181479744
Fold : 2, max_leaf_nodes : 5, Score : 0.823966010511
Fold : 2, max_leaf_nodes : 6, Score : 0.823966010511
Fold : 2, max_leaf_nodes : 7, Score : 0.823966010511
Fold : 2, max_leaf_nodes : 8, Score : 0.823966010511
Fold : 2, max_leaf_nodes : 9, Score : 0.823966010511
Fold : 2, max_leaf_nodes : 10, Score : 0.823893515256
Fold : 2, max_leaf_nodes : 11, Score : 0.823875391442
Fold : 3, max_leaf_nodes : 5, Score : 0.821608880654
Fold : 3, max_leaf_nodes : 6, Score : 0.821608880654
Fold : 3, max_leaf_nodes : 7, Score : 0.821608880654
Fold : 3, max_leaf_nodes : 8, Score : 0.821608880654
Fold : 3, max_leaf_nodes : 9, Score : 0.82

In [9]:
results.mean()

max_leaf_nodes=5     0.822126
max_leaf_nodes=6     0.822126
max_leaf_nodes=7     0.822126
max_leaf_nodes=8     0.822126
max_leaf_nodes=9     0.822132
max_leaf_nodes=10    0.822139
max_leaf_nodes=11    0.822128
dtype: float64

The nDCG score is good when the max_leaf_nodes is set and it is the best when max_leaf_nodes=10.

### AdaBoost Classifier for Decision Tree

In [11]:
from sklearn.ensemble import AdaBoostClassifier

results = pd.DataFrame()
foldnum=0
for train, test in cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                                           random_state=20160217):
    foldnum+=1
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(data_nodfb, labels, train, test)
    
    dtree =  AdaBoostClassifier(tree.DecisionTreeClassifier(criterion='gini', max_leaf_nodes=10), 
                                random_state=20160202)
    dtree = dtree.fit(tr_data, tr_labels.values.ravel())
    dtree_predict = dtree.predict_proba(te_data) 
    score = ndcg_score(te_labels.as_matrix(), dtree_predict, k=5)
    print 'Fold : {},  Score : {}'.format( foldnum, score )
    results.loc[foldnum, 'adaboost'] = score

Fold : 1,  Score : 0.818476724739
Fold : 2,  Score : 0.820920214732
Fold : 3,  Score : 0.81920483286
Fold : 4,  Score : 0.819283118322
Fold : 5,  Score : 0.821489339445
Fold : 6,  Score : 0.818936698299
Fold : 7,  Score : 0.82089684951
Fold : 8,  Score : 0.820542404937
Fold : 9,  Score : 0.818866494592
Fold : 10,  Score : 0.817795161637


In [12]:
results.mean()

adaboost    0.819641
dtype: float64

AdaBoost with Decision tree didn't make the nDCG score better. So the best nDCG score( 0.822139 ) obtained with decision tree is when the max_leaf_nodes is set to 10.