# Ensemble - Voting Classifier

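This notebook combines the models that each of us built individually (logistic regression, a decision tree and LDA) into a soft-voting `VotingClassifier`, and evaluates the ensemble with 10-fold cross-validation using NDCG@5.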

In [8]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import tree
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

# Read the preprocessed training set, then separate the target column
# ('country_destination') from the feature columns.
data = pd.read_csv('preprocessed_airbnb_train.csv')
labels = data['country_destination']
data = data.drop('country_destination', axis=1)

def folds_to_split(data, targets, train, test):
    """Slice features and targets into the train/test parts of one fold.

    Both inputs are wrapped in a ``pandas.DataFrame`` and indexed by
    position with ``.iloc``.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``
        Feature rows.
    targets : anything accepted by ``pd.DataFrame``
        Target values, one row per row of ``data``.
    train, test : sequences of int
        Positional row indices of the training and test parts.

    Returns
    -------
    list
        ``[data_tr, data_te, labels_tr, labels_te]``, each a DataFrame.
    """
    feature_frame = pd.DataFrame(data)
    target_frame = pd.DataFrame(targets)
    return [
        feature_frame.iloc[train],
        feature_frame.iloc[test],
        target_frame.iloc[train],
        target_frame.iloc[test],
    ]

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain of the ``k`` highest-scored items.

    Items are ranked by descending ``y_score``. The item at 0-based rank
    ``i`` contributes ``(2 ** relevance - 1) / log2(i + 2)``.

    Parameters
    ----------
    y_true : array-like
        Relevance of every item.
    y_score : array-like
        Predicted score of every item; higher means ranked earlier.
    k : int, default 5
        Number of top-ranked items taken into account.

    Returns
    -------
    float
        Sum of the discounted gains over the top ``k`` items.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    gains = 2 ** relevance - 1
    # Ranks 0, 1, 2, ... are discounted by log2(2), log2(3), log2(4), ...
    discounts = np.log2(np.arange(2, len(relevance) + 2))
    return (gains / discounts).sum()

def ndcg_score(te_labels, predict, k=5):
    """Mean NDCG@k for single-label, multi-class probability predictions.

    Every true label is turned into a one-hot relevance vector over the
    classes; the DCG of the predicted ranking is divided by the DCG of the
    ideal ranking, and the per-sample ratios are averaged.

    Parameters
    ----------
    te_labels : array-like of int
        True class codes, one per sample. Must be integer codes that index
        the columns of ``predict``; a 1-D sequence or an ``(n, 1)`` column
        are both accepted.
    predict : array-like, shape (n_samples, n_classes)
        Predicted score or probability of every class for every sample.
    k : int, default 5
        Ranking cut-off, forwarded to ``dcg_score``.

    Returns
    -------
    float
        Mean of the per-sample NDCG@k values (``nan`` when there are no
        samples).

    Raises
    ------
    ValueError
        If ``predict`` is not 2-D, or if the number of labels differs from
        the number of prediction rows.
    """
    predict = np.asarray(predict)
    if predict.ndim != 2:
        raise ValueError('predict must be 2-D (n_samples, n_classes), got '
                         '{} dimension(s)'.format(predict.ndim))

    true_codes = np.ravel(te_labels)
    if len(true_codes) != len(predict):
        raise ValueError('got {} labels for {} prediction rows'
                         .format(len(true_codes), len(predict)))

    # Size the one-hot matrix by the number of classes (columns of `predict`),
    # not by the number of samples: this keeps memory linear in n_samples and
    # gives every class a column even when the fold is tiny.
    n_classes = predict.shape[1]
    T = (true_codes[:, np.newaxis] == np.arange(n_classes)).astype(int)

    scores = []
    for y_true, y_score in zip(T, predict):
        best = dcg_score(y_true, y_true, k)
        if best > 0:
            actual = dcg_score(y_true, y_score, k)
            scores.append(float(actual) / float(best))
        else:
            # The label matches no class column (or k <= 0), so nothing
            # relevant can be ranked: score 0 instead of dividing by zero.
            scores.append(0.0)

    if not scores:
        return float('nan')
    return np.mean(scores)

# Scorer object wrapping ndcg_score for scikit-learn utilities that accept a
# `scoring` callable. needs_proba=True asks for class probabilities as the
# prediction input, and k=5 is passed through to ndcg_score.
# Not referenced by the cells shown below, which call ndcg_score directly.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)


In [9]:
# Drop the three 'dfb_*' columns and standardize the remaining features.
# The scaled array is wrapped in a fresh DataFrame, so the columns carry
# default integer labels rather than the original names.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split below, so held-out folds contribute to the scaling statistics.
data_nodfb = pd.DataFrame(
    preprocessing.StandardScaler().fit_transform(
        data.drop(['dfb_year', 'dfb_month', 'dfb_day'], axis=1)
    )
)

In [10]:
from sklearn.ensemble import VotingClassifier

# 10-fold cross-validation of a soft-voting ensemble of logistic regression,
# a small decision tree and LDA, scored with NDCG@5 on each held-out fold.
fold_scores = []
folds = cross_validation.KFold(len(data_nodfb), shuffle=True, n_folds=10,
                               random_state=20160217)
for foldnum, (train, test) in enumerate(folds, 1):
    tr_data, te_data, tr_labels, te_labels = folds_to_split(data_nodfb, labels, train, test)

    # Fresh, unfitted base models for every fold.
    lr = LogisticRegression(solver='liblinear', penalty='l1')
    dtree = tree.DecisionTreeClassifier(criterion='gini', max_leaf_nodes=10)
    lda = LDA()
    vclf = VotingClassifier(estimators=[('lr', lr), ('dtree', dtree), ('lda', lda)], voting='soft')

    # folds_to_split returns the labels as a one-column DataFrame; flatten
    # it to the 1-D vector that fit() expects.
    vclf = vclf.fit(tr_data, tr_labels.values.ravel())
    vclf_predict = vclf.predict_proba(te_data)

    score = ndcg_score(te_labels.values, vclf_predict, k=5)
    # Single-argument call form prints the same text under Python 2 and 3.
    print('Fold : {}, Score : {}'.format(foldnum, score))
    fold_scores.append(score)

# Build the results table once (one row per fold, indexed from 1) instead of
# enlarging a DataFrame inside the loop.
results = pd.DataFrame({'voting ensemble': fold_scores},
                       index=range(1, len(fold_scores) + 1))

Fold : 1, Score : 0.809589552543
Fold : 2, Score : 0.812621984435
Fold : 3, Score : 0.811497845515
Fold : 4, Score : 0.809999930544
Fold : 5, Score : 0.811094357633
Fold : 6, Score : 0.809554808346
Fold : 7, Score : 0.809155676903
Fold : 8, Score : 0.809756483481
Fold : 9, Score : 0.808593910069
Fold : 10, Score : 0.8098782337




In [11]:
# Average each column of `results` over its rows (one row per CV fold).
results.mean()

voting ensemble    0.810174
dtype: float64