In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict, KFold
from sklearn.pipeline import make_pipeline
from pylab import rcParams

%matplotlib inline
# Global plot styling for the notebook.
# 'font.size' replaces the old 'text.fontsize' key, which current matplotlib
# releases reject as an invalid rc parameter (KeyError on rcParams.update).
params = {'axes.labelsize': 14, 'axes.titlesize': 14, 'font.size': 14, 'legend.fontsize': 14,
          'xtick.labelsize': 13, 'ytick.labelsize': 14}
# Default figure size as (width, height).
rcParams['figure.figsize'] = 6.5, 4

matplotlib.rcParams.update(params)



In [2]:
# Location of the training CSV. Set the VU_DM_TRAIN_CSV environment variable to
# point at the file on another machine; the fallback is the original local path.
TRAIN_CSV = os.environ.get(
    'VU_DM_TRAIN_CSV',
    'C:/Users/John/Desktop/DM/Data Mining VU data/training_set_VU_DM_2014.csv',
)
data = pd.read_csv(TRAIN_CSV)

In [206]:
# Work on the first 1000 rows only.
# NOTE(review): a fixed row cut ignores srch_id boundaries, so the last search in
# the sample may contain only part of its result list -- confirm this is acceptable.
sample = data.iloc[:1000]

## nDCG Implementation

In [207]:
# Append a float 'Score' column holding 0.0 for every row (filled in by a later cell)
sample = sample.assign(Score=np.zeros(len(sample['booking_bool'])))

In [209]:
#Score = value_of['click_bool'] + 4 * value_of['booking_bool'] (coefficients based on given example)
#Columns are selected by name instead of by position (51, 53, -1), so the result does not
#depend on the CSV column order or on 'Score' being the last column. assign() returns a new
#frame, which keeps this cell safe to re-run. The column is kept as float, matching its
#zero-initialised dtype.
sample = sample.assign(
    Score=(sample['click_bool'] + 4 * sample['booking_bool']).astype(float)
)

In [210]:
def dcg_at_k(r, k, method=0):
    """Compute the discounted cumulative gain (DCG) of the first k results.

    Relevance values are expected to be non-negative reals; binary relevance
    works as well.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list, numpy array or pandas Series) in rank order
            (first element is the first item).
        k: Number of results to consider. It is applied as the slice bound
            ``r[:k]``, so a k larger than ``len(r)`` simply uses all of r.
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain as a float; 0. when no results are left
        after truncating to k.

    Raises:
        ValueError: if ``method`` is neither 0 nor 1 (only checked when there
            is at least one result to score).
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray with a float dtype is the
    # documented replacement and also works on older NumPy versions.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            # Ranks 1 and 2 both get weight 1; rank i >= 2 is discounted by log2(i).
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            # Rank i is discounted by log2(i + 1).
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Compute the normalized discounted cumulative gain (nDCG) at k.

    The DCG of ``r`` in its given order is divided by the DCG of the same
    relevance values sorted in descending order (the ideal ranking).

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: Weighting scheme forwarded unchanged to dcg_at_k.
                If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Normalized discounted cumulative gain; 0. when the ideal DCG is zero.
    """
    ideal_order = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    # An ideal DCG of zero means there is nothing to normalise against.
    return 0.

In [212]:
#Calculate the nDCG for each srch_ID

ndcg_list = []

# One groupby pass replaces a full-frame boolean mask per search.
# sort=False keeps the searches in order of first appearance, the same order
# that sample['srch_id'].unique() produced.
for srch_id, scores in sample.groupby('srch_id', sort=False)['Score']:
    # k is the full group size, so every result of the search is scored.
    # The value is computed once and reused for both the list and the printout.
    ndcg = ndcg_at_k(scores, len(scores), method=1)
    ndcg_list.append(ndcg)
    print('Srch_ID', srch_id, ': ', ndcg)


Srch_ID 1 :  0.262649535037
Srch_ID 4 :  0.198239863171
Srch_ID 6 :  0.430676558073
Srch_ID 8 :  0.430676558073
Srch_ID 11 :  0.386852807235
Srch_ID 12 :  0.208014597677
Srch_ID 17 :  0.221064729458
Srch_ID 21 :  0.239812466568
Srch_ID 25 :  0.386852807235
Srch_ID 28 :  0.430676558073
Srch_ID 29 :  1.0
Srch_ID 30 :  0.218104291986
Srch_ID 31 :  0.630929753571
Srch_ID 36 :  0.5
Srch_ID 39 :  0.239812466568
Srch_ID 40 :  0.270238154427
Srch_ID 42 :  0.435152391237
Srch_ID 43 :  0.215338279037
Srch_ID 44 :  0.630929753571
Srch_ID 45 :  0.270238154427
Srch_ID 46 :  0.269314527558
Srch_ID 47 :  0.23137821316
Srch_ID 49 :  0.356207187108
Srch_ID 52 :  0.278942945651
Srch_ID 53 :  0.315464876786
Srch_ID 56 :  0.430676558073
Srch_ID 57 :  0.208014597677
Srch_ID 60 :  0.630929753571
Srch_ID 61 :  0.289064826318
Srch_ID 63 :  0.333333333333
Srch_ID 64 :  0.215338279037
Srch_ID 65 :  0.235408913367
Srch_ID 66 :  0.289064826318
Srch_ID 67 :  0.5
Srch_ID 71 :  0.20584683246
Srch_ID 72 :  0.24465054

In [214]:
#Mean nDCG over all per-srch_id values collected in ndcg_list.
#NOTE(review): the scores are evaluated in the row order of `sample`; no model
#predictions appear in the visible cells -- confirm which ranking this number measures.
np.mean(ndcg_list)

0.33438650982443996