**data**: query_id F_context [ F_doc(i) ] Doc_choosen


map:

**hard_rank**: id F_context F_doc | was_choosen

**soft_rank**: id F_context F_doc | relevance


reduce:

**pointwise**:   hard_rank

**pointwise++**: hard_rank + F_doc*F_context

**pairwise**: id F_context (F_docA - F_docB) (label_A - label_B) [ for different labels ]


https://gist.github.com/fabianp/2020955

In [174]:
#!/usr/bin/env python

import sys
import numpy as np
import pandas as pd
import heapq

from itertools import izip
from sklearn.metrics import *

def _d(array, name):
    print "----", name
    for x in array: print x

## Hash trick

In [325]:

hash_lookup = {}

def hash_trick(x, namespaces, n=2**20):
    global hash_lookup
    sparse_x = []
    for row in x:
        hashed_x = {}
        for v,s in zip(row, namespaces):
            my_hash = hash(str(v)+s) % n
            hash_lookup[ my_hash ] = "%s=%s" % ( s, v ) 
            hashed_x[ my_hash ]    = 1
        sparse_x.append( defaultdict(lambda: 0, hashed_x) )
    return sparse_x

def interpret_coefs(coefs, n=None):
    # TODO
    return True


## Transformations

In [398]:
import itertools as it
from collections import defaultdict

def relevance_mapper(X, Y, scorer=None):
    scorer = scorer or ( lambda x,y: int(x[-1] == y)*1000000 )
    return map(scorer, X, Y)

def pairwise_hashed(X, Y, context=None, balanced=True):
    X_new, Y_new = [], []
    pairs = it.combinations(range(len(X)), 2)

    for i, (a,b) in enumerate(pairs):
        y  = np.sign(Y[a] - Y[b])
        if y == 0: continue
        
        x = X[a].copy()
        for k in X[b].keys(): x[k] -= 1
        
        if balanced and y != (-1)**i:
            y *= -1
            for k in x.keys(): x[k] *= -1
    
        X_new.append(x)
        Y_new.append(y)

    return X_new, Y_new

def to_vw(qid, X,Y):
    for x,y in zip(X,Y):
        sparse_feat = [ "%d:%d" % (k,v) for k,v in x.items() if v ]
        print "{} '{} |p {}".format(y, qid, " ".join(sparse_feat))

## Processing

In [405]:
# helpers

current_key     = None
current_samples = []
def process_by_key(x, y, key=None, callback=None, **callback_params):
    global current_key, current_samples

    if current_key and current_key != key:
        callback( current_key, current_samples, **callback_params) 
        current_samples = [ ]
    
    current_samples.append( (x, y) )
    current_key     = key
    

def process(qid, rows, pairwise=False):
    if qid is None: return

    X, Y = zip(*rows)
    X_hashed = hash_trick(X, header, n=hash_b)
    
    if pairwise:
        scr  = relevance_mapper(X, Y)
        X, Y = pairwise_hashed(X_hashed, scr)
    
    to_vw(qid, X_hashed, Y)

In [408]:
_file  = open('test.l2b.tsv')
header  = _file.readline().strip().split(sep)
sep     = " "
hash_b = 2**20

choosen_i = -1 
query_i   = 0

# notebook fix
current_key     = None
current_samples = []

for line in _file:
    row   = line.strip().split(sep)
    x = row[query_i+1:choosen_i]
    y = row[choosen_i]
    process_by_key(x, y, key=row[query_i], callback=process, pairwise=True)

process( current_key, current_samples, pairwise=True)

1 'b1 |p 62667:1 611565:1 909087:1
-1 'b1 |p 62667:1 31045:1 909087:1


## Evaluating

In [None]:
# to_vw -c -pairwise train.csv | vw -i train.csv --link logistic -b 25 -f vw.model
# to_vw -c -hashed test.csv    | vw -t test.csv -p | eval labels.csv

def truth_mask(truch_doc, all_docs):
    np.zeros(len(all_docs))
    all_docs[ docs.index(truth_doc) ] = 1
    return

# how far we need to go to predict the true label?
coverage_k = []

for i in data:
    if current_key == row[key]: continue
    process no pairwise
    truth  = get chosen doc
    y_true = [ int(truth == i) for i in docs_to_consider ]
    y_pred = [ predict for i in docs_to_consider ]
    k      = coverage_error([ y_true ], [ y_pred ])
    coverage_k.append(k)
    print np.mean(coverage_k < 3), np.mean(coverage_k < 5), np.mean(coverage_k < 7)


## Testing HashTrick

In [343]:
from sklearn.datasets import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.feature_extraction import *

df = pd.read_csv('car.data.csv')
x = df[df.columns[0:-1]]
y = (df.unacc == 'unacc').astype('int')

In [353]:
x_enc = DictVectorizer(sparse=False).fit_transform(x.to_dict(orient='records'))
train_i, test_i = train_test_split(range(0, len(x_enc)), test_size=.2)

lr = LogisticRegression()
lr.fit(x_enc[train_i], y[train_i])
lr.score(x_enc[test_i], y[test_i]), lr.coef_

(0.94219653179190754,
 array([[ 5.41572917, -2.16359832, -1.9361105 ,  1.17767831,  0.22744353,
         -0.07430495, -0.01479654, -2.73114337,  5.27330833, -1.22614461,
         -0.51857301,  0.17152639,  1.66306697,  0.75924602, -0.97830128,
         -0.88392402,  2.41899963,  1.22386666, -1.33869994, -1.0184942 ,
          2.44934783]]))

In [367]:
b = 3000

dfx = pd.DataFrame(hash_trick( x.values, x.columns, n=b ))
dfx = dfx.fillna(0)
print dfx.shape

lr1 = LogisticRegression()
lr1.fit(dfx.ix[train_i], y[train_i])
print lr1.score(dfx.ix[test_i], y[test_i]), lr1.coef_

(1727, 21)
0.942196531792 [[-0.07430495 -2.16359832  0.75924602 -0.97830128  0.17152639 -2.73114337
   2.44934783  1.17767831 -1.9361105  -1.0184942  -0.51857301  2.41899963
   0.22744353 -1.33869994  1.22386666 -1.22614461  1.66306697  5.41572917
  -0.88392402  5.27330833 -0.01479654]]


In [369]:
np.array(sorted(lr.coef_)) - np.array(sorted(lr1.coef_))

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])