### Learning2Rank

In [35]:
# Adapted from https://gist.github.com/coreylynch/4150976

import itertools
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from scipy import stats
import pickle

def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking

    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.

    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.

    Returns
    -------
    X_trans : array, shape (k, n_feaures)
        Data as pairs
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()


class RankSVM(SGDClassifier):
    """Performs pairwise ranking with an underlying SGDClassifer model

    Input should be a n-class ranking problem, this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.

    Authors: Fabian Pedregosa <fabian@fseoane.net>
             Alexandre Gramfort <alexandre.gramfort@inria.fr>

    https://gist.github.com/2071994

    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,) or (n_samples, 2)

        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def predict(self, X):
        pred = super(RankSVM, self).predict(X)
        # preds are mapped to {-1,1}
        # FIXME only works in this example!!!
        pred[pred == -1] = 0
        return pred

    def score(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


In [36]:
user_vector_LDA = pickle.load(open('user_id_question_map.pkl', 'rb'))
user_vector_LLDA = pickle.load(open('user_id_question_map_QANQA.pkl', 'rb'))
user_vector_doc2vec = pickle.load(open('user_id_question_map_doc2vec.pkl', 'rb'))
groundTruth = pickle.load(open('groundtruth.pkl', 'rb'))

In [37]:
#print user_vector_LDA

In [38]:
#print user_vector_LLDA

In [39]:
#print user_vector_doc2vec

In [40]:
print groundTruth

{1: [80, 20, 10, 116, 14, 137, 153, 116, 102, 106], 2052: [1465], 5: [107, 107, 7, 50, 3, 49, 70, 199, 137, 267, 267, 267, 267, 3, 3, 137, 382, 15], 6: [116, 14, 10, 110534], 2049: [], 2056: [48471], 2058: [981], 11: [10, 80, 102], 2060: [96277], 2062: [1465], 2064: [16575], 17: [116, 10], 2067: [7088, 16575, 16575, 501, 501, 63995, 501, 501], 25: [107, 158, 1250, 20, 482], 26: [102, 3], 2075: [], 2077: [12442, 27870, 267], 31: [107, 102, 137, 137, 501, 80871, 267], 32: [14, 102], 34: [116, 267], 2084: [], 37: [116, 10], 38: [107, 156], 2090: [267], 43: [107, 80], 48: [116, 20], 2098: [1465], 2100: [1465], 53: [137, 155, 201, 3, 116, 267], 57: [267, 27870], 59: [10, 107, 482, 482], 64: [267, 267], 66: [10], 2115: [], 2116: [97987], 2118: [981], 71: [2, 117, 156], 2120: [16575, 96277, 12442], 76: [1], 82: [116], 2131: [76613], 2133: [], 2134: [32093], 2137: [96277], 2140: [92056], 93: [267, 15], 2142: [16575, 12442, 44325], 96: [267], 2145: [1465], 98: [10, 3], 99: [10], 2151: [76613, 9

In [41]:
# The number of methods
methods = 3

# Get the complete user list
users = user_vector_LDA[user_vector_LDA.keys()[0]]

# Top K under consideration
top_k = 40

X = np.zeros((len(users), methods))
y = np.zeros((len(users)))

In [42]:
clf = RankSVM(n_iter=100, alpha=0.01, loss='hinge')
print clf

for question_ids in user_vector_LDA.keys():
    X = np.zeros((len(users), methods))
    y = np.zeros((len(users)))
    
    top_lda_result = user_vector_LDA[question_ids][:top_k]
    top_llda_result = user_vector_LLDA[question_ids][:top_k]
    top_doc2vec_result = user_vector_doc2vec[question_ids][:top_k]
    top_doc2vec_result = [p for p,q in top_doc2vec_result]
    
    # LDA
    for u in top_lda_result:
        X[users.tolist().index(u), 0] = 1

    # LLDA
    for u in top_llda_result:
        X[users.tolist().index(u), 1] = 1
    
    # Doc2Vec
    for u in top_doc2vec_result:
        X[users.tolist().index(u), 2] = 1
        
    # Ground truth
    correct_users = groundTruth[question_ids]
    if len(correct_users)==0:
        continue
    for u in correct_users:
        y[users.tolist().index(u)] = 1    
    
    # Training L2R
    print question_ids
    
    clf.fit(X, y)
    

RankSVM(alpha=0.01, average=False, class_weight=None, epsilon=0.1, eta0=0.0,
    fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
    loss='hinge', n_iter=100, n_jobs=1, penalty='l2', power_t=0.5,
    random_state=None, shuffle=True, verbose=0, warm_start=False)
1634
1638
1639
1643
1648
1651
1652
1656
1659
1661
1664
1667
1671
1675
1678
1681
1686
1687
1690
1692
1694
1696
1698
1700
1702
1706
1709
1711
1715
1716
1720
1722
1724
1727
1728
1824
1731
1733
1735
1739
1741
1745
1747
1749
1751
1752
1755
1759
1761
1763
1767
1769
1771
1774
1776
1778
1781
1782
1784
1786
1789
1794
1796
2313
1802
1806
1811
1813
2326
1815
1818
1822
2336
1827
1829
1833
1839
2352
1846
1851
1855
1857
1858
1860
1878
1880
1883
1885
1887
1892
1897
1899
1901
1903
1905
1908
1913
2367
1916
1919
1925
1926
1931
1935
1938
1941
1945
1948
1954
1955
1957
1962
1964
1968
1971
1976
1982
1984
1987
1990
1994
1997
1998
2000
2004
2007
2010
2011
2013
2017
2019
2022
2025
2026
2030
2032
2034
2037
2042


In [43]:
groundTruth[1675]

[16575]

In [44]:
def calculate_MRR_per_qn(pred, actual):
    if len(pred)==0:
        return 0
    for i, p in enumerate(pred):
        if(p in actual):
            return (1./(i+1))
    return 0

def calculate_precision_per_qn(pred, actual):
    if len(pred)==0:
        return 0
    n_common_items = len(set(pred) - (set(pred) - set(actual)))
    return (n_common_items * 1.) / len(pred)

In [45]:
mrr_s=[]
precision5_s=[]
precision10_s=[]
for question_ids in user_vector_LDA.keys():
    X = np.zeros((len(users), methods))
    y = np.zeros((len(users)))
    
    top_lda_result = user_vector_LDA[question_ids][:top_k]
    top_llda_result = user_vector_LDA[question_ids][:top_k]
    top_doc2vec_result = user_vector_doc2vec[question_ids][:top_k]
    top_doc2vec_result = [p for p,q in top_doc2vec_result]
    
    # LDA
    for u in top_lda_result:
        X[users.tolist().index(u), 0] = 1

    # LLDA
    for u in top_llda_result:
        X[users.tolist().index(u), 1] = 1

    # Doc2Vec
    for u in top_doc2vec_result:
        X[users.tolist().index(u), 2] = 1
        
    # Ground truth
    correct_users = groundTruth[question_ids]
    if len(correct_users)==0:
        continue
    for u in correct_users:
        y[users.tolist().index(u)] = 1    
    
    #Predict L2R
    print question_ids
    preds_y=clf.predict(X)
    print(preds_y)
    userids=[]
    for i,pred_y in enumerate(preds_y):
        if int(pred_y)==1:
            userids.append(users[i])
#     print(userids)
    
    if len(userids)>9:
        userids=userids[:10]
#     print(userids)   
    mrr_s.append(calculate_MRR_per_qn(userids, groundTruth[question_ids]))
    precision10_s.append(calculate_precision_per_qn(userids, groundTruth[question_ids]))
    precision5_s.append(calculate_precision_per_qn(userids[:5], groundTruth[question_ids]))

    print ('(current)Mean Reciprocal Rank: ', np.mean(mrr_s))
    print ('(current)Precision @ 5', np.mean(precision5_s))
    print ('(current)Precision @ 10', np.mean(precision10_s))
    print ('\n')
    
    

1634
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1638
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1639
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1643
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1648
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1651
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1652
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1656
[0. 0. 0. ... 0. 0. 0.]
('(current)M

('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


2313
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1802
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1806
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1811
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1813
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


2326
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


1815
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.0)


[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.01327433628318584)
('(current)Precision @ 5', 0.003539823008849558)
('(current)Precision @ 10', 0.0026548672566371685)


1968
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.013157894736842105)
('(current)Precision @ 5', 0.0035087719298245615)
('(current)Precision @ 10', 0.0026315789473684214)


1971
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.013043478260869565)
('(current)Precision @ 5', 0.0034782608695652175)
('(current)Precision @ 10', 0.0026086956521739132)


1976
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.01293103448275862)
('(current)Precision @ 5', 0.003448275862068966)
('(current)Precision @ 10', 0.0025862068965517245)


1982
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: ', 0.01282051282051282)
('(current)Precision @ 5', 0.003418803418803419)
('(current)Precision @ 10', 0.0025641025641025645)


1984
[0. 0. 0. ... 0. 0. 0.]
('(current)Mean Reciprocal Rank: