In [34]:
import os
import json
import itertools
from argparse import Namespace
from collections import OrderedDict
from functools import partial


import torch
import numpy as np
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from tqdm.notebook import tqdm, trange
from torch.utils.data import Dataset, DataLoader


import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

import pandas as pd

# import dataset
import evaluate



## Read data

In [2]:
TRAIN_DATA = "../data/set_neg_zero_aff1.csv"
all_data = pd.read_csv(TRAIN_DATA)

# Read the data into memory
# training_data = pd.read_csv(TRAIN_DATA,nrows= 2500000)
# validation_data = pd.read_csv(TRAIN_DATA,skiprows=2500000,nrows = 1000000, header=None, names= training_data.columns)
# test_data = pd.read_csv(TRAIN_DATA,skiprows= 3500000,nrows = 1000000, header=None,names= training_data.columns)

In [5]:
# Create label column

def relevance(a):
    if a[0] == a[1] == 1:
        return 5
    elif a[0] == 1 and a[1] == 0:
        return 1
    else:
        return 0

all_data['label'] = all_data[['click_bool', 'booking_bool']].apply(relevance,axis = 1)

In [7]:
# Divide into splits 
training_data = all_data[0:100000]
validation_data = all_data[2500000:2600000]
test_data = all_data[3500000:4500000]

In [77]:
training_data.head

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,label
0,1,2013-04-04 08:32:15,12,187,-10.0,-10.0,219,893,3,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0,,0,0
1,1,2013-04-04 08:32:15,12,187,-10.0,-10.0,219,10404,4,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,,0,0
2,1,2013-04-04 08:32:15,12,187,-10.0,-10.0,219,21315,3,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0,,0,0
3,1,2013-04-04 08:32:15,12,187,-10.0,-10.0,219,27348,2,4.0,...,0.0,0.0,0.0,-1.0,0.0,5.0,0,,0,0
4,1,2013-04-04 08:32:15,12,187,-10.0,-10.0,219,29604,4,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0,,0,0


In [81]:
col_names = list(training_data.columns)
col_names.remove('click_bool')
col_names.remove('booking_bool')
col_names.remove('srch_id')
col_names.remove('date_time')
col_names.remove('gross_bookings_usd')
col_names.remove('label')

inputdf = training_data[['label'] + ['srch_id'] + col_names]

# df = df[ ['Mid'] + [ col for col in df.columns if col != 'Mid' ] ]

# print(col_names)
# print(training_data[training_data['srch_id']==1][col_names].values)
inputdf.values

array([[ 0.0e+00,  1.0e+00,  1.2e+01, ...,  0.0e+00,  0.0e+00,  0.0e+00],
       [ 0.0e+00,  1.0e+00,  1.2e+01, ...,  0.0e+00,  0.0e+00,  0.0e+00],
       [ 0.0e+00,  1.0e+00,  1.2e+01, ...,  0.0e+00,  0.0e+00,  0.0e+00],
       ...,
       [ 0.0e+00,  6.7e+03,  5.0e+00, ...,  0.0e+00,  0.0e+00,  0.0e+00],
       [ 0.0e+00,  6.7e+03,  5.0e+00, ...,  0.0e+00,  0.0e+00,  0.0e+00],
       [ 0.0e+00,  6.7e+03,  5.0e+00, ..., -1.0e+00,  0.0e+00,  1.4e+01]])

In [9]:
def read_data(data, column_names): 
    """
    This function reads the data set. 
    Input: 
        data: pandas dataframe 
        column_names: names of the values you want to use         
    """
    features = []
    labels = []
    queries = data.srch_id.unique()
    for query in queries: 
        features_i = data[data['srch_id']==query][column_names].values
        features_i = torch.FloatTensor(features_i)
        features.append(features_i)
        labels_i = torch.FloatTensor(data['label'].values)
        labels.append(labels_i)
        
    return queries, features, labels       
    

In [71]:
queries, features, train_labels = read_data(training_data, col_names)

In [66]:
for q,l in zip(queries, labels): 
    print(q)
    print(l)
    break

1
tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [15]:
validation_data = all_data[2500000:2600000]
q_test, f_test, l_test = read_data(validation_data, col_names)

In [42]:
f_test[0].shape

torch.Size([20, 49])

In [None]:
class QueryGroupedLTRData(Dataset):
    def __init__(self, data, split):
        self.split = {
            "train": data.train,
            "validation": data.validation,
            "test": data.test
        }.get(split)
        assert self.split is not None, "Invalid split!"
    
    def __len__(self):
        return self.split.num_queries()

    def __getitem__(self, q_i):
        feature = torch.FloatTensor(self.split.query_feat(q_i))
        labels = torch.FloatTensor(self.split.query_labels(q_i))
        return q_i, feature, labels

# the return types are different from what pytorch expects, 
# so we will define a custom collate function which takes in
# a batch and returns tensors (qids, features, labels) 
def qg_collate_fn(batch):
    
    qids = []
    features = []
    labels = []
    
    for (q, f, l) in batch:
        qids.append(q)
        features.append(f)
        labels.append(l)
    
    return qids, features, labels
    
    
## example - NOTE the collate_fn argument!
train_dl = DataLoader(QueryGroupedLTRData(data, "train"), batch_size=1, shuffle=True, collate_fn=qg_collate_fn)
# this is how you would use it to quickly iterate over the train/val/test sets 
for (qids, x, y) in train_dl:
    # different from the previous data loader, qids, x and y aren't tensors, but lists!
    for q_i, features_i, labels_i in zip(qids, x, y):
        print(f"Query {q_i} has {len(features_i)} query-document pairs")
        print(f"Shape of features for Query {q_i}: {features_i.size()}")
        break
    break
        

## LambdaRank

In [47]:
# CHANGE: THE NUMBER OF FEATURES 501 TO WHATEVER WE HAVE

class NeuralModule(nn.Module):
    def __init__(self, output_dim):
        """
        Initializes the Pointwise neural network. 
        Input: output_dim: The dimension of the output layer. In this assignment, 
                it is either 1 (regression) or 5 (classification)
        """
        
        super(NeuralModule, self).__init__()
        self.output_dim = output_dim    
        self.layer = nn.Sequential(nn.Linear(49, 16), 
                                   nn.ReLU(), 
                                   nn.Linear(16, self.output_dim))
    
    def forward(self, x):
        """
        Takes in an input feature vector (of size 501) and produces the (regression/classification) output 
        Input: x: a [N, 501] tensor
        Output: a [N, output_dim] tensor
        """
        
        result = self.layer(x)
        return result
        

In [26]:
def clf_pred(inp, net):
    """
    The output of the classifier network produces a [Nx5] output corresponding to 
    the relevance labels (each row does *not* add to 1!)
    This function should predict the most probable relevance from the relevance labels
    
    inp: The input [N, num_features]
    net: the neural network, takes in [N, num_features] and outputs [N, 5]
    
    return: a [N, 1] (long) tensor, the relevance labels
    """

    output = net(inp)
    
    prediction = output.argmax(dim=1).detach()
    
    return prediction
    

In [27]:
# TODO: Implement this! (30 points)
def compute_lambda_ij(scores, labels): 
    
    N = len(scores)
    if N < 2:
        return None
    
    labels = labels.reshape(N, 1)
    scores = scores.reshape(N, 1)
    
    # Create matrices from scores and labels
    scores_i = torch.cat(N*[scores], dim=1)
    scores_j= torch.transpose(scores_i, 0, 1)
    labels_i = torch.cat(N*[labels], dim=1)
    labels_j= torch.transpose(labels_i, 0, 1)
    
    # Compute Sij
    Sij = torch.ones_like(labels_i)
    Sij[labels_i == labels_j] = 0
    Sij[labels_i<labels_j] = -1
    
    # Compute lambda_ij
    lambda_ij = 0.5*(1-Sij) - 1/(1+torch.exp(scores_i-scores_j))
    
    return lambda_ij
    

def listwise_loss(scores, labels):
    
    """
    Compute the LambdaRank loss. (assume sigma=1.)
    
    scores: tensor of size [N, 1] (the output of a neural network), where N = length of <query, document> pairs
    labels: tensor of size [N], contains the relevance labels 
    
    returns: a tensor of size [N, 1]
    """
    
    N = len(scores)
    # YOUR CODE HERE
    if N < 2: 
        return None
    
    # calculate lambda_ij
    lambda_ij = compute_lambda_ij(scores, labels)
    
    # calculate idcg
    sorted_labels, _ = torch.sort(labels, descending=True)
    ranks = torch.arange(2., N+2.)
    idcg = torch.sum((2**sorted_labels-1) / torch.log2(ranks)) + 0.01
    
    # reshape labels
    labels = labels.view(N,1)
    
    # create tensor with ranking of scores 
    _, indices = torch.sort(scores, descending=True, dim=0)
    indices = indices.view(N,1)
    rank_i = torch.cat(N*[indices.float()], dim=1)
    rank_j = torch.transpose(rank_i, 0, 1)
    
    # create tensor with relevance scores
    rel = labels[indices.view(N)]
    rel_i = torch.cat(N*[rel], dim=1)
    rel_j = torch.transpose(rel_i, 0, 1)
    
    # calculate difference in dcg for i, j    
    diff_dcg = (2**rel_i - 2**rel_j)/torch.log2(rank_i+2) + (2**rel_j - 2**rel_i)/torch.log2(rank_j+2) 
    
    ndcg = torch.abs(diff_dcg/idcg)
    
    loss_ij = ndcg * lambda_ij
    
    loss_i = torch.sum(loss_ij, 1)
    
    return loss_i.reshape(N, 1)
    


In [35]:
# this function evaluates a model, on a given split
def evaluate_model(pred_fn, features, labels):
    
    scores = []
    np_labels = []
    for x, y in zip(features, labels): 
        np_labels.append(y.squeeze().numpy)
        with torch.no_grad(): 
            score = pred_fn(features)
            scores.append(score.numpy())
        
    results = evaluate.evaluate2(np.asarray(scores), np.asarray(np_labels))
    
    return results 

In [67]:
# TODO: Implement this! (50 points)
def train_listwise(net, params, queries, features, labels):
    """
    This function should train the given network using the listwise (LambdaRank) loss
    
    Note: Do not change the function definition! 
    Note: You can assume params.batch_size will always be equal to 1
    
    
    net: the neural network to be trained
    
    params: params is an object which contains config used in training 
        (eg. params.epochs - the number of epochs to train). 
        For a full list of these params, see the next cell. 
        
    Returns: a dictionary containing: "metrics_val" (a list of dictionaries) and 
             "metrics_train" (a list of dictionaries). 
             
             "metrics_val" should contain metrics (the metrics in params.metrics) computed
             after each epoch on the validation set (metrics_train is similar). 
             You can use this to debug your models
    """
    
    val_metrics_epoch = []
    train_metrics_epoch = []

    optimizer = torch.optim.Adam(net.parameters(), lr=params.lr)
    criterion = listwise_loss
    pred_fn = partial(clf_pred, net=net)
    
    for i in range(params.epochs):         
        net.train()
        for q_i, features_i, labels_i in zip(queries, features, labels):
            print((q_i), (features_i.shape))
                
            net.zero_grad()
                
            scores = net(features_i)
            print(scores.shape)
            print(labels_i.shape)
                
            loss = criterion(scores, labels_i)
                
            if loss is None: 
                continue
                
            torch.autograd.backward(scores, loss)
                
            optimizer.step()

        net.eval()
    
        with torch.no_grad():  
            
            train_metrics = evaluate_model(pred_fn, features, labels)
            eval_metrics = evaluate_model(pred_fn, f_test, l_test)

            
        train_m = {m: train_metrics[m] for m in params.metrics}
        eval_m = {m: eval_metrics[m] for m in params.metrics}
            
        train_metrics_epoch.append(train_m)
        val_metrics_epoch.append(eval_m)
        
    
    return {
        "metrics_val": val_metrics_epoch,
        "metrics_train": train_metrics_epoch
    }

In [72]:
listwise_params_test = Namespace(epochs=1, lr=1e-3, batch_size=1, metrics={"ndcg"})
listwise_net = NeuralModule(1)
train_listwise(listwise_net, listwise_params_test, queries, features, train_labels)
# evaluate_model(listwise_net, "test", print_results=True)

1 torch.Size([28, 49])
torch.Size([28, 1])
torch.Size([100000])


RuntimeError: shape '[28, 1]' is invalid for input of size 100000

In [75]:
len(train_labels)
len(queries)
len(features)

4046