In [11]:
from argparse import Namespace
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm.notebook import trange

from ltr.utils import seed

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Chapter 2: Counterfactual LTR

Loading the dataset:

In [12]:
from ltr.dataset import load_data

# Load the dataset once; later cells read `data.num_features` and pass `data`
# to the training helpers.
data = load_data()

We assume that there is a logging policy that shows the results for each query to the users and logs the user clicks.
For that, we provide a logging policy simulator `LoggingPolicy`.
Our logging policy only shows top 20 documents to the users.
You can use this simulator to:
- Get the position of the documents for a query in the SERP: `query_positions`.
- Gather the (simulated) clicks of users for a query: `gather_clicks`.


In [3]:
from ltr.logging_policy import LoggingPolicy

logging_policy = LoggingPolicy()

# Simulate ten user sessions on the SERP of a single query and report, for each
# session, which documents were clicked and at which positions they were shown.
query_id = 20
for i in range(10):
    click_mask = logging_policy.gather_clicks(query_id)
    clicked_docs = np.where(click_mask)[0]
    serp_positions = logging_policy.query_positions(query_id)
    clicked_positions = serp_positions[clicked_docs]
    print(f'clicks for session {i+1} on documents', clicked_docs, 'on positions', clicked_positions)



clicks for session 1 on documents [32 83 99] on positions [ 3 11  0]
clicks for session 2 on documents [68 99] on positions [10  0]
clicks for session 3 on documents [99] on positions [0]
clicks for session 4 on documents [33 99] on positions [1 0]
clicks for session 5 on documents [33 99] on positions [1 0]
clicks for session 6 on documents [83 99] on positions [11  0]
clicks for session 7 on documents [99] on positions [0]
clicks for session 8 on documents [33 99] on positions [1 0]
clicks for session 9 on documents [33] on positions [1]
clicks for session 10 on documents [33 99] on positions [1 0]


---

## Utils

### Click data loader
First, we need to have a data loader that feeds the model with features and click data.
In this data loader, you have to select `topk=20` items for each query, and return three tensors:
- Feature vectors of the selected documents,
- One instance of the clicks over the selected documents, using the `gather_clicks(qid)` function, and
- The positions of the selected documents in the SERP.

**IMPORTANT** Here you *should not* use the `labels` for training. It is assumed that we cannot observe the real labels and want to use the `clicks` to train our LTR model instead.


In [4]:
from ltr.dataset import ClickLTRData

# Build the click dataset from the loaded data and the logging policy.
clickdataset = ClickLTRData(data, logging_policy)
# Smoke test: indexing a single item must work before wrapping in a DataLoader.
clickdataset[0]
train_dl = DataLoader(clickdataset, batch_size=1, shuffle=True)

# Inspect the first batch only; each batch unpacks into (features, clicks, positions).
for features, clicks, positions in train_dl:
    print(features.shape, clicks.shape, positions.shape)
    # Positions must be integer (long) tensors: a later cell uses them to index
    # the propensity vector.
    assert positions.dtype == torch.long
    print('clicks:', clicks)
    print('positions:', positions)
    break

torch.Size([1, 20, 15]) torch.Size([1, 20]) torch.Size([1, 20])
clicks: tensor([[0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])
positions: tensor([[ 3,  4, 18, 19,  1, 10,  6,  2, 12, 15,  5, 16,  7, 11, 17,  9,  8,  0,
         14, 13]])


### LTR model
Next, let's modify the `LTRModel` from the previous chapter so that it takes the width of the middle layer as an argument:

In [5]:
from ltr.model import LTRModel

# Instantiate a scorer with a 20-unit hidden layer and print its architecture.
net = LTRModel(data.num_features, width=20)
print(net)

LTRModel(
  (layers): Sequential(
    (0): Linear(in_features=15, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=1, bias=True)
  )
)


---

## ListNet

In the previous chapter, you have implemented different loss functions for LTR.
Here we use another well-known listwise loss function, called `ListNet`, and will later use it for our unbiased LTR model.
The idea behind ListNet is very simple:
To sidestep the discontinuity of NDCG, the **ListNet** loss is defined on probability distributions over permutations.

Define a family of distributions on permutation of scores $z$, $P_z(\pi)$, s.t. $\sum_{\pi\in\Omega} P_z(\pi)=1$, where $\Omega$ is the set of all $n!$ permutations.
Ideally, we want the scores of our LTR model to lead to the same permutation distribution as the labels $y$, i.e.,

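$$
\min_{z}\; KL(P_y \,\|\, P_z) \quad\Longleftrightarrow\quad \min_{z}\; -\sum_{\pi\in\Omega} P_y(\pi) \log P_z(\pi)
$$
(the two objectives differ only by the entropy of $P_y$, which does not depend on $z$).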

Plackett-Luce distribution gives a general formula for calculating the permutation distribution:

$$
P_z(\pi) = \prod_{j=1}^{n} \frac{\exp(z_{\pi(j)})}{\sum_{k=j}^{n} \exp(z_{\pi(k)})}
$$
In ListNet, instead of calculating $n!$ permutation probabilities, the top one probability of each document is calculated:

$$
P_z(j) = \sum_{\pi(1)=j, \pi\in\Omega} P_z(\pi) = \frac{\exp(z_{j})}{\sum_{k=1}^{n} \exp(z_{k})},
$$
which is the softmax function.

Then, the loss is defined as follows:

$$
\mathcal{L}_{\text{ListNet}}=-\sum_{j=1}^{n} P_y(j) \log P_z(j),
$$
where the softmax function is used to calculate $P_y(j)$ and $P_z(j)$ from the labels and predictions, respectively.

### ListNet loss function
Implement the ListNet loss function.

In [6]:
from ltr.loss import listNet_loss

# Fresh, untrained model: this cell only checks that the loss can be evaluated.
biased_net = LTRModel(data.num_features, width=20)

# Evaluate the loss on the first batch only.
for features, clicks, positions in train_dl:
    print(features.shape, clicks.shape, positions.shape)
    output = biased_net(features)
    print(output.shape, clicks.shape)
    # The clicks are used as targets. In the recorded output, `grading=True`
    # makes the call return a (loss, dict of intermediate tensors) tuple.
    loss = listNet_loss(output, clicks, grading=True)
    print(loss)
    break

torch.Size([1, 20, 15]) torch.Size([1, 20]) torch.Size([1, 20])
torch.Size([1, 20, 1]) torch.Size([1, 20])
(tensor(3.8103, grad_fn=<NegBackward0>), {'preds_smax': tensor([[0.0577, 0.0187, 0.0224, 0.0042, 0.0119, 0.0175, 0.0008, 0.0712, 0.0093,
         0.0091, 0.1041, 0.0005, 0.1686, 0.0065, 0.0135, 0.1378, 0.1765, 0.0942,
         0.0441, 0.0315]], grad_fn=<AddBackward0>), 'true_smax': tensor([[0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427,
         0.1160, 0.1160, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427, 0.0427,
         0.0427, 0.0427]]), 'preds_log': tensor([[-2.8527, -3.9800, -3.7996, -5.4684, -4.4338, -4.0461, -7.1720, -2.6417,
         -4.6736, -4.7045, -2.2624, -7.5139, -1.7805, -5.0344, -4.3082, -1.9819,
         -1.7343, -2.3624, -3.1208, -3.4591]], grad_fn=<LogBackward0>)})


In [7]:
from ltr.loss import listNet_loss

# NOTE(review): this cell repeats the previous one verbatim; it re-creates
# `biased_net` and evaluates the loss on another shuffled batch.
biased_net = LTRModel(data.num_features, width=20)

# Evaluate the loss on the first batch only.
for features, clicks, positions in train_dl:
    print(features.shape, clicks.shape, positions.shape)
    output = biased_net(features)
    print(output.shape, clicks.shape)
    # In the recorded output, `grading=True` makes the call return a
    # (loss, dict of intermediate tensors) tuple.
    loss = listNet_loss(output, clicks, grading=True)
    print(loss)
    break

torch.Size([1, 20, 15]) torch.Size([1, 20]) torch.Size([1, 20])
torch.Size([1, 20, 1]) torch.Size([1, 20])
(tensor(3.8466, grad_fn=<NegBackward0>), {'preds_smax': tensor([[2.4379e-03, 3.2630e-02, 6.0620e-02, 5.7648e-02, 1.1180e-01, 5.1229e-02,
         2.1655e-02, 6.4110e-02, 8.4351e-03, 6.0620e-02, 6.4110e-02, 4.5062e-03,
         3.5525e-02, 3.5713e-05, 1.0009e-01, 6.4110e-02, 8.9582e-02, 8.0152e-02,
         1.0553e-02, 8.0152e-02]], grad_fn=<AddBackward0>), 'true_smax': tensor([[0.1252, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
         0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460, 0.0460,
         0.0460, 0.0460]]), 'preds_log': tensor([[ -6.0166,  -3.4225,  -2.8031,  -2.8534,  -2.1911,  -2.9714,  -3.8325,
          -2.7471,  -4.7753,  -2.8031,  -2.7471,  -5.4023,  -3.3375, -10.2400,
          -2.3017,  -2.7471,  -2.4126,  -2.5238,  -4.5514,  -2.5238]],
       grad_fn=<LogBackward0>)})


### Biased ListNet training
Now use `listNet_loss` to train an LTR model. Since we use `clicks` instead of `relevance`, and do not correct for the bias, this would be a biased model.

In [8]:
from ltr.train import train_biased_listNet

# Hyper-parameters handed to the training helper.
params = Namespace(epochs=10, 
                    lr=1e-4,
                    batch_size=1,
                    metrics={"ndcg@10", "precision@10", "recall@10"})

# Size the input layer from the dataset rather than hard-coding 15, matching the
# other cells that build an `LTRModel` from `data.num_features`.
biased_net = LTRModel(data.num_features, width=20)
# Trailing expression: the notebook displays whatever the training helper returns.
train_biased_listNet(biased_net, params, data)

                                                         

{'metrics_val': [{'dcg': (8.26553863412485, 10.431545847952842),
   'dcg@03': (2.310110406337027, 5.32496192253306),
   'dcg@05': (2.7169339562659, 5.73921879602527),
   'dcg@10': (3.4228964878172508, 6.427633209476259),
   'dcg@20': (4.419667724619148, 7.11888648036029),
   'ndcg': (0.32144478682182104, 0.21623656760878693),
   'ndcg@03': (0.10862760481278075, 0.25926073996031457),
   'ndcg@05': (0.12029562799900961, 0.2584086063180539),
   'ndcg@10': (0.1457863699458244, 0.26263358222164407),
   'ndcg@20': (0.1775675757660967, 0.26232603188446946),
   'precision@01': (0.10989010989010989, 0.3127527356210485),
   'precision@03': (0.0695970695970696, 0.1602537938038634),
   'precision@05': (0.054945054945054944, 0.11698007368941771),
   'precision@10': (0.04450549450549451, 0.08285779869737818),
   'precision@20': (0.035989010989010986, 0.05750164727797976),
   'recall@01': (0.06439810189810188, 0.2195271097072441),
   'recall@03': (0.10671233528376385, 0.2737938435660403),
   'recall@

### Saving the results
Since we randomly simulate clicks and use them to train our model, for the evaluation we train and save 10 different models and inspect the average and std over them.

**IMPORTANT** Run the following cell to store your models and results. After it finishes, make sure to push the results to the git repo.

_Estimated time on Codespaces_: 5m

In [9]:
from ltr.utils import create_results
from ltr.train import train_biased_listNet

# Seed once (project `seed` helper) before the loop so the sequence of ten runs
# starts from a fixed state.
seed(42)
params = Namespace(epochs=20, 
                    lr=1e-4,
                    batch_size=1,
                    metrics={"ndcg@10", "precision@10", "recall@10"})


# Train ten independently initialised models; each run writes a JSON result file
# and the model weights under ./outputs/.
for i in range(10):
    print('Training Model', i)
    # Input size comes from the dataset instead of a hard-coded 15.
    biased_net = LTRModel(data.num_features, width=20)
    create_results(data, biased_net, 
                train_biased_listNet, 
                biased_net,
                f"./outputs/biased_listNet_{i}.json",
                params)

    torch.save(biased_net.state_dict(), f"./outputs/biased_listNet_{i}")

# To reload one of the saved models later (kept commented out on purpose):
# biased_net = LTRModel(data.num_features, width=20)
# biased_net.load_state_dict(torch.load('./outputs/biased_listNet_0'))

Training Model 0


                                                         

	"ndcg@10": (0.6773043080431509, 0.3250625909905689)
	"recall@10": (0.8044218238637209, 0.3324953027741621)
	"precision@10": (0.15389755011135864, 0.13377905601844914)
Training Model 1


                                                         

	"ndcg@10": (0.6559571370785954, 0.3518663062901273)
	"recall@10": (0.7547113530311487, 0.3653888759062017)
	"precision@10": (0.14476614699331852, 0.13653524019868285)
Training Model 2


                                                         

	"ndcg@10": (0.7121973859644857, 0.3233688039552177)
	"recall@10": (0.8128065989064286, 0.3264564892489743)
	"precision@10": (0.15345211581291765, 0.1292185629725136)
Training Model 3


                                                         

	"ndcg@10": (0.6882138274139388, 0.33620359041156)
	"recall@10": (0.7873332433466063, 0.34294387414555866)
	"precision@10": (0.15077951002227175, 0.13416596420057986)
Training Model 4


                                                         

	"ndcg@10": (0.6868901551987109, 0.3243101864424063)
	"recall@10": (0.8024668480419823, 0.3338772023165869)
	"precision@10": (0.15389755011135858, 0.1331114649825888)
Training Model 5


                                                         

	"ndcg@10": (0.6943624060753261, 0.3187839072166773)
	"recall@10": (0.8052154375749293, 0.3251552745070508)
	"precision@10": (0.15122494432071273, 0.12943105138073518)
Training Model 6


                                                         

	"ndcg@10": (0.6567824568334288, 0.34787731060031546)
	"recall@10": (0.7715321532333193, 0.35952661118414475)
	"precision@10": (0.14565701559020044, 0.1327625874763277)
Training Model 7


                                                         

	"ndcg@10": (0.7181465398251373, 0.31849599673377504)
	"recall@10": (0.8190578785233573, 0.3245620826980737)
	"precision@10": (0.15902004454342988, 0.13924685119253463)
Training Model 8


                                                         

	"ndcg@10": (0.7014993169731559, 0.33141985907797666)
	"recall@10": (0.8060706452230099, 0.32921584520384944)
	"precision@10": (0.15278396436525618, 0.13263661734062054)
Training Model 9


                                                         

	"ndcg@10": (0.592810160767414, 0.3511124024234709)
	"recall@10": (0.7485331585456045, 0.3751283420964054)
	"precision@10": (0.14565701559020044, 0.13994933206896656)


---

## Unbiased ListNet

### Unbiased ListNet loss function

Now, we use IPS to have an unbiased ListNet:

In [13]:
from ltr.loss import unbiased_listNet_loss

# Fresh, untrained model used only to check that the unbiased loss can be evaluated.
unbiased_net = LTRModel(data.num_features, width=20)
# Propensities exposed by the logging policy, indexed below by SERP position.
propensity = logging_policy.propensity

# Evaluate the loss on the first batch only.
for features, clicks, positions in train_dl:
    print(features.shape, clicks.shape, positions.shape)
    # Score with the model created in this cell, not with a model left over
    # from an earlier cell.
    output = unbiased_net(features)
    print(output.shape, clicks.shape)
    # Look up the propensity of every shown document through its position.
    loss = unbiased_listNet_loss(output, clicks, propensity[positions.data.numpy()])
    print(loss)
    break



torch.Size([1, 20, 15]) torch.Size([1, 20]) torch.Size([1, 20])
torch.Size([1, 20, 1]) torch.Size([1, 20])
tensor(0., grad_fn=<SumBackward0>)


In [75]:

# Hand-written regression check for `unbiased_listNet_loss` on fixed inputs.
# NOTE: this rebinds the notebook-level names `output` and `propensity`.

# Model scores for one list of ten documents, nested as (1, 10, 1).
output = torch.tensor([[
    [0.3367], [0.1288], [0.2345], [0.2303], [-1.1229],
    [-0.1863], [2.2082], [-0.6380], [0.4617], [0.2674],
]])
# Target values for the same ten documents, nested as (1, 10).
target = torch.tensor([[
    0.5349, 0.8094, 1.1103, -1.6898, -0.9890,
    0.9580, 1.3221, 0.8172, -0.7658, -0.7506,
]])
# Fixed per-document values passed in the propensity slot, nested as (1, 10).
propensity = torch.tensor([[
    1.3525, 0.6863, -0.3278, 0.7950, 0.2815,
    0.0562, 0.5227, -0.2384, -0.0499, 0.5263,
]])

loss = unbiased_listNet_loss(output, target, propensity)
print(loss)
# The expected value is checked with a generous absolute tolerance.
assert torch.allclose(loss, torch.tensor(2.6961), atol=0.1)

tensor(2.6960)


### Unbiased ListNet training
Now use `unbiased_listNet_loss` to train an LTR model.

In [11]:
from ltr.train import train_unbiased_listNet

# Hyper-parameters handed to the training helper; the logging policy's
# propensities are supplied through `params.propensity`.
params = Namespace(epochs=10, 
                    lr=1e-4,
                    batch_size=1,
                    propensity=logging_policy.propensity,
                    metrics={"ndcg@10", "precision@10", "recall@10"})

# Bind the model to `unbiased_net`: it is trained with the unbiased loss, and
# reusing the name `biased_net` would silently replace the biased model.
# The input size comes from the dataset instead of a hard-coded 15.
unbiased_net = LTRModel(data.num_features, width=20)
# Trailing expression: the notebook displays whatever the training helper returns.
train_unbiased_listNet(unbiased_net, params, data)

                                                         

{'metrics_val': [{'dcg': (12.369525123706113, 12.030741031272468),
   'dcg@03': (7.1829902546886935, 8.018866305452054),
   'dcg@05': (7.626980132490122, 8.428239520968798),
   'dcg@10': (8.491479450305436, 8.746365443461205),
   'dcg@20': (9.20914525509095, 9.376538354705115),
   'ndcg': (0.525809381585071, 0.30793461049494947),
   'ndcg@03': (0.3642994280154696, 0.4027074415067183),
   'ndcg@05': (0.3657143267508853, 0.39671391744035),
   'ndcg@10': (0.39120069590421835, 0.38355251902695814),
   'ndcg@20': (0.4155510304995906, 0.3722456194923025),
   'precision@01': (0.4230769230769231, 0.4940474068717357),
   'precision@03': (0.19963369963369962, 0.22061045871207083),
   'precision@05': (0.13626373626373625, 0.15655387972314838),
   'precision@10': (0.08901098901098901, 0.09428801964445115),
   'precision@20': (0.055219780219780225, 0.06212983608396073),
   'recall@01': (0.28068419675562534, 0.3985784275780513),
   'recall@03': (0.3569204604918891, 0.42564070437699375),
   'recall@0

### Saving the results
Similar to the biased model, here we train 10 different unbiased models and save them to inspect the average and std over them.

**IMPORTANT** Run the following cell to store your models and results. After it finishes, make sure to push the results to the git repo.

_Estimated time on Codespaces_: 5m

In [12]:
from ltr.utils import create_results
from ltr.train import train_unbiased_listNet

# Seed once (project `seed` helper) before the loop so the sequence of ten runs
# starts from a fixed state.
seed(42)
params = Namespace(epochs=20, 
                    lr=1e-4,
                    batch_size=1,
                    propensity=logging_policy.propensity,
                    metrics={"ndcg@10", "precision@10", "recall@10"})

# Train ten independently initialised models; each run writes a JSON result file
# and the model weights under ./outputs/.
for i in range(10):
    print('Training Model', i)
    # Input size comes from the dataset instead of a hard-coded 15.
    unbiased_net = LTRModel(data.num_features, width=20)
    create_results(data, unbiased_net, 
                train_unbiased_listNet, 
                unbiased_net,
                f"./outputs/unbiased_listNet_{i}.json",
                params)

    torch.save(unbiased_net.state_dict(), f"./outputs/unbiased_listNet_{i}")



Training Model 0


                                                         

	"ndcg@10": (0.7345720591260813, 0.31756238607806153)
	"recall@10": (0.8227963427753812, 0.32144787027025856)
	"precision@10": (0.15879732739420938, 0.13894087570485597)
Training Model 1


                                                         

	"ndcg@10": (0.744471287469003, 0.31796050619668503)
	"recall@10": (0.8309308934947609, 0.31334906131372936)
	"precision@10": (0.16191536748329624, 0.14266593733177377)
Training Model 2


                                                         

	"ndcg@10": (0.7592335779266325, 0.30142357102180245)
	"recall@10": (0.8465454963686327, 0.29785618775360834)
	"precision@10": (0.16481069042316263, 0.14113766920086235)
Training Model 3


                                                         

	"ndcg@10": (0.7427452270998334, 0.31529649487536143)
	"recall@10": (0.8191097732202147, 0.31911737488495256)
	"precision@10": (0.15924276169265036, 0.14098083506342018)
Training Model 4


                                                         

	"ndcg@10": (0.7138324657246984, 0.3225163236353201)
	"recall@10": (0.8192312389417066, 0.3245247352412128)
	"precision@10": (0.15879732739420938, 0.1382982039733555)
Training Model 5


                                                         

	"ndcg@10": (0.757167630272573, 0.3075349321424616)
	"recall@10": (0.8388255993391589, 0.3050047418050537)
	"precision@10": (0.16436525612472164, 0.143840453079388)
Training Model 6


                                                         

	"ndcg@10": (0.7271636373737903, 0.3182060787987736)
	"recall@10": (0.8232024740474891, 0.315630779288533)
	"precision@10": (0.15946547884187084, 0.13695927665944377)
Training Model 7


                                                         

	"ndcg@10": (0.7278832913283729, 0.3195563083768278)
	"recall@10": (0.8245173618323101, 0.31718508135982215)
	"precision@10": (0.16035634743875282, 0.13802894199920715)
Training Model 8


                                                         

	"ndcg@10": (0.7269579126368901, 0.318835533058269)
	"recall@10": (0.8223376435055985, 0.31983318116936726)
	"precision@10": (0.15790645879732743, 0.13542338189407746)
Training Model 9


                                                         

	"ndcg@10": (0.7568805118809107, 0.3087979241018758)
	"recall@10": (0.8326419722757992, 0.310980819112721)
	"precision@10": (0.16191536748329624, 0.14328902049400014)


---

## Propensity estimation

In training our unbiased ListNet model, we assumed that we know propensity values.
In practice, however, the propensity values have to be estimated from the clicks.
There are several methods for estimating the propensities, such as dual learning algorithm (DLA) and regression-based EM.
Here, we focus on DLA.

### DLA

IPS is based on the examination hypothesis that says $P(c=1)=P(r=1)\times P(e=1)$, where $c$, $r$ and $e$ are click, relevance and examination signals, respectively.
Initially, we are interested in $P(r=1)$, so in IPS we substitute $c$ with $\hat{r}=\frac{c}{P(e=1)}$.
In practice, $P(e=1)$ is not given and should be estimated.
DLA solves this by noticing that $\hat{e}=\frac{c}{P(r=1)}$ is also an unbiased estimate of the examination signal.
This means that in DLA (as the name suggests), two models are trained at the same time:
- Relevance prediction: A function $f$, modeled by `LTRModel` here, that estimates the relevance from the feature vectors.
- Propensity prediction: A function $g$, modeled by `PropLTRModel` here, that estimates the propensity from the positions.

Using the `unbiased_listNet_loss` loss function with the following signature:
$$
\mathcal{L}_{\text{unbiased}}\big(\text{predictions}, \text{clicks}, \text{propensities}\big),
$$

the overall loss function is as follows:
$$
\mathcal{L}_{\text{DLA}} = \underbrace{\mathcal{L}_{\text{unbiased}}\bigg(f(x), c, \sigma\big(g(p)\big)\bigg)}_{\text{relevance estimation}} + \underbrace{\mathcal{L}_{\text{unbiased}} \bigg(g(p), c, \sigma\big(f(x)\big)\bigg)}_{\text{propensity estimation}},
$$
which means that the predictions of $g$ are used as the propensities for optimizing $f$, and the predictions of $f$ are used as the propensities for optimizing $g$.
The $\sigma()$ function is used to transform the logits to valid probability values, as the propensities should be between 0 and 1.

### Logits to prob
First, we need a function to transform the logits to valid probability values (between 0 and 1).
Use the sigmoid function for this transformation.

In [13]:
from ltr.train import logit_to_prob

# Ten random logits in [0, 10) as a quick sanity check.
# The generator is not seeded here, so the printed values change between runs.
logits = 10 * torch.rand(10)
probs = logit_to_prob(logits)

# Print the propensities
print('probabilities:', probs.squeeze())

probabilities: tensor([0.7130, 0.6204, 0.9876, 0.9999, 0.9999, 0.9989, 0.9999, 0.9061, 0.9996,
        0.9997])


### Propensity estimation LTR model
Then, we need a wrapper around the `LTRModel` that takes as input the positions (Long tensor) and outputs the logits for propensities.
This new model uses one hot embedding as the input features.

In [14]:
from ltr.model import PropLTRModel

# Propensity model over SERP positions; `logging_policy.topk` is its first argument.
prop_net = PropLTRModel(logging_policy.topk, width=200)

# Query the untrained model for positions 0..16.
# NOTE(review): 17 is fewer than the 20 positions the logging policy shows —
# presumably an arbitrary example size; confirm.
logits = prop_net(torch.arange(17))
probs = logit_to_prob(logits)

# Print the propensities
print('probabilities:', probs.T)

# Print the normalized propensities
print('normalized with the first position:', probs.T/probs.squeeze()[0])

probabilities: tensor([[0.4836, 0.4840, 0.5121, 0.4901, 0.5034, 0.4754, 0.5023, 0.4817, 0.4813,
         0.5083, 0.5199, 0.5127, 0.4997, 0.4980, 0.5145, 0.4967, 0.4706]],
       grad_fn=<PermuteBackward0>)
normalized with the first position: tensor([[1.0000, 1.0010, 1.0590, 1.0136, 1.0409, 0.9830, 1.0388, 0.9961, 0.9954,
         1.0512, 1.0751, 1.0603, 1.0334, 1.0299, 1.0640, 1.0271, 0.9732]],
       grad_fn=<DivBackward0>)


### DLA training
Now we have all we need for the DLA implementation.

In [15]:
from ltr.train import train_DLA_listNet

# Hyper-parameters handed to the training helper. `prop_net` is the propensity
# model trained jointly with the ranker; `prop_lr` is presumably its learning
# rate (confirm in `train_DLA_listNet`).
params = Namespace(epochs=1, 
                    lr=1e-4,
                    batch_size=1,
                    prop_lr=1e-3,
                    prop_net=PropLTRModel(logging_policy.topk, width=256),
                    metrics={"ndcg@10", "precision@10", "recall@10"})

# Bind the ranker to `dla_net`: it is trained with DLA, and reusing the name
# `biased_net` would silently replace a previously trained model.
# The input size comes from the dataset instead of a hard-coded 15.
dla_net = LTRModel(data.num_features, width=256)
print('True (unknown to the model) propensities:', logging_policy.propensity.data.numpy())
# Trailing expression: the notebook displays whatever the training helper returns.
train_DLA_listNet(dla_net, params, data)

True (unknown to the model) propensities: [1.         0.5        0.33333334 0.25       0.2        0.16666667
 0.14285715 0.125      0.11111111 0.1        0.09090909 0.08333334
 0.07692308 0.07142857 0.06666667 0.0625     0.05882353 0.05555556
 0.05263158 0.05      ]


                                                         

{'metrics_val': [{'dcg': (15.672601131673868, 12.670944416171588),
   'dcg@03': (10.864637640109986, 8.58744034478823),
   'dcg@05': (11.951197321225818, 9.052342534575159),
   'dcg@10': (12.856002950055794, 9.493796897597036),
   'dcg@20': (13.769623783952575, 10.392400962867471),
   'ndcg': (0.7034074589413044, 0.2858312977126358),
   'ndcg@03': (0.5817174209223405, 0.4020492719066576),
   'ndcg@05': (0.6040791414674389, 0.3778705578683698),
   'ndcg@10': (0.6255722197220839, 0.35694013726806995),
   'ndcg@20': (0.6495938246295985, 0.336058457903514),
   'precision@01': (0.5824175824175825, 0.49316056422674454),
   'precision@03': (0.3223443223443223, 0.2517365788469107),
   'precision@05': (0.2351648351648352, 0.18299920517069107),
   'precision@10': (0.13901098901098904, 0.11609540359714718),
   'precision@20': (0.08296703296703298, 0.08211482854528097),
   'recall@01': (0.4201881451881452, 0.43685418020079353),
   'recall@03': (0.5881767042481328, 0.4347198820938236),
   'recall@0

### Saving the results
Similar to the biased and unbiased models, here we train 10 different DLA models (each with its own propensity model) and save them to inspect the average and std over them.

**IMPORTANT** Run the following cell to store your models and results. After it finishes, make sure to push the results to the git repo.

_Estimated time on Codespaces_: < 10m

In [16]:
from ltr.utils import create_results
from ltr.train import train_DLA_listNet

# Seed once (project `seed` helper) before the loop so the sequence of ten runs
# starts from a fixed state.
seed(42)
# `prop_net` is a placeholder here; a fresh propensity model is assigned per run.
params = Namespace(epochs=20, 
                    lr=1e-4,
                    batch_size=1,
                    prop_lr=1e-3,
                    prop_net=None,
                    metrics={"ndcg@10", "precision@10", "recall@10"})

# Train ten independent (ranker, propensity model) pairs and save both.
for i in range(10):
    print('Training Model', i)
    # Input size comes from the dataset instead of a hard-coded 15.
    dla_net = LTRModel(data.num_features, width=256)
    # Every run gets its own propensity model so runs do not share weights.
    params.prop_net = PropLTRModel(logging_policy.topk, width=256)
    create_results(data, dla_net, 
                train_DLA_listNet, 
                dla_net,
                f"./outputs/DLA_listNet_{i}.json",
                params)

    torch.save(dla_net.state_dict(), f"./outputs/DLA_listNet_{i}")
    torch.save(params.prop_net.state_dict(), f"./outputs/DLA_listNet_prop_{i}")

Training Model 0


                                                         

	"ndcg@10": (0.7431118123278635, 0.31142344826215723)
	"recall@10": (0.8355003032980237, 0.30792983563105447)
	"precision@10": (0.1616926503340758, 0.13856658474983816)
Training Model 1


                                                         

	"ndcg@10": (0.7021966213701517, 0.3278832524082581)
	"recall@10": (0.7859916336629292, 0.3428878275734683)
	"precision@10": (0.1534521158129176, 0.1369176204493416)
Training Model 2


                                                         

	"ndcg@10": (0.7268707763157533, 0.31510894956654284)
	"recall@10": (0.823162939801221, 0.3198831063691657)
	"precision@10": (0.1579064587973274, 0.13443300109930062)
Training Model 3


                                                         

	"ndcg@10": (0.7435726515506391, 0.3068126764734612)
	"recall@10": (0.8363906549220319, 0.30673302488659776)
	"precision@10": (0.16191536748329624, 0.14109619192704181)
Training Model 4


                                                         

	"ndcg@10": (0.7337240326761805, 0.31205820352730673)
	"recall@10": (0.8237666768650654, 0.3143967788085563)
	"precision@10": (0.15746102449888646, 0.13561298240807956)
Training Model 5


                                                         

	"ndcg@10": (0.7187305309410691, 0.3196967358282181)
	"recall@10": (0.8195859532260685, 0.3172723262131661)
	"precision@10": (0.15501113585746104, 0.1312202872875018)
Training Model 6


                                                         

	"ndcg@10": (0.7439000859327877, 0.3110453211973936)
	"recall@10": (0.8342762674971506, 0.3099251648508929)
	"precision@10": (0.16213808463251672, 0.1415500213689873)
Training Model 7


                                                         

	"ndcg@10": (0.6678732834088622, 0.3425555855417103)
	"recall@10": (0.7637974828496248, 0.3586684557811937)
	"precision@10": (0.14543429844098, 0.12970018804563668)
Training Model 8


                                                         

	"ndcg@10": (0.7153890366679488, 0.3149084607180106)
	"recall@10": (0.8246399462074819, 0.3174006322267088)
	"precision@10": (0.15746102449888646, 0.13279172673877104)
Training Model 9


                                                         

	"ndcg@10": (0.722976072711674, 0.32058479864376066)
	"recall@10": (0.8190228705163854, 0.3219972982306116)
	"precision@10": (0.15501113585746107, 0.13572559228680917)


---

## Comparing the models

You have implemented three models: biased, unbiased with oracle propensity values, and unbiased with DLA-estimated propensity values.
Given the training results and evaluation results, please elaborate on the ranking performance of these three models in `analysis.md`. See that file for further details.

Note that you need to submit the result files created in `outputs/` for full credit.

In [17]:
import json

def aggregate_results(model_name, num_runs=10, output_dir="./outputs"):
    """Average every test metric over the saved runs of one model.

    Reads ``{output_dir}/{model_name}_{i}.json`` for ``i`` in ``range(num_runs)``.
    Each file must hold a ``'test_metrics'`` mapping from metric name to a
    ``(value, std)`` pair; only the value is aggregated.

    Args:
        model_name: File-name prefix of the result files, e.g. ``'biased_listNet'``.
        num_runs: Number of result files to read. Defaults to 10.
        output_dir: Directory holding the result files. Defaults to ``"./outputs"``.

    Returns:
        dict mapping each metric name to the mean of its values across runs.

    Raises:
        FileNotFoundError: If one of the expected result files is missing.
        KeyError: If a result file has no ``'test_metrics'`` entry.
    """
    aggregated_metrics = {}
    for i in range(num_runs):
        with open(f"{output_dir}/{model_name}_{i}.json", "r") as reader:
            result = json.load(reader)
        # The per-run std is stored next to the value but is not used here.
        for metric, (value, _std) in result['test_metrics'].items():
            aggregated_metrics.setdefault(metric, []).append(value)
    return {metric: np.mean(vals) for metric, vals in aggregated_metrics.items()}

# Average the per-run test metrics of each of the three model families.
biased = aggregate_results('biased_listNet')
unbiased = aggregate_results('unbiased_listNet')
DLA = aggregate_results('DLA_listNet')

# save the aggregated output files
for model_avg_results, model_name in zip([biased, unbiased, DLA], ["biased_listNet", "unbiased_listNet", "DLA_listNet"]):
    # Context manager: the file is flushed and closed as soon as it is written,
    # instead of leaving an open handle for the garbage collector.
    with open(f"outputs/{model_name}_avg.json", "wt") as writer:
        json.dump(model_avg_results, writer)

# display a handful of metrics
print_metrics = ["ndcg", "ndcg@20", "precision@05", "recall@20"]
print_biased = {metric: v for metric, v in biased.items() if metric in print_metrics}
print_unbiased = {metric: v for metric, v in unbiased.items() if metric in print_metrics}
print_DLA = {metric: v for metric, v in DLA.items() if metric in print_metrics}

import pandas as pd
pd.set_option("display.precision", 3)
# One row per model, one column per selected metric.
df = pd.DataFrame([print_biased, print_unbiased, print_DLA], index=["biased", "unbiased", "DLA"])
# Plain-text table first, then the rich notebook rendering of the same frame.
print(df)

from IPython.display import display, HTML
display(df)

           ndcg  ndcg@20  precision@05  recall@20
biased    0.743    0.702         0.247      0.866
unbiased  0.794    0.760         0.265      0.894
DLA       0.779    0.743         0.258      0.883


Unnamed: 0,ndcg,ndcg@20,precision@05,recall@20
biased,0.743,0.702,0.247,0.866
unbiased,0.794,0.76,0.265,0.894
DLA,0.779,0.743,0.258,0.883


In [18]:
# remember to submit your outputs!