In [None]:
# Create a filtered training topics file ("data/subsample_train.txt") so that
# every topic used for training is guaranteed to have relevance judgements in
# the training qrels. The file is built once; if it already exists this whole
# step is skipped.
#
# NOTE(review): `dataset` must already be defined when this runs. In the code
# visible here it is only assigned further down (the "Batch Feature Retrieval"
# step), so on a fresh kernel this relies on an earlier cell -- confirm.

if not os.path.isfile("data/subsample_train.txt"):

    # Read the train topics and the training qrels
    train_topics = dataset.get_topics('train')
    train_qrels = dataset.get_qrels('train')

    # Take the first 100 distinct qids, in the order they appear in the qrels
    train_qids = list(train_qrels['qid'].unique())
    train_qids_subsample = train_qids[:100]

    # Keep only the topics whose qid is in that subsample
    train_topics_subsample = train_topics[train_topics['qid'].isin(train_qids_subsample)]

    # Make sure the output directory exists. os.makedirs is plain Python (no
    # IPython shell escape needed) and exist_ok=True makes a separate
    # existence check unnecessary.
    os.makedirs("data", exist_ok=True)

    # Write one "qid:query" line per topic. Iterating the two columns directly
    # avoids binding a loop variable called `index`, which is also the name
    # used for the Terrier index later in this file.
    with open("data/subsample_train.txt", "w") as output:
        for qid, query in zip(train_topics_subsample['qid'],
                              train_topics_subsample['query']):
            output.write(f"{qid}:{query}\n")
			
	
# Batch feature retrieval for BM25
dataset = pt.get_dataset("trec-deep-learning-passages")
index = dataset.get_index('terrier_stemmed')

# BM25 is the ranking model; the Tf and PL2 weighting-model scores are requested
# as extra features. `% 100` applies PyTerrier's rank-cutoff operator to the
# retriever's output.
BM25_withFeatures = pt.FeaturesBatchRetrieve(
    index,
    wmodel="BM25",
    features=["WMODEL:Tf", "WMODEL:PL2"],
) % 100


# Fit a random forest over the BM25 feature pipeline
rf = RandomForestRegressor(n_estimators=100)
rf_pipe = BM25_withFeatures >> pt.ltr.apply_learned_model(rf)

# Time the fit with a monotonic, high-resolution clock: time.perf_counter() is
# not affected by system clock adjustments, unlike time.time(). As before, the
# timed region includes loading the subsampled topics and the training qrels.
start_time = time.perf_counter()

rf_pipe.fit(
    pt.io.read_topics("data/subsample_train.txt", format="singleline"),
    dataset.get_qrels("train"),
)

current_time = time.perf_counter()
fitting_time = current_time - start_time  # elapsed seconds
print(fitting_time)


# Evaluate plain BM25 and the random-forest pipeline on the "test-2019" topics
# and qrels, reporting the ndcg_cut_10, map and mrt metrics for each system.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Time the whole experiment with a monotonic clock: time.perf_counter() is not
# affected by system clock adjustments, unlike time.time(). The timed region
# includes loading the test topics and qrels, as before.
start_time = time.perf_counter()

results = pt.Experiment(
    [bm25, rf_pipe],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["ndcg_cut_10", "map", "mrt"],
    names=["BM25", "RF pipeline"],
)

current_time = time.perf_counter()
evaluation_time = current_time - start_time  # elapsed seconds
print(evaluation_time)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=eeb7fcb4-84cb-4f4a-9830-9d1fcbc47e1d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>