In [2]:
import pyterrier as pt
import os
import numpy as np
import pandas as pd
import fastrank
import requests
import datetime

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

In [3]:
if not pt.started():
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


## Downloading the trec covid dataset

In [4]:
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
pt_index_path = './indices/cord19'

if not os.path.exists(pt_index_path + "/data.properties"):
    indexer = pt.index.IterDictIndexer(pt_index_path, blocks=True)
    index_ref = indexer.index(dataset.get_corpus_iter(), 
                            fields=['title', 'doi', 'abstract'], 
                            meta=('docno',))
else:
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

In [5]:
metadata = pd.read_csv('~/.ir_datasets/cord19/2020-07-16/metadata.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
index = pt.IndexFactory.of(index_ref)
topics = dataset.get_topics('title')
qrels = dataset.get_qrels()

In [7]:
qrels

Unnamed: 0,qid,docno,label,iteration
0,1,005b2j4b,2,4.5
1,1,00fmeepz,1,4
2,1,010vptx3,2,0.5
3,1,0194oljo,1,2.5
4,1,021q9884,1,4
...,...,...,...,...
69313,50,zvop8bxh,2,5
69314,50,zwf26o63,1,5
69315,50,zwsvlnwe,0,5
69316,50,zxr01yln,1,5


In [8]:
len(metadata['journal'].unique())

18112

### How do pipes work

We set up a two-stage ranking pipeline with a BM25 first stage ranker whose outputs will be reranked by a ML method given the predefined featues.

In [9]:
BM25 = pt.BatchRetrieve(index, controls={"wmodel": "BM25"})
TF_IDF = pt.BatchRetrieve(index, controls = {"wmodel":"TF_IDF"})
PL2 = pt.BatchRetrieve(index, controls = {"wmodel": "PL2"})

We make a pipe by transforming the BM25 outputs with the help of PL2 and TFIDF

In [10]:
pipe = BM25 >>(TF_IDF ** PL2)

Alternative: FeatureBatchRetrieve object

In [11]:
fbr = pt.FeaturesBatchRetrieve(index, controls = {"wmodel": "BM25"}, features=["WMODEL:TF_IDF", "WMODEL:PL2"]) 
(fbr % 5).search("coronavirus immunity")

Unnamed: 0,qid,query,docid,rank,features,docno,score
0,1,coronavirus immunity,187945,0,"[4.540221677858543, 3.477440707307063]",sp212tai,10.118259
1,1,coronavirus immunity,126990,1,"[4.377525236902022, 3.3080236155861185]",e1mw9lx1,10.00147
2,1,coronavirus immunity,179948,2,"[4.559628851404359, 3.5124279884117215]",ltmuw6f8,9.974369
3,1,coronavirus immunity,156456,3,"[4.600248221319108, 3.649953983552013]",1oruu33o,9.955978
4,1,coronavirus immunity,94922,4,"[4.490926556339275, 3.312256733881456]",5jl6ltfj,9.73464


### LTR

Split into train test using cross validation

In [12]:
train_topics, validation_topics, test_topics = np.split(topics, [int(.6*len(topics)), int(.8*len(topics))])

In [13]:
train_min = train_topics['qid'].astype(int).min()
train_max = train_topics['qid'].astype(int).max()
train_qrels = qrels[(qrels['qid'].astype(int) >= train_min) & (qrels['qid'].astype(int) <= train_max)]

val_min = validation_topics['qid'].astype(int).min()
val_max = validation_topics['qid'].astype(int).max()
validation_qrels = qrels[(qrels['qid'].astype(int) >= val_min) & (qrels['qid'].astype(int) <= val_max)]

test_min = test_topics['qid'].astype(int).min()
test_max = test_topics['qid'].astype(int).max()
test_qrels = qrels[(qrels['qid'].astype(int) >= test_min) & (qrels['qid'].astype(int) <= test_max)]

# How well do the algorithm work without any features?

Following, the five testes ranking algorithms will be put to the test. We are using the PL2 algorithm as a baseline and compare it to LambdaMart, LightGBM, Random Forest, FastRank with Gradient Ascent and FastRank with Random Forest.

In [14]:
#fbr = pt.BatchRetrieve(index, wmodel="BM25")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")
fbr = pt.FeaturesBatchRetrieve(index, controls = {"wmodel": "BM25"}, features=["WMODEL:BM25"])

#p = fbr >> pt.apply.doc_features(_features)
# p = br >> pt.apply.doc_features(_features)

In [15]:
lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42,
      num_round =  10)

In [16]:
import lightgbm as lgb
# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(task="train",
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    max_bin=255,
    num_leaves=7,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 3, 5, 10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=10)

In [17]:
train_request = fastrank.TrainRequest.coordinate_ascent()
params = train_request.params
train_request.measure = 'ndcg'
params.init_random = True
params.normalize = True
params.seed = 1234567

In [18]:
train_request_forest = fastrank.TrainRequest.random_forest()
params = train_request_forest.params
train_request_forest.measure = 'ndcg'
params.num_trees = 10
params.feature_sampling_rate = 0.5
params.instance_sampling_rate = 0.5
params.seed = 1234567

In [19]:
rf = RandomForestRegressor(n_estimators= 300)
rf_pipe = fbr >> pt.ltr.apply_learned_model(rf)

In [21]:
lmart_x_pipe = fbr >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
lmart_x_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

lmart_l_pipe = fbr >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
lmart_l_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

rf_pipe = fbr >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(train_topics, qrels)

ca_pipe = fbr >> pt.ltr.apply_learned_model(train_request, form="fastrank")
ca_pipe.fit(train_topics, qrels)

fr_rf_pipe = fbr >> pt.ltr.apply_learned_model(train_request_forest, form="fastrank")
fr_rf_pipe.fit(train_topics, qrels)

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

In [26]:
results = pt.Experiment([PL2, BM25, lmart_x_pipe, lmart_l_pipe, rf_pipe, ca_pipe, fr_rf_pipe], test_topics, qrels, ["ndcg", "map", "recip_rank"], 
                        names=["PL2 (Baseline)", "BM25 (base ranker)", "LMART", "LMART LightGBM", "Random Forest", "Fastrank Coordinate Ascend", "Fastrank Random Forest"], baseline = 0, filter_by_topics = True)

In [27]:
results

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.166378,0.406146,0.436964,0.0,10.0,0.005058,0.0,8.0,0.001764,1.0,9.0,0.004475
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.148353,0.378407,0.423232,0.0,10.0,0.002986,0.0,8.0,0.001986,1.0,9.0,0.002671
5,Fastrank Coordinate Ascend,0.283973,0.914286,0.519278,1.0,9.0,0.042773,1.0,0.0,0.343436,2.0,8.0,0.471011
6,Fastrank Random Forest,0.154931,0.365593,0.424146,0.0,10.0,0.001263,1.0,7.0,0.003387,1.0,9.0,0.000867


**Summarizing**:
Based on three measures (MAP, Recip Rank and the ndcg) we can observe that PL2 outperformed all models except the Fastrank Coordinate Ascend. The models are currently just a reranking the BM25 score.

## Applying recency boost

In [20]:
metadata['publish_year'] = pd.to_datetime(metadata['publish_time'])
metadata['publish_year'] = metadata['publish_year'].dt.strftime('%Y')
metadata['publish_year'] = metadata['publish_year'].fillna(0)
metadata['publish_year'] = metadata['publish_year'].astype(str).astype(int)

metadata['publish_year_feature'] = 0

metadata.loc[metadata['publish_year'] < 2019, 'publish_year_feature'] = 1
metadata.loc[metadata['publish_year'] == 2019, 'publish_year_feature'] = 2
metadata.loc[metadata['publish_year'] >= 2020, 'publish_year_feature'] = 3


In [21]:
date_dict = {}

for id, dates in zip(metadata['cord_uid'], metadata['publish_year_feature']):
    date_dict[id] = dates


## Applying Journal impact

In [22]:
journal_impact = pd.read_csv('./data/scimagojr 2020.csv', sep = ';', header = None)
journal_abbrev = pd.read_csv('./data/wos_abbrev_table.csv', sep = ';', header = None)
journal_impact=journal_impact.T.set_index(0).T
journal_abbrev=journal_abbrev.T.set_index(0).T
journal_abbrev = journal_abbrev[['full', 'abbrev']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [23]:
journal_impact = journal_impact[['Title', 'SJR', 'H index']]

In [24]:
journals = metadata[['cord_uid','journal']]
journals = journals.drop_duplicates()
journals = pd.DataFrame(journals)

In [25]:
journals['journal'] = journals['journal'].str.lower()
journal_impact['Title'] = journal_impact['Title'].str.lower()
journal_abbrev['full'] = journal_abbrev['full'].str.lower()
journal_abbrev['abbrev'] = journal_abbrev['abbrev'].str.lower()

In [26]:
journals = pd.merge(journals, journal_abbrev, left_on='journal', right_on='abbrev', how='left')

In [27]:
journals = journals[['cord_uid', 'journal', 'full', 'abbrev']]
journals['full'] = journals['full'].fillna(journals['journal'])
journals['abbrev'] = journals['abbrev'].fillna(journals['journal'])

In [28]:
journals

Unnamed: 0,cord_uid,journal,full,abbrev
0,ug7v899j,bmc infect dis,bmc infectious diseases,bmc infect dis
1,02tnwd4m,respir res,respiratory research,respir res
2,ejv2xln0,respir res,respiratory research,respir res
3,2b73a28n,respir res,respiratory research,respir res
4,9785vg6d,respir res,respiratory research,respir res
...,...,...,...,...
217298,z4ro6lmh,infection,infection,infection
217299,hi8k8wvb,physica b: condensed matter,physica b: condensed matter,physica b: condensed matter
217300,ma3ndg41,catheter cardiovasc interv,catheter cardiovasc interv,catheter cardiovasc interv
217301,wh10285j,ann surg,annals of surgery,ann surg


In [29]:
journals['cord_uid'] = journals['cord_uid'].drop_duplicates()

In [30]:
metadata_journals = pd.merge(metadata, journals , left_on='cord_uid', right_on='cord_uid', how='left')

In [31]:
metadata_journals = metadata_journals.merge(journal_impact, how = 'left', left_on = "full", right_on = "Title")

In [32]:
#merged = metadata.merge(journal_impact, how = 'left', left_on = "journal", right_on = "Title")
metadata_journals['H index'] = [float(str(i).replace(",", "")) for i in metadata_journals['H index']]
#merged['H index'] = merged['H index'].fillna(merged['H index'].mean())
#merged['norm_H_index']=(merged['H index']-merged['H index'].min())/(merged['H index'].max()-merged['H index'].min())

In [33]:
metadata_journals['H index'] = [float(str(i).replace(",", "")) for i in metadata_journals['H index']]
metadata_journals = metadata_journals.rename(columns={"H index": "H_index"})

In [34]:
metadata_journals['SJR'] = [float(str(i).replace(",", "")) for i in metadata_journals['SJR']]


In [35]:
metadata_journals['H_index'].describe()

count    88969.000000
mean       170.107476
std        189.732422
min          0.000000
25%         70.000000
50%        116.000000
75%        205.000000
max       1276.000000
Name: H_index, dtype: float64

In [36]:
lower = metadata_journals['H_index'].describe()[4]
middle = metadata_journals['H_index'].describe()[5]
upper = metadata_journals['H_index'].describe()[6]

In [37]:
lower, middle, upper

(70.0, 116.0, 205.0)

In [38]:
metadata_journals['H_index'].fillna(value = lower, inplace = True)

metadata_journals['H_index'].loc[metadata_journals['H_index'] < lower] = 1
metadata_journals['H_index'].loc[metadata_journals['H_index'] == lower] = 1.5
metadata_journals.loc[metadata_journals['H_index'].between(lower, middle), 'H_index'] = 2.0
metadata_journals.loc[metadata_journals['H_index'].between(middle,upper), 'H_index'] = 2.5
metadata_journals.loc[metadata_journals['H_index'].between(upper, 1276), 'H_index'] = 3.5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [39]:
lower = metadata_journals['SJR'].describe()[4]
middle = metadata_journals['SJR'].describe()[5]
upper = metadata_journals['SJR'].describe()[6]

In [40]:
lower, middle, upper

(751.0, 1175.0, 2134.0)

In [41]:
metadata_journals['SJR'].fillna(value = lower, inplace = True)

metadata_journals['SJR'].loc[metadata_journals['SJR'] < lower] = 1
metadata_journals['SJR'].loc[metadata_journals['SJR'] == lower] = 1.5
metadata_journals.loc[metadata_journals['SJR'].between(lower, middle), 'SJR'] = 2.0
metadata_journals.loc[metadata_journals['SJR'].between(middle,upper), 'SJR'] = 2.5
metadata_journals.loc[metadata_journals['SJR'].between(upper, 37461), 'SJR'] = 3.5

In [42]:
metadata_journals['H_index'].describe()

count    192847.000000
mean          1.846661
std           0.715208
min           1.000000
25%           1.500000
50%           1.500000
75%           2.000000
max           3.500000
Name: H_index, dtype: float64

In [43]:
metadata_journals['SJR'].describe()

count    192847.000000
mean          1.842694
std           0.714493
min           1.000000
25%           1.500000
50%           1.500000
75%           2.000000
max           3.500000
Name: SJR, dtype: float64

In [44]:
sjr_dict = {}
h_dict = {}

for id, sjr in zip(metadata_journals['cord_uid'], metadata_journals['SJR']):
    sjr_dict[id] = sjr

for id, h_id in zip(metadata_journals['cord_uid'], metadata_journals['H_index']):
    h_dict[id] = h_id


The SJR and H index helped Fastrank however not LMAR or RF. Generally, about 88000 columns have an associated journal ranking now. The others are 

## Citation rank

# Applying the topic cites to all qrels

In [166]:
def get_cites(input_df):
    df = input_df
    # merge the fbr (pooled documents) with our metadata and remove duplicates to minimize API calls
    df = input_df.merge(metadata, left_on = "docno", right_on = "cord_uid", how = "left") 
    df = df[['qid', 'docno', 'label', 'iteration', 'cord_uid', 'title', 'doi',
                    'abstract', 'publish_time', 'authors', 'journal']]
    #fbr_meta = fbr_meta.drop_duplicates(subset='doi', keep="first")
    #fbr_meta = fbr_meta.drop_duplicates(subset='title', keep="first")
    df = df.reset_index()
    df = df.drop(columns = 'index')
    #fbr_meta.update('"' + fbr_meta[['title']].astype(str) + '"')
    df = df.drop_duplicates(subset='title', keep="first")

    cites_df = pd.DataFrame()
    for i in range(len(df)):
        try:
            title_information = requests.get(
                'https://api.openalex.org/works?filter=title.search:'+df['title'][i]
                ).json()['results'][0]
            current_title = pd.DataFrame.from_dict(title_information, orient='index')
            current_title = current_title.transpose()
            cites_df = cites_df.append(current_title)
            print("Request number:", i)
        # in case DOI and title is missing, no information is printed
        except:
            try:
                doi_information = requests.get(
                    'https://api.openalex.org/works?filter=doi:https://doi.org/'+df['doi'][i]
                    ).json()['results'][0]
                    #print('Retrieved:',fbr_meta['doi'][i])
                    # append the current document information to our main dataframe
                current_doi = pd.DataFrame.from_dict(doi_information, orient='index')
                current_doi = current_doi.transpose()
                cites_df = cites_df.append(current_doi)
                print("Request number:", i)
            except:
                print("No title or doi information on OpenAlex found")
    # finally, we are merging our OpenAlex data with our fbr and meta data by merging over the doi
    cites_df = cites_df[['doi', 'title', 'referenced_works', 'related_works', 'cited_by_count', 'counts_by_year']]
    cites_df = cites_df.reset_index()
    cites_df = cites_df.drop(columns = 'index')
    cites_df['doi'] = cites_df.doi.str.replace('https://doi.org/', '')
    #fbr_meta_cites = fbr_meta.merge(cites_df, left_on = 'title', right_on = 'title')
    return cites_df

In [46]:
qrels['docno'].drop_duplicates(keep="first")

0        005b2j4b
1        00fmeepz
2        010vptx3
3        0194oljo
4        021q9884
           ...   
69305    zn10rnrm
69308    zstmdt4n
69310    zth8ffy3
69312    zv4nbz9p
69313    zvop8bxh
Name: docno, Length: 37924, dtype: object

In [47]:
# creating the main cites dataframe
cites_df1 = pd.read_csv('../WIR-Project/data/cites_df1.csv')
cites_df2 = pd.read_csv('../WIR-Project/data/cites_df2.csv')
cites_df3 = pd.read_csv('../WIR-Project/data/cites_df3.csv')
cites_df4 = pd.read_csv('../WIR-Project/data/cites_df4.csv')

cites_df = cites_df1.append(cites_df2)
cites_df = cites_df.append(cites_df3)
cites_df = cites_df.append(cites_df4)
qrels = dataset.get_qrels()

In [48]:
# preprocessing the title to lowercase and merging the data
cites_df['title'] = cites_df['title'].str.lower()

meta = metadata
meta['title'] = meta['title'].str.lower()

meta_cites = cites_df.merge(meta, left_on = 'title', right_on = 'title', how = 'right')
meta_cites = meta_cites[['cited_by_count', 'cord_uid']]
meta_cites = meta_cites.rename(columns={"cord_uid": "docno"})

In [49]:
# Adding an indexer to the qrels. The merges create about 500 duplicated values which can not be deleted otherwise
qrels = qrels.reset_index()
qrels = qrels.rename(columns={"index":"indexer"})

In [50]:
qrels

Unnamed: 0,indexer,qid,docno,label,iteration
0,0,1,005b2j4b,2,4.5
1,1,1,00fmeepz,1,4
2,2,1,010vptx3,2,0.5
3,3,1,0194oljo,1,2.5
4,4,1,021q9884,1,4
...,...,...,...,...,...
69313,69313,50,zvop8bxh,2,5
69314,69314,50,zwf26o63,1,5
69315,69315,50,zwsvlnwe,0,5
69316,69316,50,zxr01yln,1,5


In [51]:
meta_cites = meta_cites.drop_duplicates()
merged_df = pd.merge(qrels, meta_cites, on='docno', how='left', indicator=True)

merged_df = merged_df.drop_duplicates(subset='indexer', keep="first")
merged_df = merged_df.reset_index()
merged_df = merged_df.drop(columns = 'index')

qrels = qrels.drop(columns=['indexer'])
result = qrels.eq(merged_df[['qid', 'docno', 'label', 'iteration']], axis='index')
result = result.all(axis=1)

In [55]:
# check if the dataframes are actually completely identical

a = merged_df[['qid', 'docno', 'label', 'iteration']]
b = qrels

if a.equals(b):
    print("The DataFrames are identical.")
else:
    print("The DataFrames are not identical.")

The DataFrames are identical.


In total 48306 of 69318 qrels are associated with their citations

In [56]:
merged_df['cited_by_count'].describe()

count    48306.000000
mean       169.759926
std        757.097184
min          0.000000
25%         11.000000
50%         35.000000
75%        111.000000
max      30662.000000
Name: cited_by_count, dtype: float64

In [57]:
# assigning the features impact

lower = merged_df['cited_by_count'].describe()[4]
middle = merged_df['cited_by_count'].describe()[5]
upper = merged_df['cited_by_count'].describe()[6]

merged_df['cited_by_count'] = merged_df['cited_by_count'].fillna(value=lower)
merged_df['cited_by_count'].loc[merged_df['cited_by_count'] < lower] = 1
merged_df.loc[merged_df['cited_by_count'].between(lower, middle), 'cited_by_count'] = 2
merged_df.loc[merged_df['cited_by_count'].between(middle, upper), 'cited_by_count'] = 3
merged_df['cited_by_count'].loc[merged_df['cited_by_count'] > upper] = 4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [60]:
# finally, we have to merge it again with the metadata, since some docno actually do not appear in the qrels. This is weird however, should be the minority. Other documents have the cited by count 0
docno_cites = merged_df[['docno', 'cited_by_count']]
full_data = meta.merge(docno_cites, left_on= "cord_uid", right_on = "docno", how = "outer")
full_data['cited_by_count'] = full_data['cited_by_count'].fillna(value= 0)

In [61]:
citation_dic = {}

for id, cites in zip(full_data['cord_uid'], full_data['cited_by_count']):
    citation_dic[id] = cites

# Final functions to choose which features to include in the reranking procedure

In [117]:
def dates(docno):
    raw_dates = date_dict[docno]
    return raw_dates

def sjr(docno):
    raw_sjr = sjr_dict[docno]
    return raw_sjr

def h_idx(docno):
    raw_h_idx = h_dict[docno]
    return raw_h_idx

def cites(docno):
    raw_cites = citation_dic[docno]
    return raw_cites

In [None]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f2, f3, f4]))
    return features

In [113]:
fbr = pt.FeaturesBatchRetrieve(index, controls = {"wmodel": "BM25"}, features=["WMODEL:BM25"]) 
fbr >> pt.apply.doc_features(_features)

p = fbr >> pt.apply.doc_features(_features)

p.transform("coronavirus origin")

  topics = m.transform(topics)


Unnamed: 0,qid,query,docid,rank,features,docno,score
0,1,coronavirus origin,122804,0,"[7.243034202814337, 1.5, 1.5, 3.0]",75773gwg,11.578681
1,1,coronavirus origin,122805,1,"[7.252674838560409, 1.5, 1.5, 3.0]",kn2z7lho,11.578681
2,1,coronavirus origin,122806,2,"[7.256040044023486, 1.5, 1.5, 3.0]",4fb291hq,11.578681
3,1,coronavirus origin,135326,3,"[7.256040044023486, 3.5, 3.5, 4.0]",ne5r4d4b,11.452880
4,1,coronavirus origin,187888,4,"[7.267153309588337, 1.5, 1.5, 3.0]",hl967ekh,11.428047
...,...,...,...,...,...,...,...
995,1,coronavirus origin,186498,995,"[8.965273511062476, 2.0, 2.0, 4.0]",w8cvq0m5,7.256041
996,1,coronavirus origin,10395,996,"[9.358174003925521, 2.5, 1.5, 2.0]",l8a7lzhb,7.256040
997,1,coronavirus origin,134989,997,"[7.81745407521776, 2.0, 2.5, 0.0]",nh06fp82,7.256040
998,1,coronavirus origin,73423,998,"[8.711893939558887, 1.5, 1.5, 2.0]",i758v1vb,7.252675


In [114]:
lmart_x_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
lmart_x_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

lmart_l_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
lmart_l_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

rf_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(train_topics, qrels)

ca_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(train_request, form="fastrank")
ca_pipe.fit(train_topics, qrels)

fr_rf_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(train_request_forest, form="fastrank")
fr_rf_pipe.fit(train_topics, qrels)

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

In [115]:
results = pt.Experiment([PL2, BM25, lmart_x_pipe, lmart_l_pipe, rf_pipe, ca_pipe, fr_rf_pipe], test_topics, qrels, ["ndcg", "map", "recip_rank"], 
                        names=["PL2 (Baseline)", "BM25 (base ranker)", "LMART", "LMART LightGBM", "Random Forest", "Fastrank Coordinate Ascend", "Fastrank Random Forest"], baseline = 0, filter_by_topics = True)

# Final results summary:

In [128]:
def train_models():
    fbr = pt.FeaturesBatchRetrieve(index, controls = {"wmodel": "BM25"}, features=["WMODEL:BM25"]) 
    fbr >> pt.apply.doc_features(_features)

    lmart_x_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
    lmart_x_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

    lmart_l_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
    lmart_l_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

    rf_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(rf)
    rf_pipe.fit(train_topics, qrels)

    ca_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(train_request, form="fastrank")
    ca_pipe.fit(train_topics, qrels)

    fr_rf_pipe = fbr >> pt.apply.doc_features(_features) >> pt.ltr.apply_learned_model(train_request_forest, form="fastrank")
    fr_rf_pipe.fit(train_topics, qrels)

    results = pt.Experiment([PL2, BM25, lmart_x_pipe, lmart_l_pipe, rf_pipe, ca_pipe, fr_rf_pipe], test_topics, qrels, ["ndcg", "map", "recip_rank"], 
                        names=["PL2 (Baseline)", "BM25 (base ranker)", "LMART", "LMART LightGBM", "Random Forest", "Fastrank Coordinate Ascend", "Fastrank Random Forest"], baseline = 0, filter_by_topics = True)
    return(results)

# Results using the recency ranking

In [129]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f1]))
    return features

In [130]:
results = train_models()
results

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.168598,0.450476,0.445947,2.0,8.0,0.012282,0.0,8.0,0.002137,2.0,8.0,0.014904
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.176817,0.537165,0.447239,0.0,10.0,0.006322,1.0,5.0,0.01871,1.0,9.0,0.011067
5,Fastrank Coordinate Ascend,0.321934,0.95,0.539068,5.0,5.0,0.456311,1.0,0.0,0.343436,6.0,4.0,0.068947
6,Fastrank Random Forest,0.197146,0.286493,0.451229,0.0,10.0,0.001746,0.0,10.0,1.9e-05,1.0,9.0,0.001036


# Results using the recency ranking and citation ranking

In [131]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f1, f4]))
    return features

In [132]:
results = train_models()
results

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.268962,0.5625,0.493576,5.0,5.0,0.293422,1.0,6.0,0.006738,3.0,7.0,0.203719
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.284691,0.741667,0.503122,4.0,6.0,0.290228,1.0,4.0,0.322573,4.0,6.0,0.22956
5,Fastrank Coordinate Ascend,0.323965,1.0,0.530233,5.0,5.0,0.657003,1.0,0.0,0.343436,6.0,4.0,0.631138
6,Fastrank Random Forest,0.243468,0.421786,0.471134,3.0,7.0,0.111819,1.0,7.0,0.002722,2.0,8.0,0.0362


# Results using the journal ranking and citations ranking:

In [133]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f2, f3, f4]))
    return features

In [134]:
results = train_models()
results

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.240453,0.524167,0.476597,4.0,6.0,0.11907,1.0,6.0,0.007108,3.0,7.0,0.070523
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.260412,0.758333,0.492577,2.0,8.0,0.126203,1.0,4.0,0.359013,2.0,8.0,0.125132
5,Fastrank Coordinate Ascend,0.300243,0.95,0.51614,4.0,6.0,0.613932,1.0,0.0,0.343436,6.0,4.0,0.594724
6,Fastrank Random Forest,0.230142,0.472248,0.461707,3.0,7.0,0.095253,0.0,6.0,0.014404,3.0,7.0,0.032159


# Results using all features:

In [135]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f1, f2, f3, f4]))
    return features

In [136]:
results = train_models()
results

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.268962,0.5625,0.493576,5.0,5.0,0.293422,1.0,6.0,0.006738,3.0,7.0,0.203719
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.282913,0.648333,0.500921,5.0,5.0,0.292225,1.0,4.0,0.052567,3.0,7.0,0.173703
5,Fastrank Coordinate Ascend,0.312918,0.85,0.521301,5.0,5.0,0.982266,1.0,2.0,0.694982,6.0,4.0,0.895849
6,Fastrank Random Forest,0.25058,0.659401,0.479832,4.0,6.0,0.219739,1.0,3.0,0.094922,4.0,6.0,0.136605


# Results using only the citation ranking:

In [137]:
def _features(row):
    f1 = dates(row["docno"])
    f2 = sjr(row["docno"])
    f3 = h_idx(row["docno"])
    f4 = cites(row["docno"])
    features = np.append(row['features'], np.array([f4]))
    return features

In [139]:
results = train_models()
results

Parameters: { "num_round", "verbose" } are not used.





[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[2]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[3]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[4]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[5]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[6]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[7]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.133333	valid_0's ndcg@3: 0.148976	valid_0's ndcg@4: 0.157555	valid_0's ndcg@5: 0.150003
[8]	valid_0's ndcg@1: 0.133

Unnamed: 0,name,map,recip_rank,ndcg,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,ndcg +,ndcg -,ndcg p-value
0,PL2 (Baseline),0.313517,0.909091,0.523699,,,,,,,,,
1,BM25 (base ranker),0.307945,0.925,0.527753,7.0,3.0,0.551904,1.0,0.0,0.343436,6.0,4.0,0.461508
2,LMART,0.240453,0.524167,0.476597,4.0,6.0,0.11907,1.0,6.0,0.007108,3.0,7.0,0.070523
3,LMART LightGBM,0.132631,0.480101,0.421087,0.0,10.0,0.004582,0.0,7.0,0.00652,1.0,9.0,0.003567
4,Random Forest,0.253948,0.654444,0.483952,2.0,8.0,0.095582,1.0,5.0,0.177202,2.0,8.0,0.060403
5,Fastrank Coordinate Ascend,0.300818,1.0,0.517634,4.0,6.0,0.632396,1.0,0.0,0.343436,4.0,6.0,0.672087
6,Fastrank Random Forest,0.256256,0.403535,0.47584,2.0,8.0,0.080719,0.0,8.0,0.000491,1.0,9.0,0.018443


# Creating a Knowledge Graph out of qrel 1: Coronavirus Origin

In [246]:
df = pd.DataFrame(qrels)

In [249]:
df = df[df['qid'] == '1']
merged = pd.merge(df, metadata, how = 'left', left_on = 'docno', right_on='cord_uid')
merged = merged[['title', 'source_x']]

In [254]:
topic_df = pd.DataFrame()
#for i in range(0, len(merged)):
    try:
        title_information = requests.get(
           'https://api.openalex.org/works?filter=title.search:'+merged['title'][i]
            ).json()['results'][0]
        current_title = pd.DataFrame.from_dict(title_information, orient='index')
        current_title = current_title.transpose()
        topic_df = topic_df.append(current_title)
        print(i)
    except:
        print("No title found")

0
1
No title found
3
4
5
No title found
7
8
9
No title found
11
12
13
14
15
16
17
No title found
No title found
20
No title found
22
23
24
25
No title found
27
28
29
30
31
32
33
No title found
35
36
37
38
39
40
No title found
No title found
No title found
44
45
46
No title found
No title found
No title found
50
51
No title found
53
No title found
55
56
57
58
59
60
No title found
62
No title found
64
65
66
67
68
69
70
71
72
73
No title found
No title found
76
No title found
78
79
No title found
81
82
83
84
85
86
87
88
89
90
91
No title found
No title found
94
95
No title found
97
98
99
100
101
102
103
104
105
106
107
No title found
No title found
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
No title found
126
127
128
129
130
131
132
133
134
135
136
No title found
138
139
140
141
No title found
No title found
144
145
146
No title found
148
149
No title found
No title found
152
153
154
No title found
156
157
158
159
160
161
162
No title found
164
No title found
No title fou

In [257]:
topic_df = topic_df[['title', 'publication_year', 'concepts']]
for i in range(len(topic_df)):
    try:
        topic_df.iloc[i, topic_df.columns.get_loc('concepts')] = topic_df.iloc[i]['concepts'][0]['display_name']
    except:
        topic_df.iloc[i, topic_df.columns.get_loc('concepts')] = "Unknown"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [275]:
topic_df = topic_df.reset_index(drop=True)


In [279]:
topic_df = topic_df.reset_index(drop=True)
topic_df.to_csv(r'../WIR-Project/data/graph.csv', index=False, header=True) 