In [1]:
import pandas as pd
from collections import OrderedDict

In [2]:
# Load the survey spreadsheet (tab-separated) and display its column names
# so the headers used by later cells can be checked against the file.
data = pd.read_csv(
    "../multilingual-dataset-survey/standardization.tsv",
    sep="\t",
)
data.columns

Index(['dataset name', 'title', 'link to paper', 'data link',
       'motivation of the paper writer (how they were originally intended)',
       'task type', 'has train data?',
       'data size (rough avg # of examples PER language, excluding english)',
       'input data source ', 'crowdsource platforms / background (if any)',
       'original language', 'input data - automatic processing',
       'translation ', 'label source',
       'label language (at collection time / language used by annotators)',
       'language', 'publication year', 'published venue',
       'reusing existing datasets?', 'who created the dataset?', '# citation',
       'in_huggingface', 'dataset released?', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')

In [3]:
# Build two lookups from the survey table:
#   title2link : paper title -> value of that row's 'link to paper' column
#   yrs2titles : publication year -> set of paper titles, newest year first
title2link = {}
d = OrderedDict()
for index, row in data.iterrows():
    title = row['title']
    # Rows without a title carry no paper; pd.isna detects the missing value
    # directly instead of comparing its string form to 'nan'.
    if pd.isna(title):
        continue
    # NOTE(review): '156' is presumably a count/summary row in the sheet
    # rather than a real paper title -- confirm against the TSV.
    if str(title) == '156':
        continue
    title2link[title] = row['link to paper']
    year = row['publication year']
    # A missing year would become a NaN dict key: NaN does not order
    # consistently under sorted() and int(NaN) raises ValueError when the
    # year heading is rendered, so such rows are left out of the grouping.
    if pd.isna(year):
        continue
    d.setdefault(year, set()).add(title)
# Newest publication year first.
yrs2titles = OrderedDict(sorted(d.items(), reverse=True))
yrs2titles.keys()

odict_keys([2021.0, 2020.0, 2019.0, 2018.0, 2017.0, 2016.0, 2015.0, 2014.0, 2013.0, 2011.0, 2010.0, 2008.0])

In [4]:
# Front-matter header of the generated page; later cells append the
# per-year paper list to this string.
md_string = "\n".join([
    '---',
    'layout: archive',
    'title: "Papers"',
    'permalink: /datalist/',
    'author_profile: false',
    '---',
    '',  # trailing empty item yields the final newline after the closing '---'
])

In [5]:
# Append one "## <year>" section per publication year (in the order of
# yrs2titles), each followed by one markdown link per paper title.
# Note: this cell extends md_string in place, so running it again without
# first re-running the cell that defines md_string appends the list twice.
for yr in yrs2titles:
    md_string += f'## {int(yr)}\n\n'
    # The titles are stored in a set, whose iteration order for strings can
    # change from one interpreter run to the next; sorting keeps the
    # generated page stable. Case-insensitive order, with the exact title as
    # a tie-breaker so the result is fully deterministic.
    for title in sorted(yrs2titles[yr], key=lambda t: (str(t).casefold(), str(t))):
        md_string += f'[{title}]({title2link[title]})\n\n'
    md_string += '\n\n'

In [6]:
# Write the generated markdown page, replacing any previous version.
# The encoding is pinned to UTF-8 so titles containing non-ASCII characters
# are written identically regardless of the platform's default encoding;
# plain 'w' is used because the file is never read back through this handle.
with open('_pages/datalist.md', 'w', encoding='utf-8') as fout:
    fout.write(md_string)