## Section 1: Load OpenData file

In [None]:
from pandas import read_csv

# Load the tab-separated OpenData table.
opendata = read_csv('opendata.tsv', sep='\t')

# Derive a bare DOI for every entry by removing the resolver prefix from its
# paper URL; this column is what the RIS records are matched against later.
opendata['doi'] = opendata.paper_url.map(lambda url: url.replace('https://doi.org/',''))

# List the DOIs, one per line.
for entry in opendata['doi']:
    print(entry)

In [None]:
# Display the (rows, columns) shape of the OpenData table.
opendata.shape

## Section 2: RIS file

In [None]:
import rispy
from pathlib import Path
from pandas import DataFrame
# Parse the RIS export into a table with one row per reference.
p = Path('references.ris')
ris = DataFrame(rispy.load(p, encoding='utf-8'))

# Keywords are optional: if no record carries any, the column does not exist
# at all and the selection below would raise KeyError, so create it first.
if 'keywords' not in ris:
    ris['keywords'] = ''

# Keep only the fields used downstream. `.copy()` makes the result an
# independent frame so the column assignments below do not act on a slice.
ris = ris[['authors','title','secondary_title','year','date','abstract','doi','keywords']].copy()

# Flatten list-valued cells into ';'-separated strings (this also makes the
# rows hashable for `drop_duplicates`).
ris['authors'] = [';'.join(a) for a in ris.authors]
# The list check must be done per cell: `ris['keywords']` itself is a Series,
# never a list, so testing the column as a whole always discarded every
# keyword. Records without keywords (NaN or '') become an empty string.
ris['keywords'] = [';'.join(k) if isinstance(k, list) else '' for k in ris.keywords]
ris = ris.drop_duplicates()

# Keep only references that also appear in the OpenData table, attaching its
# columns to each matching record.
ris = ris.merge(opendata, on='doi', how='inner')

In [None]:
# Display the (rows, columns) shape of the merged RIS/OpenData table.
ris.shape

In [None]:
# Show the OpenData rows whose DOI does not occur in `ris`, i.e. the entries
# for which no RIS record was matched.
opendata[~opendata.doi.isin(ris.doi)]

## Section 3: Write files

In [None]:
import os, string
import numpy as np
from os.path import dirname
from unidecode import unidecode
# ROOT_DIR = dirname(dirname(os.path.realpath(__file__)))
ROOT_DIR = '..'
POST_DIR = os.path.join(ROOT_DIR, 'source', '_posts')


def _surname(author):
    """Return the ASCII, hyphenated family name from a 'Last, First' author string."""
    return unidecode(author.split(',')[0].strip().replace(' ','-'))


def _unique_stem(stem, taken):
    """Return `stem`, or `stem` plus the first free letter a-z, that is not in `taken`."""
    if stem not in taken:
        return stem
    for letter in string.ascii_lowercase:
        if stem + letter not in taken:
            return stem + letter
    raise RuntimeError(f'no free post name left for {stem!r} (suffixes a-z all in use)')


def _format_date(date, year):
    """Return a zero-padded 'YYYY/MM/DD' string built from a '/'-separated date.

    Only the leading numeric fields are used (at most three); a missing month
    or day defaults to 1, and `year` is used when `date` has no usable field.
    """
    fields = date.split('/')[:3] if isinstance(date, str) else []
    numbers = []
    for field in fields:
        # Stop at the first empty or non-numeric field (e.g. a trailing '/').
        if not field.strip().isdecimal():
            break
        numbers.append(int(field))
    if not numbers:
        numbers = [int(year)]
    numbers += [1] * (3 - len(numbers))
    return '/'.join('%0.2d' % n for n in numbers)


def _yaml_quote(text):
    """Wrap `text` in YAML single quotes, doubling any embedded single quote."""
    return "'" + str(text).replace("'", "''") + "'"


## Define current files.
posts = sorted([f for f in os.listdir(POST_DIR) if f.endswith('.md')])

# Names (without '.md') that are already used, including the ones written
# during this run, so two references by the same author and year cannot
# overwrite each other.
taken = {os.path.splitext(f)[0] for f in posts}

for _, row in ris.iterrows():

    ## Define authors.
    # Split into individual authors first, so a first author without a comma
    # does not swallow the next author's family name.
    authors = row.authors.split(';')
    first_author = _surname(authors[0])
    year = row.year

    ## Define post name (append a letter if the name is already taken).
    fname = _unique_stem(first_author.lower() + year, taken)
    taken.add(fname)
    fname += '.md'

    ## Define post title.
    if len(authors) == 1:
        title = f'{first_author} ({year})'
    elif len(authors) == 2:
        # Transliterated like the first author, for a consistent title.
        second_author = _surname(authors[1])
        title = f'{first_author} & {second_author} ({year})'
    else:
        title = f'{first_author} et al. ({year})'

    ## Define date.
    date = _format_date(row.date, year)

    ## Define keywords.
    # Blank entries are dropped: ''.split(';') yields [''], which would
    # otherwise be written as a single empty tag.
    keywords = [k.strip().lower() for k in row.keywords.split(';') if k.strip()]

    ## Write file.
    # Explicit UTF-8 so non-ASCII names and abstracts are written the same way
    # on every platform.
    with open(os.path.join(POST_DIR, fname), 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write(f'title: {title}\n')
        f.write(f'subtitle: {_yaml_quote(row.title)}\n')
        f.write(f"date: {date}\n")
        f.write('authors:\n')
        for author in authors: f.write(f'- {author}\n')
        # Quoted because journal names may contain ': ', which is not valid in
        # a plain YAML scalar.
        f.write(f'journal: {_yaml_quote(row.secondary_title)}\n')
        f.write(f'paper_url: {row.paper_url}\n')
        f.write(f'data_url: {row.data_url}\n')
        if keywords:
            f.write('tags:\n')
            for keyword in keywords: f.write(f'- {keyword}\n')
        # f.write('sample_size:\n')
        f.write('---\n')
        f.write(f'\n{row.abstract}\n')

In [None]:
# Inspect `row`, which is still bound to the last record iterated by the
# post-writing loop (handy for debugging a failed or suspicious entry).
row