In [None]:
import collections
import litstudy #Use pip install git+https://github.com/NLeSC/litstudy to download the dev version. Otherwise there is an encoding problem when loading ris files (load_ris_file needs to use robust_open instead of open)
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import re
import shutil
from pyzotero import zotero

#Read and compile references from the WoS search
#into a single document set (lit_study format)
def rpickle_bibdocset(in_dirpath, in_pattern, out_pickle):
    """Load and merge BibTeX files into one reference list, cached as a pickle.

    If ``out_pickle`` already exists it is unpickled and returned without
    reading any BibTeX file. Otherwise every direct child of ``in_dirpath``
    whose full path string matches ``in_pattern`` is loaded with
    ``litstudy.load_bibtex``, the results are concatenated into one list, and
    that list is pickled to ``out_pickle`` for later runs.

    Args:
        in_dirpath (pathlib.Path): Directory whose direct children are scanned.
        in_pattern (str): Regular expression applied with ``re.match`` (so it
            is anchored at the start) to ``str(path)`` of each child.
        out_pickle (pathlib.Path): Location of the pickle cache.

    Returns:
        The merged reference list. When freshly built, documents appear in
        sorted order of the files they came from; when read from the cache,
        the object is returned exactly as it was pickled.
    """
    if out_pickle.exists():
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files that were produced by this function.
        with open(out_pickle, 'rb') as f:
            return pickle.load(f)

    # Compile once instead of once per directory entry.
    pattern = re.compile(in_pattern)
    # glob() yields entries in filesystem order; sort so that the order of the
    # merged references is reproducible across runs and machines.
    bib_paths = sorted(p for p in in_dirpath.glob('*') if pattern.match(str(p)))

    # Read bib files and join them (takes ~15-20 sec/1000 refs)
    reflist = []
    for bib_path in bib_paths:
        reflist += litstudy.load_bibtex(bib_path)

    # Save the full document set as a binary file that can be reloaded quickly
    with open(out_pickle, 'wb') as f:
        pickle.dump(reflist, f)
    return reflist
            
#Get titles and dois from zotero test list
def get_testlist(library_id, api_key_path):
    """Fetch title and DOI of every top-level item in the Zotero 'test list'.

    Connects to the Zotero group library ``library_id`` with the API key read
    from ``api_key_path``, looks up the top-level collection named
    ``'test list'`` and retrieves all of its top-level items.

    Args:
        library_id: Identifier of the Zotero group library.
        api_key_path (pathlib.Path): Text file holding the Zotero API key;
            surrounding whitespace is stripped.

    Returns:
        collections.defaultdict: Maps each item key to ``[title, doi]``, where
        ``doi`` is ``np.nan`` for items whose data has no ``'DOI'`` field.

    Raises:
        IndexError: If the library has no top-level collection named
            ``'test list'``.
        KeyError: If an item's data has no ``'title'`` field.
    """
    api_key = api_key_path.read_text().strip()
    zot = zotero.Zotero(library_id=library_id, library_type='group', api_key=api_key)

    collection_keys = [col['key'] for col in zot.collections_top()
                       if col['data']['name'] == 'test list']
    if not collection_keys:
        # Same exception type as indexing an empty list, but with a message
        # that says what is actually missing.
        raise IndexError(
            "no top-level Zotero collection named 'test list' "
            "in group library {}".format(library_id))
    # If several collections share the name, the first one returned is used.
    testlist_items = zot.everything(zot.collection_items_top(collection_keys[0]))

    testlist_title_dois = collections.defaultdict(list)
    for item in testlist_items:
        data = item['data']
        testlist_title_dois[item['key']].extend(
            [data['title'], data.get('DOI', np.nan)])
    return testlist_title_dois

    
#Get all dois and titles in references returned from search
def tabulate_searchlist(in_reflist, out_csvpath):
    """Tabulate title, source, year, abstract and DOI of references, cached as CSV.

    If ``out_csvpath`` already exists it is read back and returned without
    looking at ``in_reflist``. Otherwise one row is built per reference, the
    table is written to ``out_csvpath`` (including the row index) and returned.

    Titles are normalised: newlines become spaces, the text is lower-cased and
    every character that is not an ASCII letter, a digit or whitespace is
    removed.

    Args:
        in_reflist: Iterable of reference objects exposing ``title``,
            ``publication_source``, ``publication_year``, ``abstract`` and a
            mapping ``entry`` that may contain a ``'doi'`` key.
        out_csvpath (pathlib.Path): Location of the CSV cache.

    Returns:
        pandas.DataFrame: Columns ``title``, ``source``, ``year``,
        ``abstract``, ``doi`` (``np.nan`` where the entry has no DOI), indexed
        by position in ``in_reflist``. Values loaded from the cache are
        re-parsed by pandas, so their dtypes may differ from a fresh table.
    """
    columns = ['title', 'source', 'year', 'abstract', 'doi']

    if out_csvpath.exists():
        # The cache is written with the row index as its first column; read it
        # back as the index so that both branches return the same five columns
        # instead of gaining an extra 'Unnamed: 0' column.
        return pd.read_csv(out_csvpath, index_col=0)

    # Raw string: "\d" and "\s" are not valid escapes in a normal literal.
    unwanted_chars = re.compile(r"[^a-zA-Z\d\s]")

    rows = []
    for ref in in_reflist:
        title = unwanted_chars.sub("", ref.title.replace('\n', ' ').lower())
        doi = ref.entry['doi'] if 'doi' in ref.entry else np.nan
        rows.append([title, ref.publication_source, ref.publication_year,
                     ref.abstract, doi])

    # Passing the column names to the constructor also works for an empty
    # reference list, where assigning five names afterwards would fail.
    reflist_pd = pd.DataFrame(rows, columns=columns)
    reflist_pd.to_csv(out_csvpath)
    return reflist_pd