In [3]:
# Import necessary libraries
import collections
from inspect import getsourcefile
import itertools
import litstudy  # Use pip install git+https://github.com/NLeSC/litstudy to download dev version
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pickle
from pyzotero import zotero
import re
import requests
import shutil
from urllib.parse import urlparse

In [1]:
# Read and compile references from the WoS search into a single document set (lit_study format)
def rpickle_bibdocset(in_dirpath, in_pattern, out_pickle):
    """Load references from matching BibTeX files, caching the result as a pickle.

    If ``out_pickle`` already exists it is unpickled and returned; the input
    directory is not read at all. Otherwise every entry of ``in_dirpath``
    whose full path string matches ``in_pattern`` is read with
    ``litstudy.load_bibtex``, the results are concatenated into a single
    list, and that list is pickled to ``out_pickle`` before being returned.

    Args:
        in_dirpath: ``pathlib.Path`` of the directory holding the bib files.
        in_pattern: Regular expression applied with ``re.match`` (i.e.
            anchored at the start) to ``str(path)`` of each directory entry.
        out_pickle: ``pathlib.Path`` of the pickle cache file.

    Returns:
        list: The combined references (an empty list if no file matched).
    """
    if out_pickle.exists():
        # Read pre-saved document set.
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files produced by this function.
        with open(out_pickle, 'rb') as f:
            return pickle.load(f)

    # Compile once instead of once per directory entry.
    bib_regex = re.compile(in_pattern)
    bib_paths = [p for p in in_dirpath.glob('*') if bib_regex.match(str(p))]

    # Read the bib files and join them (takes ~15-20 sec/1000 refs)
    reflist = []
    for bib in bib_paths:
        reflist += litstudy.load_bibtex(bib)

    # Save the full document set as a binary file that can be retrieved quickly
    with open(out_pickle, 'wb') as f:
        pickle.dump(reflist, f)
    return reflist

# Get titles and DOIs from Zotero test list
def get_testlist(library_id, api_key_path):
    """Collect the title and DOI of every top-level item in the 'test list' collection.

    The API key is read from ``api_key_path`` (surrounding whitespace
    stripped) and used to open the Zotero group library ``library_id``.
    The first top-level collection named exactly ``'test list'`` is looked
    up and all of its top-level items are retrieved.

    Args:
        library_id: Identifier of the Zotero group library.
        api_key_path: Path-like object exposing ``read_text()`` that holds
            the Zotero API key.

    Returns:
        collections.defaultdict: Maps each item key to ``[title, doi]``,
        where ``doi`` is ``numpy.nan`` when the item data has no ``'DOI'``
        field.

    Raises:
        IndexError: If no top-level collection is named ``'test list'``.
    """
    key = api_key_path.read_text().strip()
    client = zotero.Zotero(library_id=library_id, library_type='group', api_key=key)

    # Keys of all top-level collections carrying the expected name; the
    # first one is used.
    matching_keys = [
        collection['key']
        for collection in client.collections_top()
        if collection['data']['name'] == 'test list'
    ]
    collection_key = str(matching_keys[0])
    items = client.everything(client.collection_items_top(collection_key))

    testlist_title_dois = collections.defaultdict(list)
    for item in items:
        entry = testlist_title_dois[item['key']]
        entry.append(item['data']['title'])
        # Items without a DOI field get NaN as a placeholder.
        entry.append(item['data'].get('DOI', np.nan))
    return testlist_title_dois

# Get all DOIs and titles in references returned from search
def tabulate_searchlist(in_reflist, out_csvpath):
    """Tabulate title, source, year, abstract and DOI of each reference, cached as CSV.

    If ``out_csvpath`` already exists it is read back and returned, and
    ``in_reflist`` is ignored. Otherwise one row is built per reference,
    the table is written to ``out_csvpath`` and returned.

    Titles are normalised: newlines become spaces, the text is lower-cased,
    and every character that is not an ASCII letter, a digit or whitespace
    is removed.

    Args:
        in_reflist: Iterable of reference objects exposing ``title``,
            ``publication_source``, ``publication_year``, ``abstract`` and
            a mapping ``entry`` that may contain a ``'doi'`` key.
        out_csvpath: ``pathlib.Path`` of the CSV cache file.

    Returns:
        pandas.DataFrame: Columns ``title``, ``source``, ``year``,
        ``abstract``, ``doi`` (``NaN`` where the entry has no DOI), indexed
        by the position of the reference in ``in_reflist``.
    """
    columns = ['title', 'source', 'year', 'abstract', 'doi']

    if out_csvpath.exists():
        # to_csv below stores the row index as the first column; read it
        # back as the index so the cached table has the same columns as a
        # freshly built one (otherwise an extra 'Unnamed: 0' column appears).
        return pd.read_csv(out_csvpath, index_col=0)

    rows = {}
    for i, ref in enumerate(in_reflist):
        clean_title = re.sub(r"[^a-zA-Z\d\s]", "", ref.title.replace('\n', ' ').lower())
        doi = ref.entry['doi'] if 'doi' in ref.entry else np.nan
        rows[i] = [clean_title, ref.publication_source, ref.publication_year,
                   ref.abstract, doi]

    # Passing the column names here (rather than assigning them afterwards)
    # also yields a well-formed empty table when there are no references.
    reflist_pd = pd.DataFrame.from_dict(rows, orient='index', columns=columns)
    reflist_pd.to_csv(out_csvpath)
    return reflist_pd

# Write string y to file x
def write(x, y):
    """Append string ``y`` followed by a newline to the file at path ``x``.

    The file is opened in append mode, so it is created if missing and
    existing content is kept.

    Args:
        x: Path of the file to append to.
        y: Text to write; a newline is added after it.

    Returns:
        None
    """
    with open(x, 'a') as f:
        f.write(y)
        f.write('\n')
    # The original returned the bare name `_`, which only exists in an
    # interactive IPython session and raises NameError elsewhere.
    return None

def combine_2w_regex(pattern1, pattern2, precede=False):
    """Build a regex matching two patterns separated by one non-word character.

    The ordered form is ``pattern1``, a single ``\\W``, ``pattern2`` and a
    trailing word boundary. There is no leading word boundary, so
    the first pattern may match at the end of a longer word.

    Args:
        pattern1: Regex source for the first pattern.
        pattern2: Regex source for the second pattern.
        precede: If true, only ``pattern1`` followed by ``pattern2`` is
            matched. If false (default), either order is matched, with each
            order wrapped in its own capturing group.

    Returns:
        str: The combined regular expression source.
    """
    ordered = f"{pattern1}\\W{pattern2}\\b"
    if precede:
        return ordered
    return f"({ordered})|({pattern2}\\W{pattern1}\\b)"

# Count the number of times a simple 2-pattern group occurs in text
def find_2w_regex(text, pattern1, pattern2, precede=False):
    """Count occurrences in ``text`` of two patterns separated by one non-word character.

    The regular expression is built by ``combine_2w_regex`` so that both
    functions always agree on the pattern being searched.

    Args:
        text: String to search.
        pattern1: Regex source for the first pattern.
        pattern2: Regex source for the second pattern.
        precede: If true, ``pattern1`` must come first and ``pattern2``
            second. If false (default), either order counts.

    Returns:
        int: Number of non-overlapping matches found by ``re.findall``.
    """
    regexp = combine_2w_regex(pattern1, pattern2, precede=precede)
    return len(re.findall(regexp, text))

# Count the number of times a simple pattern occurs in text
def find_regex(text, pattern):
    """Return the number of non-overlapping matches of ``pattern`` in ``text``.

    Args:
        text: String to search.
        pattern: Regular expression (source string or compiled pattern).

    Returns:
        int: Number of matches found by ``re.findall``.
    """
    # The original interpolated the undefined name `word` here, so every
    # call raised NameError; the parameter is `pattern`.
    return len(re.findall(pattern, text))

# Join all strings in a list with | signs and parentheses
def recomb(in_str, recomb_sep):
    """Combine a list of patterns into one parenthesised group.

    Each list element is wrapped in parentheses, the wrapped elements are
    joined with ``recomb_sep``, and the whole result is wrapped in one more
    pair of parentheses. Anything that is not a ``list`` is returned
    unchanged.

    Args:
        in_str: A list of pattern strings, or any other value.
        recomb_sep: Separator placed between the wrapped elements
            (e.g. ``'|'``).

    Returns:
        The combined string for a list input; otherwise ``in_str`` itself.
    """
    if not isinstance(in_str, list):
        return in_str
    wrapped = [f'({w})' for w in in_str]
    joined = recomb_sep.join(wrapped)
    return f"({joined})"

# Find patterns in text based on search dictionary
def combo_refind(in_searchdict, text):
    """Count matches in ``text`` for the first entry of a search dictionary.

    Each value of ``in_searchdict`` is indexed as ``[operator, patterns]``:

    * ``'with'`` - ``patterns[0]`` and ``patterns[1]`` adjacent in either
      order (``find_2w_regex`` with ``precede=False``).
    * ``'pre'`` - ``patterns[0]`` immediately followed by ``patterns[1]``
      (``find_2w_regex`` with ``precede=True``).
    * ``None`` - a single pattern counted with ``find_regex``.

    NOTE(review): as in the original code, only the FIRST value of the
    dictionary is evaluated, because every branch either returns or stops
    the loop. If the intent was to aggregate over all entries, this needs a
    deliberate redesign of the return value - confirm with the callers.

    Args:
        in_searchdict: Mapping whose values follow the layout above.
        text: String to search.

    Returns:
        int | None: The match count for the first entry, or ``None`` when
        the dictionary is empty or the first operator is not recognised.
    """
    for regexp_combo in in_searchdict.values():
        operator = regexp_combo[0]
        if operator == 'with':
            return find_2w_regex(text, regexp_combo[1][0], regexp_combo[1][1], precede=False)
        if operator == 'pre':
            return find_2w_regex(text, regexp_combo[1][0], regexp_combo[1][1], precede=True)
        if operator is None:
            # The original passed the whole combo to find_regex, which is
            # not a pattern. Assumes the pattern sits at index 1, parallel
            # to the other operators - TODO confirm.
            return find_regex(text, regexp_combo[1])
        # Unknown operator: stop without a result.
        break
    return None

# Generate n-grams from DOI
#CHECK OUT: from pattern.en import ngrams
#print(ngrams("He goes to hospital", n=2))
# Punctuation removed from text before tokenising: ' ? . , :
_NGRAM_STRIP_TABLE = str.maketrans('', '', "'?.,:")


def _text_ngram_counts(text):
    """Return ``(ngram, count)`` pairs for all 2- to 4-grams of ``text``, most common first."""
    cleaned = text.translate(_NGRAM_STRIP_TABLE).lower()
    tokens = nltk.word_tokenize(cleaned)
    return collections.Counter(nltk.everygrams(tokens, 2, 4)).most_common()


def DOI_ngram(A):
    """Count word n-grams and terms in the three parts of ``A``.

    ``A[0]`` and ``A[1]`` are each treated as text: the characters
    ``' ? . , :`` are removed, the text is lower-cased and tokenised with
    ``nltk.word_tokenize``, and every 2-, 3- and 4-gram is counted.
    ``A[2]`` is iterated as a collection of groups of strings; the
    lower-cased strings are counted across all groups.

    NOTE(review): this function needs ``nltk`` in module scope, but it is
    not among the imports visible at the top of this file - confirm it is
    imported before this is called.

    Args:
        A: Indexable with two strings at positions 0 and 1 and an iterable
            of iterables of strings at position 2 (presumably title,
            abstract and keyword lists - verify against the caller).

    Returns:
        list: ``(item, count)`` tuples - the n-gram counts of ``A[0]``,
        then those of ``A[1]``, then the term counts of ``A[2]``, each
        section ordered from most to least common.
    """
    count0 = _text_ngram_counts(A[0])
    count1 = _text_ngram_counts(A[1])

    term_counts = collections.Counter(
        term.lower() for group in A[2] for term in group
    )
    count2 = term_counts.most_common()

    return count0 + count1 + count2