<a href="https://colab.research.google.com/github/petermr/docanalysis/blob/main/software_papers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a corpus

In [1]:
!pip install pygetpapers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from pprint import pprint
from pygetpapers import Pygetpapers
from lxml import etree

In [9]:
endpoint_url = "https://query.wikidata.org/sparql"

query = """SELECT ?item ?itemLabel ?prop ?propLabel ?PMCID  WHERE {
  ?item wdt:P31 wd:Q13442814.
  ?item wdt:P932 ?PMCID.
  {?item ?prop wd:Q513297.}
  UNION 
  {?item ?prop wd:Q181596.}
  UNION
  {?item ?prop wd:Q3699942.}
  
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

list_of_result_bindings = (results['results']['bindings'])

list_of_PMCID = []
for result in list_of_result_bindings:
    list_of_PMCID.append(result["PMCID"]["value"])
print(list_of_PMCID)

['3651407', '3651407', '403769', '403769', '2666812', '2666812', '2844990', '2844990', '2638934', '2638934', '4752027', '4752027', '3855388', '3855388', '4174321', '4174321', '3685583', '3685583', '3031041', '3031041', '3649846', '3649846', '4168754', '4168754', '3835703', '3835703', '3530913', '3530913', '3521214', '3521214', '4184317', '4184317', '4426829', '4426829', '4535771', '4535771', '4909101', '4909101', '3722524', '3722524', '2375448', '2375448', '2572702', '2572702', '2693143', '2693143', '2752617', '2752617', '3009535', '3009535', '2796819', '2796819', '2910028', '2910028', '2958747', '2958747', '2971582', '2971582', '2982160', '2982160', '5472867', '5472867', '3065683', '3065683', '3190406', '3190406', '3228863', '3228863', '3262844', '3262844', '4141640', '4141640', '2935447', '2935447', '3307116', '3307116', '3514294', '3514294', '4183485', '4183485', '3430699', '3430699', '3599859', '3599859', '3582273', '3582273', '3646686', '3646686', '4271147', '4271147', '4277354', 

In [10]:
set_of_PMCID = set(list_of_PMCID)
list_of_unique_PMCID = list(set_of_PMCID)
PMCIDs = []
for id in list_of_unique_PMCID:
  PMCIDs.append( "".join(["PMC", str(id)]))

print(PMCIDs)


['PMC6717989', 'PMC6880260', 'PMC5089131', 'PMC403769', 'PMC6628208', 'PMC4411368', 'PMC4236746', 'PMC3888149', 'PMC4168749', 'PMC5167065', 'PMC3722524', 'PMC5472867', 'PMC2530886', 'PMC3307116', 'PMC226690', 'PMC4184317', 'PMC4287948', 'PMC6039917', 'PMC5260057', 'PMC4499222', 'PMC4670012', 'PMC3262844', 'PMC4288400', 'PMC2971582', 'PMC4224201', 'PMC4826505', 'PMC6203375', 'PMC6392199', 'PMC2844990', 'PMC2638934', 'PMC6492420', 'PMC4642660', 'PMC4708103', 'PMC7194485', 'PMC2796819', 'PMC3646686', 'PMC3521214', 'PMC4743236', 'PMC4380025', 'PMC4410647', 'PMC5624584', 'PMC3599859', 'PMC4816032', 'PMC3228863', 'PMC6800166', 'PMC1008230', 'PMC3031041', 'PMC4168754', 'PMC3065683', 'PMC2693143', 'PMC3685583', 'PMC7718173', 'PMC7584444', 'PMC3083346', 'PMC2910028', 'PMC3307106', 'PMC6051191', 'PMC3530913', 'PMC2572702', 'PMC4426829', 'PMC5998007', 'PMC4111116', 'PMC4752027', 'PMC5588246', 'PMC4828368', 'PMC3514294', 'PMC4271147', 'PMC2427162', 'PMC5638226', 'PMC5978649', 'PMC4497953', 'PMC630

In [11]:
query = " OR ".join(PMCIDs)

In [12]:
pygetpapers_call = Pygetpapers ()
pygetpapers_call.run_command(query=query, limit = 300, output = "software_related_papers", xml=True)

INFO: Total Hits are 138
138it [00:00, 127492.06it/s]
INFO: Saving XML files to /content/software_related_papers/*/fulltext.xml
100%|██████████| 138/138 [00:36<00:00,  3.82it/s]


## Creating a software dictionary

In [17]:
endpoint_url = "https://query.wikidata.org/sparql"

query = """#title: Software used in publications
SELECT
  DISTINCT 
    ?softwareLabel
    ?software

WHERE {
  {?software wdt:P277 [] .}
  UNION
  {?software wdt:P306 [] .}
  UNION
  {?software wdt:P1324 [] .}
  ?paper (wdt:P921|wdt:P4510) ?software .
  ?paper wdt:P932 [].
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY ASC(?softwareLabel)"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)



In [31]:
list_of_binding_results= results["results"]["bindings"]


In [30]:
dictionary_element=  etree.Element("software_dictionary")
dictionary_element.attrib['title']="software_dictionary"
for result in list_of_binding_results:
    try:
      entry_element=etree.SubElement(dictionary_element,"entry")
      entry_element.attrib['term']=result['softwareLabel']['value']
      entry_element.attrib['WikdiataURL']=result['software']['value']
    except Exception as e:
      (f"Couldn't add {result['softwareLabel']['value']} to amidict")


In [29]:
with open("software_dictionary",mode='w', encoding='utf-8') as f:
    f.write(etree.tostring(dictionary_element, pretty_print=True).decode('utf-8'))

In [43]:
!pip install git+https://github.com/petermr/docanalysis.git#egg=docanalysis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docanalysis
  Cloning https://github.com/petermr/docanalysis.git to /tmp/pip-install-q05rshad/docanalysis_21e2c5e666714de4800f446899e1e2cc
  Running command git clone -q https://github.com/petermr/docanalysis.git /tmp/pip-install-q05rshad/docanalysis_21e2c5e666714de4800f446899e1e2cc


In [41]:
!python -m nltk.downloader stopwords
!python -m nltk.downloader punkt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [33]:
!docanalysis --help

usage: docanalysis [-h] [--run_pygetpapers] [--make_section] [-q QUERY]
                   [-k HITS] [--project_name PROJECT_NAME] [-d DICTIONARY]
                   [-o OUTPUT] [--make_ami_dict MAKE_AMI_DICT]
                   [--search_section [SEARCH_SECTION [SEARCH_SECTION ...]]]
                   [--entities [ENTITIES [ENTITIES ...]]]
                   [--spacy_model SPACY_MODEL] [--html HTML]
                   [--synonyms SYNONYMS] [-l LOGLEVEL] [-f LOGFILE]

Welcome to docanalysis version 0.1.0. -h or --help for help

optional arguments:
  -h, --help            show this help message and exit
  --run_pygetpapers     downloads papers from EuropePMC via pygetpapers
  --make_section        makes sections
  -q QUERY, --query QUERY
                        provide query to pygetpapers
  -k HITS, --hits HITS  specify number of papers to download from pygetpapers
  --project_name PROJECT_NAME
                        provide CProject directory name
  -d DICTIONARY, --dictionary DICTI

In [42]:
!docanalysis --make_section --project_name /content/software_related_papers --dictionary /content/software_related_papers/software_dictionary --output software.csv

[1;30mINFO:[0m dict_keys: dict_keys(['abstract', 'acknowledge', 'affiliation', 'author', 'conclusion', 'discussion', 'ethics', 'fig_caption', 'front', 'introduction', 'jrnl_title', 'keyword', 'method', 'octree', 'pdfimage', 'pub_date', 'publisher', 'reference', 'results_discuss', 'search_results', 'sections', 'svg', 'table', 'title'])
[1;30mINFO:[0m wrote XML sections for /content/software_related_papers/PMC4535771/fulltext.xml /content/software_related_papers/PMC4535771/sections
[1;30mINFO:[0m wrote XML sections for /content/software_related_papers/PMC3430699/fulltext.xml /content/software_related_papers/PMC3430699/sections
[1;30mINFO:[0m wrote XML sections for /content/software_related_papers/PMC3855388/fulltext.xml /content/software_related_papers/PMC3855388/sections
[1;30mINFO:[0m wrote XML sections for /content/software_related_papers/PMC3262844/fulltext.xml /content/software_related_papers/PMC3262844/sections
[1;30mINFO:[0m wrote XML sections for /content/software_rel