Skip to content

Commit

Permalink
answering research question
Browse files Browse the repository at this point in the history
  • Loading branch information
notconfusing committed Oct 22, 2014
1 parent 90650af commit e140ce9
Show file tree
Hide file tree
Showing 4 changed files with 532,987 additions and 0 deletions.
39 changes: 39 additions & 0 deletions pmc_journal_extract.py
@@ -0,0 +1,39 @@
from mw import xml_dump
import datetime
import mwparserfromhell
import json
import re

# Wikipedia XML dump file(s) to scan; handed to xml_dump.map further down.
files = ["enwiki-20140304-pages-articles-multistream.xml"]

# Wall-clock start, used for the elapsed-time report at the end of the run.
stime = datetime.datetime.now()

print('starting at', stime)

# Titles of the pages worth scanning. page_info tests membership once per page
# in the dump, so the titles are held in a set (constant-time lookup) rather
# than in whatever container the JSON file happens to use. The context manager
# closes the file handle, and the encoding is pinned because JSON text is
# UTF-8 regardless of the platform's locale.
with open('combined_page_titles.json', 'r', encoding='utf-8') as titles_file:
    valid_page_titles = set(json.load(titles_file))

def page_info(dump, path):
    """Yield the ``pmc=`` parameter values found on each page of a dump.

    Written as the per-file worker that is handed to ``xml_dump.map``.

    Args:
        dump: Iterable of page objects exposing ``namespace``, ``title`` and
            ``id``; iterating a page yields revision objects with ``text``.
        path: Path of the dump file being processed. Unused here, but part
            of the signature the mapper calls.

    Yields:
        ``(title, page_id, {'journal_pmcs': values})`` for every page in the
        dump. ``values`` is an empty list for pages outside namespace 0,
        pages whose title is not in ``valid_page_titles``, pages without any
        revision, and revisions without text, so callers must filter.
    """
    # Compiled once per dump instead of being looked up for every page.
    pmc_pattern = re.compile(r"pmc\s*\=\s*(.*?)[\|\}]", flags=re.IGNORECASE)

    for page in dump:
        journal_pmcs = []
        if page.namespace == 0 and page.title in valid_page_titles:
            revisions = list(page)
            # The first revision listed in the dump is the one inspected.
            # NOTE(review): presumably the only revision in a pages-articles
            # dump -- confirm before pointing this at a full-history dump.
            first_revision = revisions[0] if revisions else None
            text = getattr(first_revision, 'text', None)
            # A missing revision or missing text simply means "no matches"
            # instead of aborting the whole run with an exception.
            if text:
                journal_pmcs = pmc_pattern.findall(text)
        yield (page.title, page.id, {'journal_pmcs': journal_pmcs})


# Run page_info over every dump file and write one "title<TAB>pmc" line per
# extracted value. The context manager flushes and closes the output even if
# the dump processing raises part-way through, and the explicit encoding keeps
# non-ASCII page titles writable whatever the platform's locale is.
with open('journal_pmc_list.txt', 'w', encoding='utf-8') as outfile:
    for page_title, page_id, doi_dict in xml_dump.map(files, page_info):
        journal_pmcs = doi_dict['journal_pmcs']
        # Pages with no matches contribute nothing to the output.
        if not journal_pmcs:
            continue
        print(' pageid', page_id, ' page title ', page_title, ' doi_dict', doi_dict)
        outfile.writelines(
            '{}\t{}\n'.format(page_title, pmc) for pmc in journal_pmcs
        )

# Report the finish time and the total elapsed wall-clock time.
etime = datetime.datetime.now()
print(etime)
print('took ', (etime - stime))

0 comments on commit e140ce9

Please sign in to comment.