In [1]:
import glob
import nltk.data
import numpy as np
import xml.etree.ElementTree as ET

# Sentence Analysis

In [2]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [3]:
all_files = glob.glob("../data/pubmed/articles.0-9A-B.txt//**/*.txt", recursive=True)

In [4]:
np.random.seed(1234)
COUNT = 100
files = list(np.random.choice(all_files, size=100, replace=False))

In [5]:
lines = []
for path in files:
    with open(path) as f:
        lines += f.readlines()

In [6]:
sentences = [s for line in lines for s in tokenizer.tokenize(line.strip())]

In [7]:
questions = [s for s in sentences if s[-1] == "?"]

In [8]:
questions

['Mishra AK Skinner HB Davidson JA  Stem loosening and thigh pain in THA: are they related to prosthesis stiffness?',
 'Kulkarni M Wylde V Aspros D Learmonth ID  Early clinical experience with a metaphyseal loading implant: Why have a stem?',
 'How does compression affect the alignment accuracy with respect to the base network alignment method?',
 'How far is our compression method from an optimal compression that produces the compressed network with the minimum number of nodes?',
 'When is it a good idea to do the alignment in compressed domain taking into account the overhead of compression and refinement phases?',
 'What is the right amount of compression?',
 'That is, when does compression minimize the running time of our overall framework?',
 'When should we compress?',
 'How much should we compress?',
 'In this section, we provide a guideline for selecting a value for compression level c that results in the minimum expected running time, among other possible values, for our frame

In [9]:
questions_and_context = []

for i, sentence in enumerate(sentences):
    if sentence[-1] == "?":
        questions_and_context.append(sentences[i-1:i+2])

In [10]:
for sentence_before, question, sentence_after in questions_and_context:
    print(sentence_before)
    print(question)
    print(sentence_after)
    print()

Kim YH Kim JS Cho SH  Primary total hip arthroplasty with a cementless porous-coated anatomic total hip prosthesis: 10- to 12-year results of prospective and consecutive series J Arthroplasty 1999 14 538 548 10475551 10.1016/S0883-5403(99)90074-8
Mishra AK Skinner HB Davidson JA  Stem loosening and thigh pain in THA: are they related to prosthesis stiffness?
Orthopedics 1997 20 58 61 9122054

A prospective, randomized, controlled trial J Bone Joint Surg Am 2005 4 701 710 15805196 10.2106/JBJS.D.02645
Kulkarni M Wylde V Aspros D Learmonth ID  Early clinical experience with a metaphyseal loading implant: Why have a stem?
Hip Int 2006 16 S3 8

1.
How does compression affect the alignment accuracy with respect to the base network alignment method?
2.

2.
How far is our compression method from an optimal compression that produces the compressed network with the minimum number of nodes?
3.

3.
When is it a good idea to do the alignment in compressed domain taking into account the overhead of

# Title Analysis

In [11]:
all_files = glob.glob("../data/pubmed/articles.0-9A-B.xml/**/*.nxml", recursive=True)

In [12]:
np.random.seed(1234)
COUNT = 100
files = list(np.random.choice(all_files, size=100, replace=False))

In [13]:
titles = []

for path in all_files:
    try:
        root = ET.parse(path).getroot()
        titles.append(root.find("front/article-meta/title-group/article-title").text)
    except:
        print("Something went wrong:", path)

Something went wrong: ../data/pubmed/articles.0-9A-B.xml/Acta_Crystallogr_Sect_E_Struct_Rep_Online/PMC2960286.nxml


In [18]:
question_titles = [title for title in titles if title is not None and title[-1] == "?"]

In [20]:
(len(question_titles), len(titles))

(93, 9886)

In [19]:
question_titles

['Is Linear Wound Closure Acceptable Option for Congenital Midline Cervical Cleft Excision in Neonates?',
 'Incidentally Diagnosed Multiple Vascular Lesions of the Spleen: Littoral Cell Angioma or Hemangioma?',
 'Maffucci’s Syndrome or a Variant?',
 'Right Hemi-Diaphragmatic Rupture: An Injury Missed or Masked?',
 'Probiotic soy milk and anthropometric measures: Is probiotic soy milk beyond soy milk?',
 'Can doubling the maintenance dose of clopidogrel prevent from early stent thrombosis after the primary percutaneous coronary intervention?',
 'Can Timi Risk Score Predict Angiographic Involvement in Patients with St-Elevation Myocardial Infarction?',
 'What Every Cardiologist Should Know about H1N1?',
 'Nifedipine, Captopril or Sublingual Nitroglycerin, Which can Reduce Blood Pressure the Most?',
 'Is there any Relationship Between C-Reactive Protein Level and Complex Coronary Plaques in Patients with Unstable Angina?',
 'Can the infusion of isotonic fluids or vasopressors prevent hemo