In [47]:
import chemdataextractor
from chemdataextractor import Document
from chemdataextractor.reader import HtmlReader
import numpy as np

### Read and open paper in HTML format

In [48]:
# Open in binary mode so the raw bytes are handed to Document.from_file in the next cell.
# NOTE(review): this handle is not closed anywhere in the cells shown.
f = open('journal_articles/Paper5.html', 'rb')

In [49]:
# Build a Document from the opened file, passing HtmlReader as the only reader.
doc = Document.from_file(f, readers=[HtmlReader()])

##### Assume we already know that `doc.elements[88]` is the synthesis paragraph of interest

In [50]:
# Index 88 is hard-coded: it was identified by hand for this particular paper.
para = doc.elements[88]
para

##### Store the sentences in two separate lists:
`sen_yes_arr` contains the sentences from the synthesis paragraph
`sen_no_arr` contains all other sentences in the HTML file that are not in the synthesis paragraph

In [51]:
# Indices of the manually tagged synthesis paragraph(s); every other element
# of the document is treated as a non-synthesis example.
para_yes = [88]
elem_all = np.arange(len(doc))
para_no = np.delete(elem_all, para_yes)

# Only Paragraph elements contribute sentences; other element types are skipped.
# isinstance is the idiomatic type check.
sen_no_arr = [
    sentence
    for i in para_no
    if isinstance(doc.elements[i], chemdataextractor.doc.text.Paragraph)
    for sentence in doc.elements[i]
]

sen_yes_arr = [
    sentence
    for i in para_yes
    if isinstance(doc.elements[i], chemdataextractor.doc.text.Paragraph)
    for sentence in doc.elements[i]
]

In [52]:
sen_yes_arr

[Sentence('Solar cells were fabricated on precleaned ITO-coated glass substrates with a sheet resistance of 20 Ω sq−1.', 0, 107),
 Sentence('First, a thin ZnO nanoparticle layer was spin coated onto the substrate at 3,000 r.p.m. for 30\xa0s.', 108, 205),
 Sentence('The procedure was repeated three times to obtain a continuous smooth film.', 206, 280),
 Sentence('A PbI2 solution (dissolved in N,N-dimethylformamide at a concentration of 460\xa0mg ml−1) was then spin coated on top of the ZnO layer at 3,000 r.p.m. for 15\xa0s.', 281, 438),
 Sentence('After drying for several minutes in air, the substrate was dipped into a solution of CH3NH3I in 2-propanol (10\xa0mg ml−1) for 40\xa0s, then dried under a flow of clean air.', 439, 606),
 Sentence('Subsequently, the spiro-OMeTAD-based hole-transfer layer (80\xa0mg spiro-OMeTAD, 28.5\xa0µl 4-tert-butylpyridine and 17.5\xa0µl lithium-bis(trifluoromethanesulfonyl)imide (Li-TFSI) solution (520\xa0mg\xa0Li-TFSI in 1\xa0ml acetonitrile) all dissolve

In [53]:
# sen_no_arr

Wrap the steps above in a reusable function.

In [54]:
def extract_sentences(paper_path, para_yes):
    """Split the sentences of an HTML paper into synthesis and non-synthesis lists.

    Parameters
    ----------
    paper_path : str
        Path to the paper's HTML file.
    para_yes : list of int
        Indices into ``doc.elements`` of the paragraphs manually identified
        as containing synthesis information.

    Returns
    -------
    sen_yes_arr : list
        Sentences from the elements listed in ``para_yes``, in the order given.
    sen_no_arr : list
        Sentences from every other element of the document, in document order.

    Notes
    -----
    Only elements that are ``Paragraph`` instances contribute sentences;
    any other element type is skipped in both lists.
    """
    # Use a context manager so the file handle is released once parsing is
    # done; the original left the file open on every call.
    with open(paper_path, 'rb') as f:
        doc = Document.from_file(f, readers=[HtmlReader()])

    paragraph_cls = chemdataextractor.doc.text.Paragraph

    def _sentences_at(indices):
        # Flatten the sentences of every Paragraph element found at `indices`.
        return [
            sentence
            for i in indices
            if isinstance(doc.elements[i], paragraph_cls)
            for sentence in doc.elements[i]
        ]

    # All element indices except the tagged ones.
    para_no = np.delete(np.arange(len(doc)), para_yes)

    sen_yes_arr = _sentences_at(para_yes)
    sen_no_arr = _sentences_at(para_no)
    return sen_yes_arr, sen_no_arr

### Repeat for all journal articles

This is a pretty simplistic approach to building testing/training data. Since we don't want to manually tag sentences in many, many papers, let's start with 5 papers that share the same HTML structure.

In [55]:
#0: doc.elements[109]
#1: doc.elements[117:120]
#2: doc.elements[112]
#3: doc.elements[117]
#4: doc.elements[122] and doc.elements[125]
#5: doc.elements[88]

manually identified/tagged synthesis paragraphs

In [56]:
# Element indices of the manually tagged synthesis paragraphs, one inner list per paper.
# test_p[0] belongs to Paper0; train_p[k] belongs to Paper(k+1), i.e. Papers 1-5 in order.
test_p = [[109]]
train_p = [[117, 118, 119], [112], [117], [122, 125], [88]]

In [57]:
# Paper number to load; train_p is 0-based while the paper files start at 1.
p = 1
paper_path = 'journal_articles/Paper' + str(p) + '.html'
sen_yes_arr, sen_no_arr = extract_sentences(paper_path, train_p[p - 1])

In [58]:
sen_yes_arr

[Sentence('We placed approximately 500\u2009mg of CH3NH3I and 100\u2009mg of PbCl2 into separate crucibles.', 0, 86),
 Sentence('The device substrates were placed in a substrate holder above the sources with the TiO2-coated FTO side facing down towards the sources.', 87, 223),
 Sentence('Once the pressure in the chamber was pumped down to below 10−5\u2009mbar, the two sources were heated slightly above their desired deposition temperatures for approximately 5\u2009min (that is, CH3NH3I was heated to about 120\u2009°C and PbCl2 was heated to about 325\u2009°C) to remove volatile impurities before depositing the materials onto the substrate.', 224, 562),
 Sentence('The substrate holder was rotated to ensure uniform coating throughout deposition, because the right-hand source predominantly coats the right-hand side of the substrate and similarly for the left.', 563, 759),
 Sentence('The substrate holder was water-cooled to approximately 21\u2009°C, though precise measurement of the substr

In [62]:
# sen_no_arr

In [68]:
# Paper number to load; train_p is 0-based while the paper files start at 1.
p = 3
paper_path = 'journal_articles/Paper' + str(p) + '.html'
sen_yes_arr, sen_no_arr = extract_sentences(paper_path, train_p[p - 1])
sen_yes_arr

[Sentence('Patterned transparent conducting oxide substrates were coated with a TiO2 compact layer by aerosol spray pyrolysis.', 0, 115),
 Sentence('A 350-nm-thick mesoporous TiO2 layer composed of 20-nm-sized particles was then deposited by spin coating.', 116, 222),
 Sentence('The mesoporous TiO2 films were infiltrated with PbI2 by spin-coating a PbI2 solution in DMF (462\u2009mg\u2009ml−1) that was kept at 70\u2009°C.', 223, 352),
 Sentence('After drying, the films were dipped in a solution of CH3NH3I in 2-propanol (10\u2009mg\u2009ml−1) for 20\u2009s and rinsed with 2-propanol.', 353, 477),
 Sentence('After drying, the HTM was deposited by spin-coating a solution of spiro-MeOTAD, 4-tert-butylpyridine, lithium bis(trifluoromethylsulphonyl)imide and tris(2-(1H-pyrazol-1-yl)-4-tert-butylpyridine)cobalt(iii) bis(trifluoromethylsulphonyl)imide in chlorobenzene.', 478, 737),
 Sentence('Gold (80\u2009nm) was thermally evaporated on top of the device to form the back contact.', 738, 822),


##### This is the "test" data, the original Nature paper we are all supposedly using

In [70]:
# Paper0 is the held-out test paper; its tagged paragraphs are in test_p[0].
paper_path = 'journal_articles/Paper' + str(0) + '.html'
sen_yes_arr, sen_no_arr = extract_sentences(paper_path, test_p[0])
sen_yes_arr

[Sentence('A dense blocking layer of TiO2 (bl-TiO2, ∼70 nm in thickness) was deposited onto a F-doped SnO2 (FTO, Pilkington, TEC8) substrate by spray pyrolysis, using a 20 mM titanium diisopropoxide bis(acetylacetonate) solution (Aldrich) at 450 °C to prevent direct contact between the FTO and the hole-conducting layer.', 0, 310),
 Sentence('A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components.', 311, 548),
 Sentence('CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively.', 549, 824),
 Sentence('The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in 