## Initialization

In [30]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import spacy
import logging
import re

from lxml.builder import E

from chemdataextractor import Document
from chemdataextractor.model import Compound, BaseModel, StringType, ListType, ModelType
from chemdataextractor.doc import Paragraph, Heading
from chemdataextractor.utils import first
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Not
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.actions import join, merge, strip_stop
from chemdataextractor.parse.common import hyphen

In [28]:
E?

Parse elements and actions imported from `chemdataextractor.parse`:

* `R(pattern)`: match token text with regular expression
* `I(match)`: case-insensitive match token text
* `W(match)`: match token text exactly
* `T(match)`: match tag exactly
* `Optional(expr)`: match the given expression zero or one times (i.e. make it optional)
* `merge(tokens, start, result)`: join tokens into a single string with no spaces
* `ZeroOrMore(expr)`: optional repetition of zero or more of the given expression
* `OneOrMore(expr)`: repetition of one or more of the given expression
* `Not(expr)`: check ahead to disallow a match with the given parse expression

In [4]:
nlp = spacy.load('en_core_web_sm') # Load the spaCy English model 'en_core_web_sm' (used below to inspect tokens of the test paragraph)

## Importing test synthesis paragraph and sentence

In [5]:
# Test paragraph: solar-cell fabrication procedure from a Nature Materials article
ptext = u"A dense blocking layer of TiO2 (bl-TiO2, ∼70 nm in thickness) was deposited onto a F-doped SnO2 (FTO, Pilkington, TEC8) substrate by spray pyrolysis, using a 20 mM titanium diisopropoxide bis(acetylacetonate) solution (Aldrich) at 450 °C to prevent direct contact between the FTO and the hole-conducting layer. A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components. CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively. The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a vacuum oven for 24 h. The prepared MAI and MABr powders, PbI2 (Aldrich) and PbBr2 (Aldrich) for 0.8 M MAPb(I1 − xBrx)3 (x  =  0.1–0.15) solution were stirred in a mixture of GBL and DMSO (7:3 v/v) at 60 °C for 12 h. The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively. During the second spin-coating step, the substrate (around 1 cm × 1 cm) was treated with toluene drop-casting. A detailed time-rotation profile for the spin-coating is represented in Supplementary Fig. 1c. The substrate was dried on a hot plate at 100 °C for 10 min. 
A solution of poly(triarylamine) (15 mg, PTAA, EM Index, Mw  =  17,500 g mol−1) in toluene (1.5 ml) was mixed with 15 μl of a solution of lithium bistrifluoromethanesulphonimidate (170 mg) in acetonitrile (1 ml) and 7.5 μl 4-tert-butylpyridine and spin-coated on the MAPb(I1 − xBrx)3 (x  =  0.1–0.15)/mp-TiO2/bl-TiO2/FTO substrate at 3,000 r.p.m for 30 s. Finally, a Au counterelectrode was deposited by thermal evaporation. The active area of this electrode was fixed at 0.16 cm2."

In [6]:
# Wrap the Nature Materials paragraph in a ChemDataExtractor Document
# (a heading element followed by the paragraph text)
p = Document(Heading(u'Solar cell fabrication'), Paragraph(ptext))

In [8]:
# Sentence version: only the spin-coating sentence from the test paragraph,
# used as a minimal input for the spin-speed parser
s = Document(u'The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.')

In [10]:
p

In [11]:
s

### Outlining Spincoat Parameter Extraction

    {'Synthesis' : {
                'spincoat' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    },
                'anneal' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    'temperature' : {
                        'value' : (number),
                        'units' : 'K',
                        },
                etc...
    }

Structuring my code to fit the format:

    create 'spincoat' class to parse spin-coating speeds
    spincoat: (value) (units)
        create 'time' subclass in the spincoat class
        time: (value) (units)
    create 'anneal' class to parse annealing conditions
        create 'time' subclass in the anneal class
        time: (value) (units)
        create 'temperature' subclass in the anneal class
        temperature: (value) (units)

In [12]:
# Data model for a spin-coating speed (the grammar and parser are defined in a later cell)
class SpinSpd(BaseModel):
    """Spin-coating speed record: a numeric value and its units, both kept as strings."""
    value = StringType()
    units = StringType()
    
# Attach the new property to Compound as a list of SpinSpd records under the name `spin_spd`
Compound.spin_spd = ListType(ModelType(SpinSpd))

# Extracting units (may not be strictly next to the value)

# Unit spellings recognised by extract_units, tried in this order for each node.
# Raw strings are required: in a normal string literal `\b` is a backspace
# character rather than a regex escape.
_UNIT_PATTERNS = (
    (re.compile(r'^r\.?p\.?m\.?$'), 'rpm'),   # rpm, r.p.m, r.p.m.
    (re.compile(r'^r\.?c\.?f\.?$'), 'rcf'),   # rcf, r.c.f, r.c.f.
    (re.compile(r'^[x×]?\s?g$'), 'g'),        # g, xg, ×g, x g
)


def extract_units(tokens, start, result):
    """Return a normalised ``units`` element for the first unit found in ``result``.

    Written with the signature of a ChemDataExtractor parse action
    (``tokens, start, result``); ``tokens`` and ``start`` are not used.
    NOTE(review): this function is not attached to any parse element in the
    code visible here.

    Every element in ``result`` is walked with ``.iter()`` (the element itself
    and its descendants, in document order). The text of each node is tested
    against the known unit spellings, and the first hit wins.

    :param tokens: Unused.
    :param start: Unused.
    :param result: Iterable of XML elements produced by the parser.
    :returns: ``[E('units', <label>)]`` with label ``'rpm'``, ``'rcf'`` or
        ``'g'``, or ``[]`` when no node text matches.
    """
    for e in result:
        for child in e.iter():
            text = child.text
            # Container nodes have no text of their own (None); skip them.
            if not text:
                continue
            for pattern, label in _UNIT_PATTERNS:
                if pattern.match(text):
                    # Tags units as units (e.g. <units>rpm</units>)
                    return [E('units', label)]
    return []

In [31]:
# Adding GBL to the solvents list: accept "GBL" (any case) or "γ-butyrolactone"
# in addition to ChemDataExtractor's chemical_name element
gbl_solvent = (I('GBL') | R('γ-[Bb]utyrolactone') | chemical_name('solvent'))
solvent = (gbl_solvent | chemical_name)('solvent').add_action(join)

# Defining formats for value and units.
# The patterns are raw strings so that `\.`, `\s` and `\d` reach the regex engine
# unchanged (in a normal string literal `\b` would be a backspace character).
# units: rpm / r.p.m(.) | rcf / r.c.f(.) | g / xg / ×g, optionally followed by a
# closing bracket that is matched but hidden from the result.
units = (R(r'^r\.?p\.?m\.?$') | R(r'^r\.?c\.?f\.?$') | R(r'^[x×]?\s?g$'))(u'units') + Optional(W(')')).hide()
# value: digits with any number of comma-separated groups (e.g. 20, 5,000, 1,000,000)
value = R(r'^\d+(,\d+)*$')(u'value')
# A spin speed is a value token immediately followed by a units token
spinspd = (value + units)(u'spinspd')

class SpinSpdParser(BaseParser):
    """Parser that converts each ``spinspd`` grammar match into a Compound record."""

    # Grammar whose matches are handed to interpret()
    root = spinspd

    def interpret(self, result, start, end):
        """Build a Compound holding one SpinSpd from a ``spinspd`` match.

        :param result: XML element for the match; its ``value`` and ``units``
            children are read via XPath.
        :param start: Unused.
        :param end: Unused.
        :returns: Generator yielding a single Compound whose ``spin_spd`` list
            contains one SpinSpd. ``first`` is applied to each XPath result, so
            only the first text node of each child is used.
        """
        # SpinSpd declares only `value` and `units`, and the spinspd grammar has
        # no <solvent> child, so no solvent is passed here.
        spin_speed = SpinSpd(
            value=first(result.xpath('./value/text()')),
            units=first(result.xpath('./units/text()')),
        )
        yield Compound(spin_spd=[spin_speed])

In [15]:
# Replace the parser list on the Paragraph class with only the spin-speed parser.
# This is a class-level assignment, so it affects every Paragraph, not just one document.
Paragraph.parsers = [SpinSpdParser()]

In [16]:
# Build a document from the spin-coating test sentence (same text as `s`)
d = Document(u'The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.')

# Show the extracted records in serialized form
d.records.serialize()

[{'spin_coat_spd': [{'value': '1,000'}]},
 {'spin_coat_spd': [{'value': '5,000'}]},
 {'spin_coat_spd': [{'value': '10'}]},
 {'spin_coat_spd': [{'value': '20'}]}]

In [9]:
# Run the spaCy pipeline over the whole test paragraph
sp_nm = nlp(ptext)

# One line per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, shape, and the is_alpha / is_stop flags, separated by spaces
for token in sp_nm:
    print(*(getattr(token, attr) for attr in (
        'text', 'lemma_', 'pos_', 'tag_', 'dep_',
        'shape_', 'is_alpha', 'is_stop',
    )))

A a DET DT det X True False
dense dense ADJ JJ amod xxxx True False
blocking block VERB VBG amod xxxx True False
layer layer NOUN NN nsubjpass xxxx True False
of of ADP IN prep xx True True
TiO2 tio2 PROPN NNP pobj XxXd False False
( ( PUNCT -LRB- punct ( False False
bl bl NUM CD compound xx True False
- - PUNCT : punct - False False
TiO2 tio2 PROPN NNP punct XxXd False False
, , PUNCT , punct , False False
∼70 ∼70 VERB VB det ∼dd False False
nm nm NUM CD appos xx True False
in in ADP IN prep xx True True
thickness thickness NOUN NN pobj xxxx True False
) ) PUNCT -RRB- punct ) False False
was be VERB VBD auxpass xxx True True
deposited deposit VERB VBN ROOT xxxx True False
onto onto ADP IN prep xxxx True True
a a DET DT det x True True
F f PROPN NNP npadvmod X True False
- - PUNCT HYPH punct - False False
doped dope VERB VBN amod xxxx True False
SnO2 sno2 NOUN NN pobj XxXd False False
( ( PUNCT -LRB- punct ( False False
FTO fto PROPN NNP appos XXX True False
, , PUNCT , punct , False F

plate plate NOUN NN pobj xxxx True False
at at ADP IN prep xx True True
100 100 NUM CD nummod ddd False False
° ° NOUN NN compound ° False False
C c PROPN NNP pobj X True False
for for ADP IN prep xxx True True
10 10 NUM CD nummod dd False False
min min NOUN NN pobj xxx True False
. . PUNCT . punct . False False
A a DET DT det X True False
solution solution NOUN NN nsubj xxxx True False
of of ADP IN prep xx True True
poly(triarylamine poly(triarylamine NOUN NN pobj xxxx(xxxx False False
) ) PUNCT -RRB- punct ) False False
( ( PUNCT -LRB- punct ( False False
15 15 NUM CD nummod dd False False
mg mg NOUN NN appos xx True False
, , PUNCT , punct , False False
PTAA ptaa PROPN NNP conj XXXX True False
, , PUNCT , punct , False False
EM em PROPN NNP compound XX True False
Index index PROPN NNP conj Xxxxx True False
, , PUNCT , punct , False False
Mw mw PROPN NNP conj Xx True False
    SPACE     False False
= = SYM SYM punct = False False
    SPACE     False False
17,500 17,500 NUM CD nummod 