## Initialization

In [17]:
import spacy

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

## Playing with two different synthetic paragraphs

`text` is from the Angewandte paper out of Stanford.  The synthesis paragraph is not very straightforward (its assignment is currently commented out in the cell below).

`text2` is taken from the Methods section of the Nature Nanotech paper out of the University of Toronto.  The synthesis paragraph is much more straightforward in this case.

`text3` is taken from the Methods section of the Nature Materials paper from Korea.  *(TODO: complete this description.)*

In [18]:
# Load spaCy's small English model package ('en_core_web_sm'); `nlp` is used below to process text.
nlp = spacy.load('en_core_web_sm')

In [19]:
# Angewandte
#text = 'Layered perovskites can be structurally derived from the 3D analogue by slicing along specific crystallographic planes.4 The interlayer separation and thickness of the inorganic layers can be controlled through the choice of organic cations.5 The inorganic layers of most layered perovskites comprise a single sheet (n=1) of corner‐sharing metal–halide octahedra sandwiching layers of organic cations.4 These 2D materials do not have electronic properties typically associated with good solar‐cell absorbers. Along with larger bandgaps compared to the 3D analogue (n=∞), the spatial confinement of the 2D structure and dielectric mismatch between organic and inorganic layers lead to strongly bound excitons with low mobility.6 Such tightly bound excitons are difficult to dissociate into free carriers at room temperature and the localized charge carriers are unlikely to reach the electron/hole selective contacts in a typical solar‐cell geometry. To access the more favorable electronic properties of the 3D structure, we sought an intermediate structure between the n=1 and n=∞ materials. We synthesized the n=3 member of the series (PEA)2(MA)n−1[PbnI3n+1] (n=number of Pb–I sheets in each inorganic layer), by combining (PEA)I, (MA)I, and PbI2 in a 2:2:3 stoichiometric ratio in a solvent mixture of nitromethane/acetone. Slow solvent evaporation afforded dark red crystals of (PEA)2(MA)2[Pb3I10] (1), the first crystallographically characterized n=3 lead perovskite (Figure 1).'

In [20]:
# Nature Nanotechnology: section heading plus Methods paragraph as one plain string.
# This is the text that is passed to spaCy (`nlp(text2)`) further down.
text2 = "Perovskite film fabrication Different dimensionality perovskite [(PEA)2(CH3NH3)n−1PbnI3n+1] solutions was prepared by dissolving stoichiometric quantities of lead iodide (PbI2), methylammonium idodide (MAI) and PEAI in a dimethyl sulfoxide (DMSO)/ɣ-butyrolactone (1:1 volume ratio) mixture at 70 °C for 1 h with continuous stirring. The resulting solution was then filtered through a polytetrafluoroethylene (PTFE) filter (0.2 µm). The resulting solution was spin-coated onto the substrate via a two-step process at 1,000 r.p.m. and 5,000 r.p.m. for 10 s and 60 s, respectively. During the second spin step, 100 µl of chlorobenzene were poured onto the substrate. The resulting films were then annealed at 70 °C for 10 min to improve crystallization."

### Nature Nanotech paragraph: Playing with ChemDataExtractor

In [63]:
# Format the Nature Nanotech passage for ChemDataExtractor: the section title and
# the body text are passed as separate Heading and Paragraph elements of one Document.
nat = Document(Heading(u'Perovskite film fabrication'), Paragraph(u'Different dimensionality perovskite [(PEA)2(CH3NH3)n−1PbnI3n+1] solutions was prepared by dissolving stoichiometric quantities of lead iodide (PbI2), methylammonium idodide (MAI) and PEAI in a dimethyl sulfoxide (DMSO)/ɣ-butyrolactone (1:1 volume ratio) mixture at 70 °C for 1 h with continuous stirring. The resulting solution was then filtered through a polytetrafluoroethylene (PTFE) filter (0.2 µm). The resulting solution was spin-coated onto the substrate via a two-step process at 1,000 r.p.m. and 5,000 r.p.m. for 10 s and 60 s, respectively. During the second spin step, 100 µl of chlorobenzene were poured onto the substrate. The resulting films were then annealed at 70 °C for 10 min to improve crystallization.'))

In [64]:
# Serialize the records extracted from the document.
# NOTE(review): the saved output below lists `spin_coat_steps`, a field that is
# only attached to Compound (and parsed) by cells further down, so this output
# was produced after those cells had run -- a fresh top-to-bottom run will
# presumably not show it here.
nat.records.serialize()

[{'names': ['Perovskite'],
  'spin_coat_steps': [{'value': '5,000', 'units': 'r.p.m.'},
   {'value': '1,000', 'units': 'r.p.m.'}]}]

In [24]:
# Show ChemDataExtractor's (token, POS tag) pairs for the first paragraph of the document.
nat.paragraphs[0].pos_tagged_tokens

[[('Different', 'JJ'),
  ('dimensionality', 'NN'),
  ('perovskite', 'NN'),
  ('[(PEA)2(CH3NH3)n−1PbnI3n+1]', 'NN'),
  ('solutions', 'NNS'),
  ('was', 'VBD'),
  ('prepared', 'VBN'),
  ('by', 'IN'),
  ('dissolving', 'VBG'),
  ('stoichiometric', 'JJ'),
  ('quantities', 'NNS'),
  ('of', 'IN'),
  ('lead', 'NN'),
  ('iodide', 'NN'),
  ('(', '-LRB-'),
  ('PbI2', 'NN'),
  (')', '-RRB-'),
  (',', ','),
  ('methylammonium', 'NN'),
  ('idodide', 'NN'),
  ('(', '-LRB-'),
  ('MAI', 'NN'),
  (')', '-RRB-'),
  ('and', 'CC'),
  ('PEAI', 'NN'),
  ('in', 'IN'),
  ('a', 'DT'),
  ('dimethyl', 'NN'),
  ('sulfoxide', 'NN'),
  ('(', '-LRB-'),
  ('DMSO', 'NN'),
  (')', '-RRB-'),
  ('/', ':'),
  ('ɣ-butyrolactone', 'NN'),
  ('(', '-LRB-'),
  ('1', 'CD'),
  (':', ':'),
  ('1', 'CD'),
  ('volume', 'NN'),
  ('ratio', 'NN'),
  (')', '-RRB-'),
  ('mixture', 'NN'),
  ('at', 'IN'),
  ('70', 'CD'),
  ('°', 'NN'),
  ('C', 'NN'),
  ('for', 'IN'),
  ('1', 'CD'),
  ('h', 'NN'),
  ('with', 'IN'),
  ('continuous', 'JJ'),
  

In [25]:
# Run the spaCy pipeline over the raw Nature Nanotech string.
sp_nat = nlp(text2)

# Token attributes to print, in column order: surface text, lemma, coarse POS,
# fine-grained tag, dependency label, shape, alphabetic flag, stop-word flag.
TOKEN_FIELDS = ('text', 'lemma_', 'pos_', 'tag_', 'dep_',
                'shape_', 'is_alpha', 'is_stop')

# One space-separated line per token.
for token in sp_nat:
    print(*(getattr(token, field) for field in TOKEN_FIELDS))

Perovskite perovskite ADJ JJ amod Xxxxx True False
film film NOUN NN compound xxxx True False
fabrication fabrication NOUN NN compound xxxx True False
Different different ADJ JJ amod Xxxxx True False
dimensionality dimensionality NOUN NN compound xxxx True False
perovskite perovskite NOUN NN ROOT xxxx True False
[ [ PUNCT -LRB- punct [ False False
( ( PUNCT -LRB- punct ( False False
PEA)2(CH3NH3)n−1PbnI3n+1 pea)2(ch3nh3)n−1pbni3n+1 NOUN NNS nmod XXX)d(XXdXXd)x−dXxxXdx+d False False
] ] PUNCT -RRB- punct ] False False
solutions solution NOUN NNS nsubjpass xxxx True False
was be VERB VBD auxpass xxx True True
prepared prepare VERB VBN ROOT xxxx True False
by by ADP IN agent xx True True
dissolving dissolve VERB VBG pcomp xxxx True False
stoichiometric stoichiometric ADJ JJ amod xxxx True False
quantities quantity NOUN NNS dobj xxxx True False
of of ADP IN prep xx True True
lead lead NOUN NN pobj xxxx True False
iodide iodide ADV RB advmod xxxx True False
( ( PUNCT -LRB- punct ( False Fal

In [26]:
#sp_ang = nlp(text)

#for token in sp_ang:
#    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#          token.shape_, token.is_alpha, token.is_stop)

In [27]:
#angew = Document(text)

In [28]:
#angew

## Defining Custom Properties in CDE

I'm using mostly code from the CDE notebook to define a new property for spin-coating step(s).

In [35]:
# Display the Nature Nanotech Document object built earlier.
nat

In [30]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class SpinCoat(BaseModel):
    """One spin-coating speed: a value and its units, both stored as strings."""
    value = StringType()  # speed as written in the text, e.g. '5,000'
    units = StringType()  # unit as written in the text, e.g. 'r.p.m.'
    
# Attach a list-of-SpinCoat field to the existing Compound model so that
# extracted records can carry `spin_coat_steps`.
Compound.spin_coat_steps = ListType(ModelType(SpinCoat))

In [80]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

#from chemdataextractor.model.cem import chemical_name

In [91]:
# Scratch check of how the escaped r.p.m. pattern is stored as a string.
# NOTE(review): the '^' before 'p' and before 'm' only matches at the start of
# the string, so as a regex this pattern cannot match 'r.p.m.'.
u'^r\.?^p\.?^m\.?$'

'^r\\.?^p\\.?^m\\.?$'

In [110]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.cem import chemical_name

# Grammar for a spin-coating speed: a number token followed by a unit token.

# Tagged chemical-name element. It is defined here but is not part of
# `spinspd` below.
solvent = chemical_name('solvent')

# Unit token: rpm / r.p.m., rcf / r.c.f., or g optionally prefixed by x or ×.
# The rpm and rcf patterns are raw strings so the backslashes reach the regex
# engine untouched. The earlier rpm pattern was a plain literal containing
# '\b', which Python converts to a backspace character; and even as a regex
# word boundary, a trailing '\b' cannot succeed after the final '.' of
# "r.p.m.". Either way the rpm alternative could never match, so the
# boundaries are dropped ('^' and '$' already anchor the whole token).
units = (
    R(r'^r(\.)?p(\.)?m(\.)?$')
    | R(r'^r(\.)?c(\.)?f(\.)?$')
    | R(u'^([x×]?)( )?g$')
)(u'units')
# Earlier attempts, kept for reference:
#Optional(W('/')).hide() + W(u'^r\.?p\.?m\.?')
#R('^(re)?crystalli[sz](ation|ed)$', re.I)

# Number token: digits with at most one comma-separated group, e.g. 60 or 5,000.
value = R(r'^\d+(,\d+)?$')(u'value')

# A speed is a value immediately followed by its units.
# NOTE: after editing this grammar, re-run the parser class cell and the
# `Paragraph.parsers = [...]` cell; an already-created parser instance keeps
# the grammar it was defined with.
spinspd = (value + units)(u'spinspd')

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class SpinCoatParser(BaseParser):
    """Build a Compound carrying one spin-coat step from each ``spinspd`` match."""

    # Root grammar element for this parser.
    root = spinspd

    def interpret(self, result, start, end):
        """Yield a Compound holding a single SpinCoat built from ``result``.

        ``result`` is queried with XPath for its ``value`` and ``units``
        children. ``start`` and ``end`` are accepted but not used here.
        """
        # No ``solvent`` is passed any more: the ``spinspd`` grammar contains
        # no solvent element and SpinCoat declares no such field, so the
        # former ``solvent=...`` lookup had nothing to find and nowhere to be
        # stored. Add a field to SpinCoat and a solvent element to the grammar
        # before reintroducing it.
        step = SpinCoat(
            value=first(result.xpath('./value/text()')),
            units=first(result.xpath('./units/text()')),
        )
        yield Compound(spin_coat_steps=[step])

In [93]:
# Replace the parser list on the Paragraph class with a single SpinCoatParser instance.
# The instance comes from the class as defined when this cell runs, so re-run
# this cell after editing the SpinCoatParser cell.
Paragraph.parsers = [SpinCoatParser()]

In [94]:
# Serialize the document's records again, now that the custom spin-coat parser is installed.
nat.records.serialize()

[{'names': ['Perovskite'],
  'spin_coat_steps': [{'value': '5,000', 'units': 'r.p.m.'},
   {'value': '1,000', 'units': 'r.p.m.'}]}]

In [113]:
# Single-sentence check of the spin-coat parser. Compared with the paragraph
# used earlier, the first speed is wrapped in parentheses here.
spin_sentence = u' The resulting solution was spin-coated onto the substrate via a two-step process at (1,000 r.p.m.) and 5,000 r.p.m. for 10 s and 60 s, respectively.'
d = Document(spin_sentence)

# Last expression of the cell: the serialized records are shown as the output.
d.records.serialize()

[{'spin_coat_steps': [{'value': '1,000', 'units': 'r.p.m.'}]},
 {'spin_coat_steps': [{'value': '5,000', 'units': 'r.p.m.'}]}]