## Initialization

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import spacy
import logging
import re

from lxml.builder import E

from chemdataextractor import Document
from chemdataextractor.model import Compound, BaseModel, StringType, ListType, ModelType
from chemdataextractor.doc import Paragraph, Heading
from chemdataextractor.utils import first
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Not
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.actions import join, merge, strip_stop
from chemdataextractor.parse.common import hyphen, lbrct, dt, rbrct

`.parse`:

* `R(pattern)`: match token text with regular expression
* `I(match)`: case-insensitive match token text
* `W(match)`: match token text exactly
* `T(match)`: match tag exactly
* `Optional(expr)`: abstract class for combining and post-processing parsed tokens
* `merge(tokens, start, result)`: join tokens into a single string with no spaces
* `ZeroOrMore(expr)`: optional repetition of zero or more of the given expression
* `OneOrMore(expr)`: repetition of one or more of the given expression
* `Not(expr):` check ahead to disallow a match with the given parse expression

### Outlining Spincoat Parameter Extraction

    {'Synthesis' : {
                'spincoat' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    },
                'anneal' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    'temperature' : {
                        'value' : (number),
                        'units' : 'K',
                        },
                etc...
    }

Structuring my code to fit the format:

    create 'spincoat' class to parse spincoat spds
    spincoat: (value) (units)
        create 'time' subclass in the spincoat class
        time: (value) (units)
    create 'anneal' class to parse annealing conditions
        create 'time' subclass in the anneal class
        time: (value) (units)
        create 'temperature' subclass in the anneal class
        temperature: (value) (units)
        
Or, create output that feeds into the nested dictionary structure.  Your output doesn't have to be as structured.

    create 'spincoat' class for parsed spincoat parameters
    spincoat: (spdval) (spdunits) (timeval) (timeunits) (tempval) (tempunits)
    create 'anneal' class
    ... etc.

Class ideas: spincoat, anneal, precursors / solvents

### Defining new classes

1. Synthesis
    * Precursors / reactants
    * Solvents
    * Time
    * Temperature
2. Spin coating
    * Speed
    * Time
    * Temperature
3. Annealing
    * Time
    * Temperature

#### Data Structure for ir.py (CDE) vs my preferred implementation

IR data structure

    ir_spectra:
     solvent
     units
     \
      \
       pk
       pkunits
       strength
       bond
     \
      \
      pk
      pkunits
      strength
      bond
     ...etc.
     
My proposed data structure

    spin_coat:
     solvent
     units
     \
      \
       spinspd
       spdunits
       time
       timeunits
     \
      \
       spinspd
       spdunits
       time
       timeunits

In [2]:
# What is this doing?
log = logging.getLogger(__name__)

In [3]:
# Creating SpinStep and SpinCoat class with various properties: speed, time, temperature, and respective units.
class SpinStep(BaseModel):
    """
    Classification for each spin-coating step
    """
    spd = StringType()
    spdunits = StringType(contextual=True)
    #time = StringType()
    #timeunits = StringType()
    #temp = StringType()
    #tempunits = StringType()

class SpinCoat(BaseModel):
    """
    Class for full list of spin-coating steps for entire spin-coating process.
    """
    solvent = StringType(contextual=True)
    steps = ListType(ModelType(SpinStep))
    #spd = StringType()
    #spdunits = StringType()
    

### Associating Parameters with a Chemical

In [4]:
#Compound.synthesis = ListType(ModelType(Synthesis))
Compound.spin_coat = ListType(ModelType(SpinCoat))
#Compound.anneal = ListType(ModelType(Anneal))

### Codestuff

In [5]:
# Adding GBL to the solvents list
gbl = (I('GBL') | R('^γ-?[bB]?utyrolactone$'))
solvent = (gbl | chemical_name)('solvent').add_action(join)

### Parser classes for parsing each step

In [6]:
# Deliminator
delim = R('^[;:,\./]$').hide()

# Defining formats for spin-coating value and units
spdunits = Optional(R(u'^r(\.)?p(\.)?m(\.)?$') | R(u'^r(\.)?c(\.)?f(\.)?$') | R(u'^([x×]?)(\s?)?g$'))(u'spdunits').add_action(merge)
spd = (Optional(W('(')).hide() + R(u'^\d+(,\d+)[0][0]$')(u'spd') + Optional(W(')')).hide())

step = (spd + ZeroOrMore(spdunits))('step')
steps = (step + ZeroOrMore(ZeroOrMore(delim | W('and')).hide() + step))('steps')

spincoat = (steps + Optional(delim))

In [7]:
class SpinCoatParser(BaseParser):
    root = spincoat

    def interpret(self, result, start, end):
        c = Compound()
        s = SpinCoat(
            solvent=first(result.xpath('./solvent/text()'))
        )
        spdunits = first(result.xpath('./spdunits/text()'))
        for step in result.xpath('./steps/step'):
            spin_step = SpinSpd(
                spd=first(spd_result.xpath('./spd/text()')),
                spdunits=spdunits
            )
            s.steps.append(spin_step)
        c.spin_coat.append(s)
        yield c

### Parsers

In [8]:
Paragraph.parsers = [SpinCoatParser()]

## Testing Outputs

Recall `p` is the whole paragraph, `sy` is the synthesis sentence(s), `sp` is the spin-coat sentences, and `an` is the annealing sentence.

In [9]:
d = Document(u'The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.')

d.records.serialize()

KeyboardInterrupt: 

## Importing test synthesis paragraph and sentence

In [None]:
# Nature Materials
ptext = u"A dense blocking layer of TiO2 (bl-TiO2, ∼70 nm in thickness) was deposited onto a F-doped SnO2 (FTO, Pilkington, TEC8) substrate by spray pyrolysis, using a 20 mM titanium diisopropoxide bis(acetylacetonate) solution (Aldrich) at 450 °C to prevent direct contact between the FTO and the hole-conducting layer. A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components. CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively. The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a vacuum oven for 24 h. The prepared MAI and MABr powders, PbI2 (Aldrich) and PbBr2 (Aldrich) for 0.8 M MAPb(I1 − xBrx)3 (x  =  0.1–0.15) solution were stirred in a mixture of GBL and DMSO (7:3 v/v) at 60 °C for 12 h. The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively. During the second spin-coating step, the substrate (around 1 cm × 1 cm) was treated with toluene drop-casting. A detailed time-rotation profile for the spin-coating is represented in Supplementary Fig. 1c. The substrate was dried on a hot plate at 100 °C for 10 min. A solution of poly(triarylamine) (15 mg, PTAA, EM Index, Mw  =  17,500 g mol−1) in toluene (1.5 ml) was mixed with 15 μl of a solution of lithium bistrifluoromethanesulphonimidate (170 mg) in acetonitrile (1 ml) and 7.5 μl 4-tert-butylpyridine and spin-coated on the MAPb(I1 − xBrx)3 (x  =  0.1–0.15)/mp-TiO2/bl-TiO2/FTO substrate at 3,000 r.p.m for 30 s. Finally, a Au counterelectrode was deposited by thermal evaporation. The active area of this electrode was fixed at 0.16 cm2."

In [None]:
# Formatting the Nature Nanotech paragraph according to ChemDataExtractor
p = Document(Heading(u'Solar cell fabrication'), Paragraph(ptext))

In [3]:
# Sentence version
sy = Paragraph(u"CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively. The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a vacuum oven for 24 h.")
sp = Paragraph(u"The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.")
an = Paragraph(u"The substrate was dried on a hot plate at 100 °C for 10 min.")

In [None]:
p

In [None]:
s

In [4]:
sp

In [None]:
sp.records.serialize()

In [None]:
sp.pos_tagged_tokens

In [None]:
nlp = spacy.load('en_core_web_sm') # Load English dictionary from spaCy

In [None]:
sp_nm = nlp(ptext)

for token in sp_nm:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

In [5]:
# Create Anneal class
#class Anneal(BaseModel):
#    time = StringType()
#    timeunits = StringType()
#    temp = StringType()
#    tempunits = StringType()

In [None]:
# Create Synthesis class
#class Synthesis(BaseModel):
    #precursor = StringType()
    #solvent = StringType()
    #time = StringType()
    #timeunits = StringType()
    #temp = StringType()
    #tempunits = StringType()

In [None]:
class SynthesisParser(BaseParser):

In [None]:
class AnnealParser(BaseParser):