## Initialization

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound, BaseModel, \
                                    StringType, ListType, ModelType
from chemdataextractor.doc import Heading, Paragraph, Sentence
from chemdataextractor.parse.actions import join
from chemdataextractor.parse import R, I, W, Optional, merge, ZeroOrMore
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

### Glossary for Regular Expression Parsing

`.parse`:

* `R(pattern)`: match token text with regular expression
* `I(match)`: case-insensitive match token text
* `W(match)`: match token text exactly
* `T(match)`: match tag exactly
* `Optional(expr)`: match the given expression if it is present, otherwise match nothing (zero or one occurrence)
* `merge(tokens, start, result)`: join tokens into a single string with no spaces
* `ZeroOrMore(expr)`: optional repetition of zero or more of the given expression
* `OneOrMore(expr)`: repetition of one or more of the given expression
* `Not(expr)`: check ahead to disallow a match with the given parse expression

`.parse.actions`:

* `strip_stop`: removes trailing full stop from tokens
* `join`: join tokens into a single string with spaces between

### Outlining Annealing Parameter Extraction

    {'Synthesis' : {
                'spincoat' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    },
                'anneal' : {
                    'time' : {
                        'value' : (number),
                        'units' : 's',
                        },
                    'temperature' : {
                        'value' : (number),
                        'units' : 'K',
                        },
                    },
                etc...
            }
    }

Structuring my code to fit the format:

    create 'anneal' class to parse annealing conditions
        create 'time' subclass in the anneal class
        time: (value) (units)
        create 'temperature' subclass in the anneal class
        temperature: (value) (units)

In [3]:
# Data models for annealing conditions: AnnealTemp and AnnealTime each hold one parsed value,
# and Anneal (defined below them) groups lists of both.
class AnnealTemp(BaseModel):
    """
    A single annealing temperature: the numeric value (kept as a string) and its units.
    """
    tempvalue = StringType()
    tempunits = StringType(contextual=True)
    
class AnnealTime(BaseModel):
    """
    A single annealing time: the numeric value (kept as a string) and its units.
    """
    timevalue = StringType()
    timeunits = StringType(contextual=True)

class Anneal(BaseModel):
    """
    One annealing step: the list of temperatures and the list of times parsed for it.
    """
    temps = ListType(ModelType(AnnealTemp))
    times = ListType(ModelType(AnnealTime))

### Associating Parameters with a Chemical

In [4]:
# Add an `anneal` field (a list of Anneal models) to ChemDataExtractor's Compound,
# so parsed annealing steps appear under the 'anneal' key of each serialized record.
Compound.anneal = ListType(ModelType(Anneal))

### Defining Parameters for the Parser

In [24]:
# Delimiter: a single punctuation token that may separate the pieces of a phrase.
# It is matched but hidden, so it never shows up in the parse result.
delim = R(r'^[;:,\./]$').hide()

# Annealing temperature: prefix word, value and units.
# Raw strings are used for every pattern so that `\.` and `\d` reach the regex
# engine untouched instead of being treated as (invalid) string escapes.
tempprefix = (I('at') | I('or')).hide()
tempunits = (W('°') + R(r'^[CFK]\.?$'))('tempunits').add_action(merge)
# Two- to four-digit number token.
tempvalue = R(r'^\d{2,4}$')('tempvalue').add_action(merge) + Optional(delim)

# Annealing time: prefix word, value and units.
timeprefix = I('for').hide()
# Each unit is spelled out as an explicit alternation. Patterns built from stacked
# optional groups (e.g. m?(in|inute)?(s)?) also accept ordinary words and fragments
# such as "in", "r" or "ou", and miss forms like "second" and "secs".
timeunits = (R(r'^(s|secs?|seconds?)$') | R(r'^(m|mins?|minutes?)$') | R(r'^(h|hrs?|hours?)$'))('timeunits').add_action(join) + Optional(delim)
# One- or two-digit number token.
timevalue = R(r'^\d{1,2}$')('timevalue') + Optional(delim)

# Putting everything together.
# `temps` / `times` accept several values separated by hidden filler tokens
# (prefix words, units, delimiters, "and"), e.g. "100 °C or 150".
temp = (tempvalue)('temp')
temps = (temp + ZeroOrMore(ZeroOrMore(tempprefix | tempunits | delim | W('and')).hide() + temp))('temps')
time = (timevalue)('time')
times = (time + ZeroOrMore(ZeroOrMore(timeunits | delim | W('and')).hide() + time))('times')

# Full phrase: "<at|or> <temps> <temp units> for <times> <time units>", with optional delimiters in between.
annealing = (tempprefix + temps + Optional(delim) + tempunits + Optional(delim) + timeprefix + Optional(delim) + times + Optional(delim) + timeunits + Optional(delim))('annealing')

In [25]:
class AnnealParser(BaseParser):
    """Parser whose root expression is the `annealing` grammar."""
    root = annealing

    def interpret(self, result, start, end):
        """
        Convert one `annealing` match into a Compound record.

        The first `tempunits` / `timeunits` text found directly under the match is
        applied to every temperature / time value in that match.

        :param result: parse result element for the matched phrase
        :param start: start position of the match (unused here)
        :param end: end position of the match (unused here)
        :return: generator yielding a single Compound with one Anneal entry
        """
        shared_temp_units = first(result.xpath('./tempunits/text()'))
        shared_time_units = first(result.xpath('./timeunits/text()'))

        compound = Compound()
        anneal_step = Anneal()

        for temp_node in result.xpath('./temps/temp'):
            temp_value = first(temp_node.xpath('./tempvalue/text()'))
            anneal_step.temps.append(
                AnnealTemp(tempvalue=temp_value, tempunits=shared_temp_units)
            )

        for time_node in result.xpath('./times/time'):
            time_value = first(time_node.xpath('./timevalue/text()'))
            anneal_step.times.append(
                AnnealTime(timevalue=time_value, timeunits=shared_time_units)
            )

        compound.anneal.append(anneal_step)
        yield compound

### Parsers

In [26]:
# Replace the parser list on the Paragraph class with a single AnnealParser instance.
Paragraph.parsers = [AnnealParser()]

## Testing Outputs

In [27]:
# Smoke test: one sentence containing a single temperature and a single time.
sp = 'The substrate was dried on a hot plate at 100 °C for 10 min.'
s = Document(sp)

# Serialize the extracted records so the nested anneal structure can be inspected.
s.records.serialize()

[{'anneal': [{'temps': [{'tempvalue': '100', 'tempunits': '°C'}],
    'times': [{'timevalue': '10', 'timeunits': 'min'}]}]}]

In [28]:
def parse_anneal(anneal_str):
    """
    Extract annealing parameters (temperatures and times) from a piece of text.

    The string is wrapped in a ChemDataExtractor Paragraph, and the records
    produced by the parsers registered on Paragraph are returned in serialized form.
    """
    return Paragraph(anneal_str).records.serialize()

In [29]:
# Run the helper on the `sp` test sentence.
parse_anneal(sp)

[{'anneal': [{'temps': [{'tempvalue': '100', 'tempunits': '°C'}],
    'times': [{'timevalue': '10', 'timeunits': 'min'}]}]}]

In [30]:
# Sentence that also contains a spin-coating step ("at 4,000 r.p.m. for 45 s") ahead of the drying step.
sp2 = "The mesoporous TiO2 films were then infiltrated with CH3NH3SnI3−xBrx by spin coating at 4,000 r.p.m. for 45 s and dried at 125 °C for 30 min to remove the solvent."
parse_anneal(sp2)

[{'anneal': [{'temps': [{'tempvalue': '125', 'tempunits': '°C'}],
    'times': [{'timevalue': '30', 'timeunits': 'min'}]}]}]

In [31]:
# Another sentence mixing a spin-coating step with a drying step, here with a two-digit temperature.
sp3 = "The mesoporous TiO2 films were then infiltrated with PbI2 by spin coating at 6,500 r.p.m. for 90 s and dried at 70 °C for 30 min."
parse_anneal(sp3)

[{'anneal': [{'temps': [{'tempvalue': '70', 'tempunits': '°C'}],
    'times': [{'timevalue': '30', 'timeunits': 'min'}]}]}]

In [32]:
# Two alternative temperatures joined by "or", followed by a single time.
sp4 = "The substrate was then dried on a hot plate at 100 °C or 150 °C for 10 min."
parse_anneal(sp4)

[{'anneal': [{'temps': [{'tempvalue': '100', 'tempunits': '°C'},
     {'tempvalue': '150', 'tempunits': '°C'}],
    'times': [{'timevalue': '10', 'timeunits': 'min'}]}]}]

**In summary:**  Totally works on all of the test papers downloaded by Linnette! (Note that paper 5 does not have an annealing step.)

## Importing test synthesis paragraph and sentence

In [20]:
# Formatting the Nature Nanotech paragraph according to ChemDataExtractor
# NOTE(review): `ptext` is not assigned in any cell visible in this notebook, and this cell's
# execution count is out of order -- it must be defined before this cell runs; TODO confirm.
p = Document(Heading(u'Solar cell fabrication'), Paragraph(ptext))
p

In [10]:
# Sentence version
# Note: this rebinds `sp` and `s` to the same sentence and Document built in the testing section.
sp = 'The substrate was dried on a hot plate at 100 °C for 10 min.'
s = Document(sp)

In [None]:
# Display the paragraph-level Document.
p

In [4]:
# Display the sentence-level Document.
s