# Examples using the various tools that make up PaperParser

In [1]:
import sys
sys.path.insert(0, '../paperparser/read_paper')
import extract_sentences
import sentence_classifier
import search_paper_for_perform_sentences

sys.path.insert(0, '../paperparser/parse')
import anneal
import order
import spincoat
import pce

import pandas as pd
from sklearn.externals import joblib

## Reading a paper

The first thing to do is take a paper and load it using `chemdataextractor`'s .html reader

### Read paper in HTML format as an input and store as a chemdataextractor Document type

In [15]:
# Read the HTML article from disk (path is relative to the notebook's working
# directory). The result is kept in `paper` and reused by the cells below.
paper = extract_sentences.read_html_paper('journal_articles/Paper0.html')

#### Extract all sentences and keep track of each sentence's original location (element index in the document and sentence index within the element)

In [16]:
# X_sentences: the sentences extracted from `paper`.
# sentences_record: location info for the sentences; later cells read entry[0]
# as the element index in the document and entry[1] as the sentence index
# inside that element (presumably one entry per sentence, aligned with X_sentences).
X_sentences, sentences_record = extract_sentences.extract_all_sentences(paper)

#### Load the pre-trained model used to pick out the sentences in the paper that contain synthesis steps

In [17]:
# NOTE(review): the path is relative to the kernel's working directory; the
# recorded run of this cell raised FileNotFoundError, so 'syn_sen_model.pkl'
# must be placed next to the notebook (or the path corrected) before running.
# joblib.load unpickles the file, so only load model files from a trusted source.
syn_sen_model = joblib.load('syn_sen_model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'syn_sen_model.pkl'

#### Using model to classify sentences

In [None]:
# Classify every extracted sentence with the loaded model. Three values come
# back: pred_data (used below as the 'Tag' column, one value per sentence) and,
# judging by the names, the sentences split into synthesis / non-synthesis groups.
pred_data, synthesis_sentences, not_synthesis_sentences = sentence_classifier.classify_sentences(syn_sen_model, X_sentences)

In [None]:
synthesis_sentences

#### Organizing sentences into a dataframe 

In [45]:
# One row per extracted sentence: its text, where it sits in the document
# (element index, then sentence index inside that element) and the tag
# assigned by the classifier.
df_sentences = pd.DataFrame(
    {
        'Sentences': X_sentences,
        'Element # in doc': [location[0] for location in sentences_record],
        'Sentence_index_in_para': [location[1] for location in sentences_record],
        'Tag': pred_data,
    }
)

In [46]:
# Show only the rows whose tag equals 1.0 (the sentences flagged by the classifier).
df_sentences.loc[df_sentences['Tag'].eq(1.0)]

Unnamed: 0,Sentences,Element # in doc,Sentence_index_in_para,Tag
124,The spin-coated layer formed with the solvent ...,92,15,1.0
152,We see that the formation of the perovskite ph...,96,2,1.0
160,"Accordingly, the formation of the intermediate...",99,0,1.0
187,"Generally, the average value of the efficiency...",100,21,1.0
188,For a deeper understanding of the dependence o...,103,0,1.0
217,CH3NH3I (MAI) and CH3NH3Br (MABr) were first s...,109,2,1.0
218,The precipitate was recovered by evaporation a...,109,3,1.0
220,The resulting solution was coated onto the mp-...,109,5,1.0
221,"During the second spin-coating step, the subst...",109,6,1.0
223,The substrate was dried on a hot plate at 100 ...,109,8,1.0


#### Using synthesis parameters parsers - spincoat and anneal - on sentences tagged as containing synthesis steps

In [34]:
# Run the spin-coating parser on each tagged sentence and print one result per
# line; a sentence with no spin-coating information prints an empty list.
for syn_sentence in synthesis_sentences:
    print(spincoat.parse_spincoat(syn_sentence))

[]
[]
[]
[]
[]
[]
[]
[{'spin_coat': [{'spds': [{'spdvalue': '1,000', 'spdunits': 'r.p.m'}, {'spdvalue': '5,000', 'spdunits': 'r.p.m'}], 'times': [{'timevalue': '10', 'timeunits': 's'}, {'timevalue': '20', 'timeunits': 's'}]}]}]
[]
[]
[{'spin_coat': [{'spds': [{'spdvalue': '3,000', 'spdunits': 'r.p.m'}], 'times': [{'timevalue': '30', 'timeunits': 's'}]}]}]
[]


In [49]:
# Run the annealing parser on each tagged sentence and print one result per
# line; a sentence with no annealing information prints an empty list.
for syn_sentence in synthesis_sentences:
    print(anneal.parse_anneal(syn_sentence))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[{'anneal': [{'temps': [{'tempvalue': '100', 'tempunits': '°C'}], 'times': [{'timevalue': '10', 'timeunits': 'min'}]}]}]
[]
[]


#### Using order function for a specified paragraph

In [55]:
# Element 109 is the document element where several of the tagged synthesis
# sentences sit (see the 'Element # in doc' column of the table above).
paper[109]

In [56]:
# syn_order returns two values for the chosen element. Only steps_dict is
# displayed: it maps an integer key to the list of step keywords found
# (presumably the key is the sentence index within the paragraph - TODO confirm).
steps_order, steps_dict = order.syn_order(paper[109])
steps_dict

{0: [],
 1: ['spin-coat'],
 2: [],
 3: ['dry'],
 4: [],
 5: ['coat', 'spin-coat'],
 6: ['spin-coat', 'dry'],
 7: ['spin-coat'],
 8: ['dry'],
 9: ['spin-coat'],
 10: [],
 11: []}

### Using device performance metrics parsers - PCE - on identified sentences

In [1]:
import sys
sys.path.insert(0, '../paperparser/read_paper')
import extract_sentences
import sentence_classifier
import search_paper_for_perform_sentences

sys.path.insert(0, '../paperparser/parse')
import anneal
import order
import spincoat
import pce as pceparser

import pandas as pd
from sklearn.externals import joblib

The first step is to find the relevant sentences to pass to the parser.

In [2]:
# Takes the path of the HTML paper (relative to the working directory) and
# returns a list of sentence strings, displayed in the next cell.
relevant_sentences_to_pce = search_paper_for_perform_sentences.list_perform_sents('journal_articles/Paper0.html')

In [3]:
relevant_sentences_to_pce

['For example, when MAPbI3 was loaded on a mesoporous (mp)-TiO2 electrode by the sequential deposition of PbI2 and methylammonium iodide (MAI), a 15.0% power-conversion efficiency (PCE) was achieved under 1 sun illumination11.',
 'The Jsc, Voc and FF values obtained from the I–V curve of the reverse scan were 19.2 mA cm−2, 1.09 V and 0.69, respectively, yielding a PCE of 14.4% under standard AM 1.5 conditions.',
 'The average values from the J–V curves from the reverse and forward scans (Fig.\xa05a) exhibited a Jsc of 19.58 mA cm−2, Voc of 1.105 V, and FF of 76.2%, corresponding to a PCE of 16.5% under standard AM 1.5 G conditions.',
 'The best device also showed a very broad IPCE plateau of over 80% between 420 and 700 nm, as shown in Fig.\xa05b.',
 'One of these devices was certified by the standardized method in a photovoltaics calibration laboratory, confirming a PCE of 16.2% under AM 1.5 G full sun (Supplementary Fig.\xa06).',
 'In summary, we developed a solvent-engineering techn

Wow! Look at that output. It's looking really nice. A whole paper narrowed down to just those sentences, and they all have quantitative info on the PCE! I don't know about you, but I am impressed...

Back to business: these sentences can be fed to the PCE parser to extract values and relations.

In [4]:
# Parse every selected sentence; the result holds one list per input sentence.
# NOTE(review): the recorded output is six empty lists, i.e. no PCE record was
# extracted from any of these sentences even though several of them state a
# PCE value - worth investigating before relying on this step.
parsed_pce_info = pceparser.parse_pce(relevant_sentences_to_pce)
parsed_pce_info

[[], [], [], [], [], []]

In [5]:
# Sanity check on a single hand-written sentence; in the recorded run this one
# does yield a PCE record (value '4.73', units '%').
pceparser.parse_pce(['Solar cells containing 1 display PCEs up to 4.73 %.'])

[[{'pce': [{'value': '4.73', 'units': '%'}]}]]

In [2]:
# NOTE: this cell fails with NameError in the recorded run - there is no
# `PCEParser`; the spelling that works in the next cell is `PceParser`.
# NOTE(review): neither `Sentence` nor `PceParser` is imported in any visible
# cell, so this cell relies on names defined elsewhere in the session.
Sentence.parsers = [PCEParser()]
Sentence('Solar cells containing 1 display PCEs up to 4.73 %.').records.serialize()

NameError: name 'PCEParser' is not defined

In [3]:
# Assigning to Sentence.parsers rebinds the attribute on the class itself, so
# every Sentence created afterwards uses only the PCE parser.
# NOTE(review): `Sentence` and `PceParser` are not imported in any visible cell.
Sentence.parsers = [PceParser()]
Sentence('Solar cells containing 1 display PCEs up to 4.73 %.').records.serialize()

[{'pce': [{'value': '4.73', 'units': '%'}]}]

In [21]:
def parse_pce(list_of_sentences):
    """Parse sentences for quantified PCE information.

    Takes a list of sentences and parses each one for quantified PCE
    information and relationships to chemicals/chemical labels.

    Parameters
    ----------
    list_of_sentences : iterable of str
        Plain-text sentences to parse.

    Returns
    -------
    list
        One entry per input sentence, in input order: the result of
        ``Sentence(text).records.serialize()`` for that sentence. An empty
        input gives an empty list.
    """
    # Sentence.parsers is a list stored on the class and shared by every
    # Sentence. Register the PCE parser only if it is not there yet, so that
    # calling this function repeatedly does not keep growing that shared list
    # with duplicate parsers.
    if not any(isinstance(parser, PceParser) for parser in Sentence.parsers):
        Sentence.parsers.append(PceParser())

    return [Sentence(text).records.serialize() for text in list_of_sentences]

In [22]:
# Call the notebook-local parse_pce defined in the previous cell on one test sentence.
parse_pce(['Solar cells containing 1 display PCEs up to 4.73 %.'])

[[{'pce': [{'value': '4.73', 'units': '%'}]}]]

In [23]:
# NOTE(review): in the recorded run `pce` is not the parser module - the error
# below shows it is bound to an `And` object, so this call fails. An earlier
# cell imports the module as `pceparser`; presumably `pceparser.parse_pce(...)`
# is what was intended here - confirm and re-run.
pce.parse_pce(['Solar cells containing 1 display PCEs up to 4.73 %.'])

AttributeError: 'And' object has no attribute 'parse_pce'