# This notebook is for development of a tool to extract sentences describing numeric values of performance metrics

In [4]:
import sys
sys.path.insert(0, '../paperparser/read_paper')
import extract_sentences

sys.path.insert(0, '../paperparser/parse')

import pandas as pd
from sklearn.externals import joblib

import re

In [5]:
# Read one HTML journal article (path is relative to the notebook's working
# directory) using the project's extract_sentences module.
paper = extract_sentences.read_html_paper('journal_articles/Paper0.html')
# X_sentences is the collection of sentence strings searched by the cells below.
# sentences_record is not used in this part of the notebook -- presumably
# bookkeeping about where each sentence came from (TODO confirm in extract_sentences).
X_sentences, sentences_record = extract_sentences.extract_all_sentences(paper)

In [7]:
# How many sentences were extracted from the paper.
len(X_sentences)

851

In [13]:
# First pass at the filter: keep each sentence that mentions the metric name
# and also contains a percent unit somewhere in the same sentence.
pce_patterns = ['PCE']
pce_units_patterns = ['%', 'percent']
pce_sents = [
    sentence
    for sentence in X_sentences
    # the metric name may appear in any letter case ...
    if any(re.search(name, sentence, re.IGNORECASE) for name in pce_patterns)
    # ... while the unit has to match exactly as written
    and any(re.search(unit, sentence) for unit in pce_units_patterns)
]
pce_sents

['For example, when MAPbI3 was loaded on a mesoporous (mp)-TiO2 electrode by the sequential deposition of PbI2 and methylammonium iodide (MAI), a 15.0% power-conversion efficiency (PCE) was achieved under 1 sun illumination11.',
 'The Jsc, Voc and FF values obtained from the I–V curve of the reverse scan were 19.2 mA cm−2, 1.09 V and 0.69, respectively, yielding a PCE of 14.4% under standard AM 1.5 conditions.',
 'The average values from the J–V curves from the reverse and forward scans (Fig.\xa05a) exhibited a Jsc of 19.58 mA cm−2, Voc of 1.105 V, and FF of 76.2%, corresponding to a PCE of 16.5% under standard AM 1.5 G conditions.',
 'The best device also showed a very broad IPCE plateau of over 80% between 420 and 700 nm, as shown in Fig.\xa05b.',
 'One of these devices was certified by the standardized method in a photovoltaics calibration laboratory, confirming a PCE of 16.2% under AM 1.5 G full sun (Supplementary Fig.\xa06).',
 'In summary, we developed a solvent-engineering techn

Add check for numbers

In [14]:
# Keep each sentence that (1) mentions the metric name in any letter case,
# (2) contains at least one digit, and (3) contains a percent unit.
# The three checks are independent: the digit and the unit only have to occur
# somewhere in the sentence, not next to each other or next to the metric name.
#
# NOTE: 'PCE' is matched as a substring, so a sentence about "IPCE" is also
# kept (see the "IPCE plateau" sentence in the output of this cell).
pce_patterns = ['PCE']
pce_units_patterns = ['%', 'percent']
pce_sents = [
    sent
    for sent in X_sentences
    if any(re.search(name, sent, re.IGNORECASE) for name in pce_patterns)
    # raw string: '\d' in a plain string literal is an invalid escape sequence
    and re.search(r'\d+', sent)
    and any(re.search(unit, sent) for unit in pce_units_patterns)
]
pce_sents

['For example, when MAPbI3 was loaded on a mesoporous (mp)-TiO2 electrode by the sequential deposition of PbI2 and methylammonium iodide (MAI), a 15.0% power-conversion efficiency (PCE) was achieved under 1 sun illumination11.',
 'The Jsc, Voc and FF values obtained from the I–V curve of the reverse scan were 19.2 mA cm−2, 1.09 V and 0.69, respectively, yielding a PCE of 14.4% under standard AM 1.5 conditions.',
 'The average values from the J–V curves from the reverse and forward scans (Fig.\xa05a) exhibited a Jsc of 19.58 mA cm−2, Voc of 1.105 V, and FF of 76.2%, corresponding to a PCE of 16.5% under standard AM 1.5 G conditions.',
 'The best device also showed a very broad IPCE plateau of over 80% between 420 and 700 nm, as shown in Fig.\xa05b.',
 'One of these devices was certified by the standardized method in a photovoltaics calibration laboratory, confirming a PCE of 16.2% under AM 1.5 G full sun (Supplementary Fig.\xa06).',
 'In summary, we developed a solvent-engineering techn

Building a simple test to make sure I have it right.

In [21]:
# Hand-written sentences for checking the PCE filter. The trailing comment on
# each entry states whether the filter is expected to return that sentence.
test_sentences = [
    'This sentece should be found: I declare that the PCE has a value of 5% indeed yes sir.', # yes: digit and '%'
    'This sentece should be found: I declare that the PCE has a value of 5 percent indeed yes sir.', # yes: digit and 'percent'
    'This sentece should be found I think?: I declare that the PCE has a value of 5 percentage indeed yes sir.', # yes: 'percentage' contains 'percent'
    'This sentece should NOT be found: The PCE has a very high percent, yes it does', # no: unit word but no digit
    'This sentece should NOT be found: The PCE has a very high %, yes it does', # no: '%' but no digit
    'This sentece should NOT be found: The PCE has a very high percent, yes it does', # no: same text as the fourth entry
]

But first I have to wrap the code block in a function.

In [26]:
def pce_sentence_search(sentence_list):
    """Find sentences that report a numeric PCE (power conversion efficiency) value.

    A sentence is kept only when it passes all three checks:

    1. it mentions "PCE" (or the plural "PCEs") as a whole word, in any
       letter case;
    2. it contains at least one digit;
    3. it contains a percent unit: the character "%" or the text "percent"
       (case-sensitive; "percentage" also counts because it contains
       "percent").

    The checks are independent of each other: the digit and the unit only
    need to appear somewhere in the sentence, not adjacent to each other or
    to the metric name.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to filter.

    Returns
    -------
    list of str
        The matching sentences, in their original order. Each sentence
        appears at most once per occurrence in ``sentence_list``.
    """
    # Whole-word match: a plain 'PCE' substring search also fires on "IPCE",
    # which is a different quantity and must not count as a PCE mention.
    pce_patterns = [r'\bPCEs?\b']
    pce_units_patterns = ['%', 'percent']

    pce_sents = []
    for sent in sentence_list:
        if not any(re.search(name, sent, re.IGNORECASE) for name in pce_patterns):
            continue  # metric not mentioned
        if not re.search(r'\d', sent):
            continue  # nothing numeric in the sentence
        if any(re.search(unit, sent) for unit in pce_units_patterns):
            pce_sents.append(sent)
    return pce_sents

Let's try it on the test sentences

In [27]:
# Only the first three test sentences (the ones marked "yes") should come back.
pce_sentence_search(test_sentences)

['This sentece should be found: I declare that the PCE has a value of 5% indeed yes sir.',
 'This sentece should be found: I declare that the PCE has a value of 5 percent indeed yes sir.',
 'This sentece should be found I think?: I declare that the PCE has a value of 5 percentage indeed yes sir.']

It works!

## Generalizing to work with other performance metrics

I'd like this to work for VOC (open-circuit voltage) and JSC (short-circuit current density) as well.

In [29]:
def quantified_performance_sentence_search(sentence_list, metric='PCE'):
    """Find sentences that report a numeric value for a performance metric.

    A sentence is kept only when it passes all three checks:

    1. it mentions the metric name as a whole word, in any letter case;
    2. it contains at least one digit;
    3. it contains a unit pattern for that metric (case-sensitive).

    Supported metrics and the units looked for:

    * ``'PCE'`` -- "%" or the text "percent" anywhere in the sentence.
    * ``'VOC'`` -- a digit followed by "V" or "mV" (e.g. "1.09 V"), or the
      word "volt"/"volts".
    * ``'JSC'`` -- a digit followed by "A" or "mA" (e.g. "19.2 mA cm-2"), or
      the word "amp"/"amps".

    The metric mention and the unit only need to occur in the same sentence;
    no attempt is made to verify that the number belongs to the metric.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to filter.
    metric : str, optional
        One of ``'PCE'``, ``'VOC'`` or ``'JSC'`` (exact spelling). Defaults
        to ``'PCE'``.

    Returns
    -------
    list of str
        The matching sentences, in their original order.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names. Raised before any
        sentence is examined.
    """
    if metric == 'PCE':
        # Whole word only, so that "IPCE" does not count as a PCE mention.
        metric_patterns = [r'\bPCEs?\b']
        units_patterns = ['%', 'percent']
    elif metric == "VOC":
        metric_patterns = [r'\bVOC\b']
        # Require a digit before the unit letter: a bare 'V' followed by a
        # word character would match the "Vo" of "Voc" itself.
        units_patterns = [r'\d\s*m?V\b', r'\bvolts?\b']
    elif metric == "JSC":
        metric_patterns = [r'\bJSC\b']
        # Digit + optional milli prefix + 'A' at a word end matches
        # "19.2 mA cm-2" but not the "AM" of "AM 1.5".
        units_patterns = [r'\d\s*m?A\b', r'\bamps?\b']
    else:
        raise ValueError('{} is not a valid performance metric'.format(metric))

    return_sents = []
    for sent in sentence_list:
        if not any(re.search(name, sent, re.IGNORECASE) for name in metric_patterns):
            continue  # metric not mentioned
        if not re.search(r'\d', sent):
            continue  # nothing numeric in the sentence
        if any(re.search(unit, sent) for unit in units_patterns):
            return_sents.append(sent)
    return return_sents

In [30]:
# metric defaults to 'PCE', so the expected result is the first three test sentences.
quantified_performance_sentence_search(test_sentences)

['This sentece should be found: I declare that the PCE has a value of 5% indeed yes sir.',
 'This sentece should be found: I declare that the PCE has a value of 5 percent indeed yes sir.',
 'This sentece should be found I think?: I declare that the PCE has a value of 5 percentage indeed yes sir.']