In [1]:
import extract_sentences
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin 
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy.lang.en import English
import string

### Functions defining synthesis sentence predictor

In [2]:
# Custom scikit-learn transformer that normalises raw text ahead of vectorisation
class predictors(TransformerMixin):
    """Stateless pipeline step that runs ``clean_text`` over every input string."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data; hand back this transformer unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a new list holding ``clean_text(item)`` for each item of ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict (the step has nothing to configure)."""
        return {}

# Basic utility function to normalise a piece of text
def clean_text(text):
    """Return ``text`` without surrounding whitespace and converted to lower case."""
    trimmed = text.strip()
    return trimmed.lower()

# Shared spaCy pipeline, built lazily on first use so it is not re-created
# for every sentence that gets tokenized.
_SPACY_PARSER = None


# Create spacy tokenizer that parses a sentence and generates tokens
# these can also be replaced by word vectors
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return the filtered, normalised tokens.

    Each token is replaced by its lemma (lower-cased and stripped), except when
    the lemma is the placeholder ``"-PRON-"``, in which case the lower-cased
    token text is used instead. Tokens found in the English stop-word set are
    dropped, as are tokens that occur as a substring of ``string.punctuation``
    (this is a substring test, so empty tokens are dropped as well).

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _SPACY_PARSER
    if _SPACY_PARSER is None:
        # Building the pipeline is the expensive part; do it once and reuse it.
        _SPACY_PARSER = English()

    punctuations = string.punctuation
    tokens = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in _SPACY_PARSER(sentence)
    ]
    return [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]

def syn_sen_predictor(X_train, Y_train):
    """Build and fit the clean -> vectorize -> classify pipeline.

    The pipeline chains the ``predictors`` text cleaner, a unigram
    ``CountVectorizer`` that tokenizes with ``spacy_tokenizer``, and a
    ``LinearSVC`` classifier.

    Parameters
    ----------
    X_train : indexable sequence
        Training samples, read positionally from ``0`` to ``len(X_train) - 1``.
    Y_train : indexable sequence
        Training labels, read positionally from ``0`` to ``len(Y_train) - 1``.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
    # Assemble the three stages: clean, tokenize/vectorize, classify
    stages = [
        ("cleaner", predictors()),
        ('vectorizer', CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))),
        ('classifier', LinearSVC()),
    ]
    pipe = Pipeline(stages)

    # Materialise both inputs as plain lists via positional lookup before fitting
    samples = [X_train[idx] for idx in range(len(X_train))]
    labels = [Y_train[idx] for idx in range(len(Y_train))]
    pipe.fit(samples, labels)

    return pipe

def classify_sentences(model, X_sentences):
    """Split sentences into predicted-synthesis and predicted-non-synthesis lists.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` whose output can be converted to
        floats (e.g. the labels ``'1.0'`` / ``'0.0'``).
    X_sentences : sequence of str
        Sentences to classify.

    Returns
    -------
    tuple of (list, list)
        ``(synthesis_sentence, not_synthesis_sentence)``: sentences whose
        predicted label equals 1, and all remaining sentences, each in input
        order.
    """
    # Nothing to classify: skip the model call, which may reject empty input.
    if len(X_sentences) == 0:
        return [], []

    # Builtin ``float`` replaces the ``np.float`` alias, which NumPy has removed.
    predicted_output = np.asarray(model.predict(X_sentences)).astype(float)

    synthesis_sentence = []
    not_synthesis_sentence = []
    for sentence, label in zip(X_sentences, predicted_output):
        if label == 1:
            synthesis_sentence.append(sentence)
        else:
            not_synthesis_sentence.append(sentence)

    return synthesis_sentence, not_synthesis_sentence

### Data manipulation for training and testing datasets

In [3]:
# Load sample data
# Manually identified/tagged synthesis paragraphs, one list per training paper
train_p = [[117, 118, 119], [112], [117], [122, 125], [88]]
p = [1, 2, 3, 4, 5]
syn_yes = []
syn_no = []
for paper_id in p:
    # Paper ids start at 1 while train_p is zero-indexed, hence the "- 1"
    sen_yes_arr, sen_no_arr = extract_sentences.extract_sentences(
        'journal_articles/Paper' + str(paper_id) + '.html',
        train_p[paper_id - 1],
    )
    syn_yes.extend(sen_yes_arr)
    syn_no.extend(sen_no_arr)

# Label synthesis sentences 1.0 and all other sentences 0.0, then stack both frames
Syn_sen = pd.DataFrame({'x': syn_yes, 'y': np.ones(len(syn_yes))})
Syn_not_sen = pd.DataFrame({'x': syn_no, 'y': np.zeros(len(syn_no))})
Train = [Syn_sen, Syn_not_sen]
train_data = pd.concat(Train, ignore_index=True)

In [4]:
# Held-out paper id(s) and their manually tagged synthesis paragraphs
t = [0]
test_p = [[109]]
syn_test_yes = []
syn_test_no = []
for paper_id in t:
    # test_p is indexed directly by the paper id here
    sen_yes_arr, sen_no_arr = extract_sentences.extract_sentences(
        'journal_articles/Paper' + str(paper_id) + '.html',
        test_p[paper_id],
    )
    syn_test_yes.extend(sen_yes_arr)
    syn_test_no.extend(sen_no_arr)

# Label synthesis sentences 1.0 and all other sentences 0.0, then stack both frames
Syn_test_sen = pd.DataFrame({'X': syn_test_yes, 'Y': np.ones(len(syn_test_yes))})
Syn_test_not_sen = pd.DataFrame({'X': syn_test_no, 'Y': np.zeros(len(syn_test_no))})
Test = [Syn_test_sen, Syn_test_not_sen]
test_data = pd.concat(Test, ignore_index=True)

In [5]:
# Flatten each column into a plain list of strings (labels become e.g. '1.0' / '0.0')
X_train = list(map(str, train_data['x'].tolist()))
Y_train = list(map(str, train_data['y'].tolist()))
X_test = list(map(str, test_data['X'].tolist()))
Y_test = list(map(str, test_data['Y'].tolist()))

### Train predictor model

In [6]:
# Fit the cleaner -> vectorizer -> LinearSVC pipeline on the training sentences
syn_sen_model = syn_sen_predictor(X_train, Y_train)

### Test model and measure accuracy

In [7]:
# Predict labels for the held-out sentences and compare them with the tagged labels.
# Both Y_test and the training labels are strings, so this compares label strings.
pred_data = syn_sen_model.predict(X_test) 
print ("Accuracy:", accuracy_score(Y_test, pred_data))

Accuracy: 0.9894242068155111


In [10]:
# Split the held-out sentences by predicted class (label 1 -> synthesis)
synthesis_sentences, not_synthesis_sentences = classify_sentences(syn_sen_model, X_test)

In [11]:
# Show the held-out sentences that were predicted to be synthesis sentences
synthesis_sentences

['A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components.',
 'CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively.',
 'The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a vacuum oven for 24 h.',
 'The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.',
 'During the second spin-coating step, the substrate (around 1 cm × 1 cm) was treated with toluene drop-castin

### Try on a paper

In [27]:
import chemdataextractor
from chemdataextractor import Document
from chemdataextractor.reader import HtmlReader

def extract_all_sentences(paper_path):
    """Extract all sentences in a paper as a list of strings.

    The file is parsed with ChemDataExtractor's ``HtmlReader``; only document
    elements whose type is exactly ``Paragraph`` contribute sentences.

    Parameters
    ----------
    paper_path : str
        Path to the HTML file of the paper.

    Returns
    -------
    list of str
        ``str(sentence)`` for every sentence of every paragraph, in document
        order.
    """
    # The context manager closes the file handle even if parsing raises.
    with open(paper_path, 'rb') as f:
        doc = Document.from_file(f, readers=[HtmlReader()])

    sentences_list = []
    for element in doc.elements:
        # Exact type match (not isinstance), as in the original selection rule
        if type(element) is chemdataextractor.doc.text.Paragraph:
            sentences_list.extend(str(sentence) for sentence in element)

    return sentences_list

In [28]:
# Classify every paragraph sentence of a full paper, not just the tagged split.
# NOTE(review): Paper0.html is the same file the test split was built from.
X_sentences = extract_all_sentences('journal_articles/Paper0.html')
synthesis_sentences, not_synthesis_sentences = classify_sentences(syn_sen_model, X_sentences)
# Display the sentences predicted as synthesis
synthesis_sentences

['Furthermore, it was reported that the uniformity of the perovskite films depended on the thickness of the TiO2 compact layer, and modification of the spinning conditions could not achieve 100% surface coverage20.',
 'We see that the formation of the perovskite phase is accompanied by the complete transformation of the MAI–PbI2–DMSO at 130 °C, whereas both MAI–PbI2–DMSO and perovskite phases coexist at 100 °C.',
 'As shown in Fig.\xa02d, at the initial stage during spinning, the film is composed of MAI and PbI2 dissolved in the DMSO/GBL solvent mixture, whereas in the intermediate stage, the composition of the film is concentrated by the evaporation of GBL.',
 'A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components.',
 'CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (4