In [2]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import extract_sentences_original
import string

In [3]:
# Punctuation characters as one str; spacy_tokenizer tests tokens against it
# to filter punctuation out of the token list.
punctuations = string.punctuation

In [4]:
# spaCy English language object; spacy_tokenizer calls it on each sentence
# to split the text into tokens.
from spacy.lang.en import English
parser = English()

In [5]:
# Custom transformer that normalizes raw text with clean_text (no spaCy involved in this step)
class predictors(TransformerMixin):
    """Pipeline step that normalizes every raw text with ``clean_text``.

    The step is stateless: fitting learns nothing and no hyper-parameters
    are exposed.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text(text)`` for each item of ``X``, in order."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """Return an empty parameter mapping, whatever ``deep`` is."""
        return dict()

In [6]:
# Basic utility function to clean the text 
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace, converted to lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [7]:
#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level ``parser`` and normalize the tokens.

    Every token is replaced by its lower-cased, stripped lemma. Tokens whose
    lemma is the ``"-PRON-"`` placeholder, or whose lemma is empty, fall back
    to the lower-cased token text. Empty tokens, stop words (``stopwords``)
    and tokens made up only of characters from ``punctuations`` are dropped.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        The normalized tokens that survived filtering, in document order.
    """
    kept = []
    for tok in parser(sentence):
        lemma = tok.lemma_
        # "-PRON-" is a placeholder lemma, not a word, so use the surface form.
        # An empty lemma would otherwise discard the token altogether
        # (presumably what a pipeline without a lemmatizer yields -- TODO confirm).
        if lemma == "-PRON-" or not lemma:
            word = tok.lower_
        else:
            word = lemma.lower()
        word = word.strip()

        if not word or word in stopwords:
            continue
        # `punctuations` is a str, so `word in punctuations` would be a
        # substring test: it misses runs such as "..." or "--". Check the
        # characters one by one to drop every all-punctuation token.
        if all(ch in punctuations for ch in word):
            continue
        kept.append(word)
    return kept

In [8]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
# The classifier is a LinearSVC with default settings; the vectorizer counts
# unigrams over the tokens produced by spacy_tokenizer.
classifier = LinearSVC()
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=spacy_tokenizer,
)

In [9]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
# Chain the three stages: text cleaning -> count vectorization -> classification.
pipe = Pipeline(steps=[
    ("cleaner", predictors()),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

In [10]:
# Load sample data
#manually identified/tagged synthesis paragraphs
# train_p[n - 1] holds the tagged synthesis paragraphs passed along with PaperN.html.
train_p = [[117, 118, 119], [112], [117], [122, 125], [88],
           [142, 146], [130], [115], [123, 125], [105]]
p = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# TODO: this is still a machine-local location; it is kept in one place so it
# is easy to repoint.
TRAIN_PAPER_TEMPLATE = r'C:\Users\neels\Desktop\Data Science\paper-parser\examples\journal_articles\Paper{}.html'

# A paper number outside 1..len(train_p) would raise an IndexError or, for 0
# and negative numbers, silently pick another paper's paragraphs through
# negative indexing -- fail loudly instead.
assert all(1 <= paper_no <= len(train_p) for paper_no in p), \
    "every paper number in p needs an entry in train_p"

syn_yes = []
syn_no = []
for paper_no in p:
    sen_yes_arr, sen_no_arr = extract_sentences_original.extract_sentences(
        TRAIN_PAPER_TEMPLATE.format(paper_no), train_p[paper_no - 1])
    syn_yes.extend(sen_yes_arr)
    syn_no.extend(sen_no_arr)

# Label the sentences: 1.0 for the first list returned by extract_sentences,
# 0.0 for the second.
Syn_sen = pd.DataFrame({'x': syn_yes, 'y': np.ones(len(syn_yes))})
Syn_not_sen = pd.DataFrame({'x': syn_no, 'y': np.zeros(len(syn_no))})
Train = [Syn_sen, Syn_not_sen]
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
train_data = pd.concat(Train, ignore_index=True)


In [11]:
#load test data
t = [0]
test_p = [[109]]

# TODO: this is still a machine-local location; it is kept in one place so it
# is easy to repoint.
TEST_PAPER_TEMPLATE = r'C:\Users\neels\Desktop\Data Science\paper-parser\examples\journal_articles\Paper{}.html'

# Each entry of t is used both as the paper number in the file name and as the
# 0-based index into test_p, so it must be a valid non-negative index
# (a negative value would silently wrap around).
assert all(0 <= paper_no < len(test_p) for paper_no in t), \
    "every paper number in t needs an entry in test_p"

syn_test_yes = []
syn_test_no = []
for paper_no in t:
    sen_yes_arr, sen_no_arr = extract_sentences_original.extract_sentences(
        TEST_PAPER_TEMPLATE.format(paper_no), test_p[paper_no])
    syn_test_yes.extend(sen_yes_arr)
    syn_test_no.extend(sen_no_arr)

# Label the sentences: 1.0 for the first list returned by extract_sentences,
# 0.0 for the second. Column names are upper-case here ('X', 'Y').
Syn_test_sen = pd.DataFrame({'X': syn_test_yes, 'Y': np.ones(len(syn_test_yes))})
Syn_test_not_sen = pd.DataFrame({'X': syn_test_no, 'Y': np.zeros(len(syn_test_no))})
Test = [Syn_test_sen, Syn_test_not_sen]
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
test_data = pd.concat(Test, ignore_index=True)

In [12]:
#making list of strings to use it for pipeline
# Iterate over the column values directly instead of looking every row up by
# label; the label lookup only worked because the frames carry a 0..n-1 index.
# Sentences and labels are both converted to str (labels become '1.0' / '0.0').
X_train = [str(sentence) for sentence in train_data['x']]
Y_train = [str(label) for label in train_data['y']]
X_test = [str(sentence) for sentence in test_data['X']]
Y_test = [str(label) for label in test_data['Y']]

In [13]:
# Fit the pipeline on the training sentences (accuracy is measured in the next cell)
# X_train / Y_train are already plain lists, so they are passed as they are.
# Pipeline.fit returns the fitted pipeline, which is kept in `a`.
a = pipe.fit(X_train, Y_train)

In [14]:
# Predict a label for every test sentence; the lists are passed directly
# rather than being copied element by element.
pred_data = a.predict(X_test)
# Print each sentence next to its predicted label.
for sample, pred in zip(X_test, pred_data):
    print(sample, pred)
print("Accuracy:", accuracy_score(Y_test, pred_data))

A dense blocking layer of TiO2 (bl-TiO2, ∼70 nm in thickness) was deposited onto a F-doped SnO2 (FTO, Pilkington, TEC8) substrate by spray pyrolysis, using a 20 mM titanium diisopropoxide bis(acetylacetonate) solution (Aldrich) at 450 °C to prevent direct contact between the FTO and the hole-conducting layer. 0.0
A 200–300-nm-thick mesoporous TiO2 (particle size: about 50 nm, crystalline phase: anatase) film was spin-coated onto the bl-TiO2/FTO substrate using home-made pastes14 and calcining at 500 °C for 1 h in air to remove organic components. 0.0
CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively. 1.0
The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a v

Generally, γ-butyrolactone (GBL), N,N-dimethylformamide, DMSO and N-methyl-2-pyrrolidone are used as effective solvents for lead halides and MAI. 0.0
However, simple spin-coating did not yield a homogeneous perovskite layer having uniform thickness over a large area20, although the convective spreading flow due to centrifugal force was applied to the slowly evaporating solvents. 0.0
Previously, we reported the formation of a pillared structure with an island-type upper layer10, and proposed the importance of a homogeneous upper layer on the mp-TiO2 base; however, we were unable to deposit uniform thin films by the solution process. 0.0
Furthermore, it was reported that the uniformity of the perovskite films depended on the thickness of the TiO2 compact layer, and modification of the spinning conditions could not achieve 100% surface coverage20. 0.0
This limited the applicability of the solution process. 0.0
Here, we report the solution-process fabrication of efficient perovskite solar 

The FTIR spectra of MAI, PbI2, DMSO–PbI2–DMSO, MAI–PbI2–DMSO and MAPbI3 are provided in Supplementary Fig. 3 for functional group comparison. 0.0
The detection of both the N–H and S–O deformation modes in the intermediate phase guarantees the successful inclusion of a solvent molecule (DMSO) and MAI into the PbI2. 0.0
Furthermore, the C=C mode indicative of the toluene dripping solvent is not detected. 0.0
The conversion of the intermediate phase into perovskite with annealing temperature was monitored by in situ high-temperature XRD. 0.0
The low-angle diffraction peaks in the XRD pattern of Fig. 2c clearly show that, with increasing temperature, the putative MAI–PbI2–DMSO disappears and new peaks are observed due to the appearance of MAPbI3 perovskite. 0.0
We see that the formation of the perovskite phase is accompanied by the complete transformation of the MAI–PbI2–DMSO at 130 °C, whereas both MAI–PbI2–DMSO and perovskite phases coexist at 100 °C. 1.0
On the basis of the above observ

Interestingly, the efficiency measured in the reverse scan did not vary greatly with the mp-TiO2 thickness, whereas the forward scan yielded a pronounced variation, resulting in a large discrepancy between the two scan directions. 0.0
Such a strong dependence of the measured efficiency on the scan direction with mp-TiO2 thickness suggests that solar cells fabricated from MAPbX3 perovskite materials operate differently than other types of cells such as organic, dye-sensitized, and conventional Si solar cells. 0.0
The stepwise increase or decrease in the bias voltage, corresponding to a reverse or forward scan, respectively, requires a charge redistribution to reach a new equilibrium state. 0.0
These characteristics of perovskite solar cells may be related to the large diffusion capacitance of such cells operating under reverse or forward biases24. 0.0
The large capacitance is assumed to be related to the underestimation in the forward scan and overestimation in the reverse scan by charg

PubMed 0.0
Article 0.0
Google Scholar 0.0
12. 0.0
Liu, M., Johnston, M. 0.0
B. 0.0
& Snaith, H. 0.0
J. 0.0
. 0.0
Nature 501, 395–398 (2013). 0.0
Show context for reference 12 0.0
CAS 0.0
PubMed 0.0
Article 0.0
Google Scholar 0.0
13. 0.0
Noh, J. 0.0
H., Im, S. 0.0
H., Heo, J. 0.0
H., Mandal, T. 0.0
N. 0.0
& Seok, S. 0.0
I. 0.0
. 0.0
Nano Lett. 13, 1764–1769 (2013). 0.0
Show context for reference 13 0.0
CAS 0.0
PubMed 0.0
Article 0.0
Google Scholar 0.0
14. 0.0
Malinkiewicz, O. et al. 0.0
. 0.0
Nature Photon. 0.0
6, 128–132 (2014). 0.0
Show context for reference 14 0.0
Google Scholar 0.0
15. 0.0
Waleed, W. 0.0
A. 0.0
& Etgar, L. 0.0
. 0.0
Energy Environ. Sci. 6, 3249–3253 (2013). 0.0
Show context for reference 15 0.0
CAS 0.0
Article 0.0
Google Scholar 0.0
16. 0.0
You, J. et al. 0.0
. 0.0
ACS Nano 8, 1674–1680 (2014). 0.0
Show context for reference 16 0.0
CAS 0.0
PubMed 0.0
Article 0.0
Google Scholar 0.0
17. 0.0
Stranks, S. 0.0
D. et al. 0.0
. 0.0
Science 342, 341–344 (2013). 0.0
Show cont

Article 0.0
Google Scholar 0.0
Xing, G. et al. 0.0
. 0.0
Science 342, 344–347 (2013). 0.0
CAS 0.0
PubMed 0.0
Article 0.0
Google Scholar 0.0
Jeng, J-U. 0.0
et al. 0.0
. 0.0
Adv. Mater. 25, 3727–3732 (2013). 0.0
CAS 0.0
PubMed 0.0
Article 0.0
Google Scholar 0.0
Eperon, G. 0.0
E., Burlakov, V. 0.0
M., Docampo, P., Goriely, A. 0.0
& Snaith, H. 0.0
J. 0.0
. 0.0
Adv. Funct. Mater. 24, 151–157 (2014). 0.0
CAS 0.0
Article 0.0
Google Scholar 0.0
Beckmann, P. 0.0
A. 0.0
. 0.0
Cryst. Res. Technol. 45, 455–460 (2010). 0.0
CAS 0.0
Article 0.0
Google Scholar 0.0
Miyamae, H. et al. 0.0
. 0.0
Chem. Lett. 6, 663–664 (1980). 0.0
Article 0.0
Google Scholar 0.0
Herman, M. et al. 0.0
. 0.0
Int. J. 0.0
Photoenergy 2012, 151452 (2012) 0.0
Article 0.0
Google Scholar 0.0
Koide, N. 0.0
& Han, I. 0.0
. 0.0
Rev. Sci. Instrum. 75, 2828–2831 (2004). 0.0
CAS 0.0
Article 0.0
Google Scholar 0.0
ISSN 1476-4660 (online) 0.0
About us 0.0
Press releases 0.0
Press office 0.0
Contact us 0.0
Journals A-Z 0.0
Articles by subj

In [15]:
# np.float was only an alias of the builtin float and has been removed from
# recent NumPy releases; the builtin gives the same conversion.
predicted_output = pred_data.astype(float)

# Split the test sentences by predicted label: 1 -> synthesis, anything else -> not.
Synthesis_sentence = []
Not_Synthesis_sentence = []
for sentence, label in zip(X_test, predicted_output):
    if label == 1:
        Synthesis_sentence.append(sentence)
    else:
        Not_Synthesis_sentence.append(sentence)

In [16]:
# Display the test sentences whose predicted label was 1 (synthesis).
Synthesis_sentence

['CH3NH3I (MAI) and CH3NH3Br (MABr) were first synthesized by reacting 27.86 ml CH3NH2 (40% in methanol, Junsei Chemical) and 30 ml HI (57 wt% in water, Aldrich) or 44 ml HBr (48 wt% in water, Aldrich) in a 250 ml round-bottom flask at 0 °C for 4 h with stirring, respectively.',
 'The precipitate was recovered by evaporation at 55 °C for 1 h. MAI and MABr were dissolved in ethanol, recrystallized from diethyl ether, and dried at 60 °C in a vacuum oven for 24 h.',
 'The resulting solution was coated onto the mp-TiO2/bl-TiO2/FTO substrate by a consecutive two-step spin-coating process at 1,000 and 5,000 r.p.m for 10 and 20 s, respectively.',
 'During the second spin-coating step, the substrate (around 1 cm × 1 cm) was treated with toluene drop-casting.',
 'The substrate was dried on a hot plate at 100 °C for 10 min.',
 'A solution of poly(triarylamine) (15 mg, PTAA, EM Index, Mw\xa0 = \xa017,500 g mol−1) in toluene (1.5 ml) was mixed with 15 μl of a solution of lithium bistrifluoromethan