In [2]:
# Import libraries
import docx
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import scispacy
nlp = spacy.load("en_core_sci_lg")
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Span, Token
from summa import summarizer
import matplotlib.pyplot as plt
import os
from sklearn.decomposition import PCA
import benepar
from benepar.spacy_plugin import BeneparComponent
nlp.add_pipe(BeneparComponent("benepar_en2"))

In [3]:
text = 'Amorphous oxide semiconductors (AOSs)—ternary or quaternary oxides of post‐transition metals such as In‐Sn‐O, Zn‐Sn‐O, or In‐Ga‐Zn‐O—have been known for a decade and have attracted a great deal of attention as they possess several technological advantages, including low‐temperature large‐area deposition, mechanical flexibility, smooth surfaces, and high carrier mobility that is an order of magnitude larger than that of amorphous silicon (a‐Si:H). Compared to their crystalline counterparts, the structure of AOSs is extremely sensitive to deposition conditions, stoichiometry, and composition, giving rise to a wide range of tunable optical and electrical properties. The large parameter space and the resulting complex deposition–structure–property relationships in AOSs make the currently available theoretical and experimental research data rather scattered and the design of new materials difficult. In this work, the key properties of several In‐based AOSs are studied as a function of cooling rates, oxygen stoichiometry, cation composition, or lattice strain. Based on a thorough comparison of the results of ab initio modeling, comprehensive structural analysis, accurate property calculations, and systematic experimental measurements, a four‐dimensional parameter space for AOSs is derived, serving as a solid foundation for property optimization in known AOSs and for design of next‐generation transparent amorphous semiconductors.'

In [4]:
doc = nlp("This is a sentence.")
displacy.render(doc, style="dep")

In [5]:
doc = nlp(text)
displacy.render(list(doc.sents)[0], style="dep")

In [168]:
doc = nlp(u"This is a sentence. It's never too late to do something.")
sent = list(doc.sents)[0]
print(sent._.parse_string)
#(S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))

(S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .))


In [150]:
doc = nlp(text)
sent = list(doc.sents)[3]
print(sent._.parse_string)

(S (PP (IN In) (NP (DT this) (NN work))) (, ,) (NP (NP (DT the) (JJ key) (NNS properties)) (PP (IN of) (NP (JJ several) (VBN In‐based) (NNS AOSs)))) (VP (VBP are) (VP (VBN studied) (PP (IN as) (NP (NP (DT a) (NN function)) (PP (IN of) (NP (NP (VBG cooling) (NNS rates)) (, ,) (NP (NN oxygen) (NN stoichiometry)) (, ,) (NP (NN cation) (NN composition)) (, ,) (CC or) (NP (NN lattice) (NN strain)))))))) (. .))


In [169]:
string = sent._.parse_string

In [170]:
def parenthetic_contents(string):
    """Generate parenthesized contents in string as pairs (level, contents)."""
    stack = []
    for i, c in enumerate(string):
        if c == '(':
            stack.append(i)
        elif c == ')' and stack:
            start = stack.pop()
            yield (len(stack), string[start + 1: i])

In [171]:
def return_top_np(parsed_sent):
    for element in parsed_sent:
        if 1 in element:
            if element[1][0:2]=='NP':
                return element[1]

In [172]:
parsed_sent = list(parenthetic_contents(string))

In [173]:
top_np_s = return_top_np(parsed_sent)
print((top_np_s))

NP (DT This)


In [174]:
# recursion funciton
def top_np(string):

# base case
    if string.count('NP') == 1:
        return string

    contents = parenthetic_contents(string)
    #print(list(contents))
    print('/n')
    for element in contents:
        if 0 in element:
            print(element[1])
            return top_np(element[1])


In [216]:
def get_words(string):
    parsed_string = list(parenthetic_contents(string))
    words = ''
    for node in parsed_string:
        words += ' ' + node[1].split(' ')[1]
    return words[1:]

In [217]:
top_np(top_np_s)

'NP (DT This)'

In [218]:
sent

This is a sentence.

## Get the main nodes of a paragraph
Here the subject of each setence is a node.

In [219]:
def get_nodes(text):
    '''
    Function that finds the main NP of each paragraph and
    returns them in a list to be used as nodes.
    
    inputs: string of paragraph text
    output: list of nodes
    '''
    
    # Create spacy doc
    doc = nlp(text)
    
    # Get sentences as list
    sentences = list(doc.sents)
    
    # Initiate list of nodes
    nodes = []
    
    for sentence in sentences:
        string = sentence._.parse_string
        parsed_sent = list(parenthetic_contents(string))
        top_np_s = return_top_np(parsed_sent)
        nodes.append(get_words(top_np(top_np_s)))
    
    return nodes

In [220]:
nodes = get_nodes(text)

/n
NP (NP (JJ Amorphous) (NN oxide) (NNS semiconductors)) (JJ (AOSs)—ternary) (CC or) (JJ quaternary) (NNS oxides)
/n
NP (JJ Amorphous) (NN oxide) (NNS semiconductors)
/n
NP (DT the) (NN structure)
/n
NP (DT The) (JJ large) (NN parameter) (NN space)
/n
NP (DT the) (JJ key) (NNS properties)
/n
NP (DT a) (JJ four‐dimensional) (NN parameter) (NN space)


In [221]:
nodes

[' Amorphous oxide semiconductors',
 ' the structure',
 ' The large parameter space',
 ' the key properties',
 ' a four‐dimensional parameter space']

### Manual parsing using parantheses 

In [8]:
# 0 sentence
(S 
 (NP 
  (NP 
   (NP (JJ Amorphous) (NN oxide) (NNS semiconductors)) 
   (JJ (AOSs)—ternary) 
   (CC or) 
   (JJ quaternary) 
   (NNS oxides)
  ) 
  (PP (IN of) (NP (NP (NN post‐transition) (NNS metals)) 
               (PP (JJ such) (IN as) 
                (NNP In‐Sn‐O) (, ,) 
                (NNP Zn‐Sn‐O) (, ,) 
                (CC or) 
                (NP (NNP In‐Ga‐Zn‐O)) (, —)
               )
              )
  )
 )
 
 (VP 
  (VP (VBP have) (VP (VBN been) (VP (VBN known) (PP (IN for) (NP (DT a) (NN decade)))))) 
  (CC and) 
  (VP 
   (VBP have) 
   (VP 
    (VBN attracted) 
    (NP (NP (DT a) (JJ great) (NN deal)) (PP (IN of) (NP (NN attention))))
    (SBAR 
     (IN as) 
     (S 
      (NP (PRP they)) 
      (VP (VBP possess) 
       (NP 
        (NP (JJ several) (JJ technological) (NNS advantages)) (, ,) 
        (PP 
         (VBG including) 
         (NP (NP (NN low‐temperature)) (NN large‐area) (NN deposition)) (, ,) 
         (NP (JJ mechanical) (NN flexibility)) (, ,) 
         (NP (JJ smooth) (NNS surfaces)) (, ,) 
         (CC and) 
         (NP 
          (NP (JJ high) (NN carrier) (NN mobility)) 
          (SBAR 
           (WHNP (WDT that)) 
           (S 
            (VP 
             (VBZ is) 
             (ADJP 
              (ADJP 
               (NP (DT an) (NN order)) 
               (PP (IN of) (NP (NN magnitude))) 
               (ADJP (JJR larger))
              ) 
              (PP 
               (IN than) 
               (NP 
                (NP (DT that)) 
                (PP 
                 (IN of) 
                 (NP 
                  (NP (JJ amorphous) (NN silicon)) 
                  (PRN 
                   (-LRB- -LRB-) 
                   (NP (JJ a‐Si) (. :) (NN H))
                   (-RRB- -RRB-)
                  )
                 )
                )
               )
              )
             )
            )
           )
          )
         )
        )
       )
      )
     )
    )
   )
  )
 ) 
 (. .)
)

SyntaxError: invalid syntax (<ipython-input-8-c1f7b964b557>, line 1)

In [None]:
# 1 sentence
(S 
 (PP (VBN Compared) (PP (TO to) (NP (PRP$ their) (JJ crystalline) (NNS counterparts)))) (, ,) 
 (NP (NP (DT the) (NN structure)) (PP (IN of) (NP (NNP AOSs)))) 
 (VP (VBZ is) 
  (ADJP (RB extremely) (JJ sensitive) 
               (PP (TO to) (NP 
                            (NP (NN deposition) (NNS conditions)) (, ,) 
                            (NP (NN stoichiometry)) (, ,) (CC and) 
                            (NP (NN composition))))) (, ,) 
  (S (VP (VBG giving) (NP (NN rise)) 
      (PP (TO to) (NP (NP (DT a) (JJ wide) (NN range)) 
                   (PP (IN of) 
                    (NP (JJ tunable) (ADJP (JJ optical) (CC and) (JJ electrical)) (NNS properties))
                   )
                  )
      )
     )
  )
 ) 
 (. .)
)

