In [1]:
import nltk
from nltk import sent_tokenize
from get_SAO_en import *
import json

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize below.
# When the package is already installed this only reports
# "already up-to-date" (see the recorded cell output).
nltk.download('punkt')

# Sample claim set kept as a module-level string. The `for claims in ...`
# loop further down rebinds the name `claims`, so this value is not what the
# pipeline processes.
# NOTE(review): in this export the literal is split over several physical
# lines, which a plain double-quoted string does not allow -- confirm against
# the original notebook.
claims = "1. A display substrate, comprising: a base substrate; a thin film transistor on the base substrate; and a light shielding layer on the base substrate, the light shielding layer comprising a first light shielding layer and a second light shielding layer that are stacked, wherein an orthographic projection of an active layer of the thin film transistor on the base substrate is within an orthogonal projection of the light shielding layer on the base substrate, and the second light shielding layer comprises nanoparticles capable of absorbing light in a specific wavelength range.\n2. The display substrate according to claim 1, wherein a material of the first light shielding layer comprises monocrystalline silicon, polycrystalline silicon or amorphous silicon.\n3. The display substrate according to claims 1 or 2, wherein a material of the second light shielding layer further comprises silicon nitride or silicon carbide.\n4. The display substrate according to any one of claims 1-3, wherein a thickness of the first light shielding layer ranges from 400 Å to 600 Å, and a thickness of the second light shielding layer ranges from 200 Å to 500 Å.\n5. The display substrate according to any one of claims 1-4, wherein the nanoparticles are nano silicon particles.\n6. The display substrate according to claim 5, wherein particle sizes of the nano silicon particles range from 3nm to 5nm.\n7. The display substrate according to any one of claims 1-6, wherein the light is blue  light, and a wavelength of the blue light ranges from 420nm to 480 nm.\n8. The display substrate according to any one of claims 1-7, wherein the first light shielding layer is on a side of the second light shielding layer that is away from the base substrate; or the second light shielding layer is on a side of the first light shielding layer that is away from the base substrate\n9. 
The display substrate according to any one of claims 1-8, wherein the thin film transistor comprises a thin film transistor of a top gate structure or a thin film transistor of a bottom gate structure.\n10. The display substrate according to claim 9, wherein in a case where the thin film transistor has the bottom gate structure, the light shielding layer is on a side of the active layer that is away from the base substrate; or in a case where the thin film transistor has the top gate structure, the light shielding layer is disposed between the base substrate and the active layer.\n11. A display device, comprising the display substrate according to any one of claims 1-10.\n12. A manufacture method of a display substrate, comprising: providing a base substrate; forming a thin film transistor on the base substrate; and forming a light shielding layer, which comprises a first light shielding layer and a second light shielding layer, on the base substrate, wherein an orthographic projection of an active layer of the thin film transistor on the base substrate is within an orthogonal projection of the light shielding layer on the base substrate, and the second light shielding layer comprises nanoparticles capable of absorbing light in a specific wavelength range.\n13. The manufacture method of a display substrate according to claim 12, wherein a method of forming the second light shielding layer comprises spiral wave plasma chemical vapor deposition.\n14. 
The manufacture method of a display substrate according to claims 12 or 13, wherein the nanoparticles are nano silicon particles, and forming the second light shielding layer comprises: forming a second light shielding layer film comprising the nano silicon particles through a reaction gas comprising at least nitrogen, silane and hydrogen, or through a reaction gas comprising at least nitrogen, methane, silane and hydrogen; and performing a patterning process on the second light shielding layer film to form the second light shielding layer comprising the nano silicon particles.\n15. The manufacture method of a display substrate according to claim 14, wherein in a case where the second light shielding layer film is formed through the reaction gas comprising at least nitrogen, methane, silane and hydrogen, process conditions of the spiral wave plasma chemical vapor deposition comprise: a temperature ranging from 650 degrees Celsius to 750 degrees Celsius, power ranging from 400 Watts to 600 Watts, low pressure of pressure being up to 1.33 Pa, and a magnetic induction intensity ranging from 90 Gs to 130 Gs.\n16. The manufacture method of a display substrate according to claim 15, wherein the process conditions of the spiral wave plasma chemical vapor deposition comprise: the temperature being 700 degrees Celsius; the pressure is 1.33 Pa, the power being 500 watts; the magnetic induction intensity being 110 Gs and a volume ratio of the hydrogen, methane and silane being 1:2:40.\n17. The manufacture method of a display substrate according to any one of claims 12-16, wherein the light is blue light, and a wavelength of the blue light ranges from 420nm to 480  nm.\n18. 
The manufacture method of a display substrate according to any one of claims 12-16, wherein forming the light shielding layer, which comprises the first light shielding layer and the second light shielding layer, on the base substrate comprises: forming the first light shielding layer on the base substrate; and forming the second light shielding layer on the first light shielding layer.\n19. The manufacture method of a display substrate according to claim 18, wherein the light shielding layer is formed synchronously with the active layer in the thin film transistor, and the method comprises: after a thin film of the light shielding layer and a thin film of the active layer are sequentially formed on the base substrate, using a same mask for the thin film of the light shielding layer and the thin film of the active layer to form the light shielding layer and the active layer."
# Prepositions that may join two neighbouring terms into one longer term
# (read by get_noun_chunks). A plain module-level assignment is already
# global, so no `global` statement is needed here.
preposition_list = [
    'about', 'above', 'across', 'after', 'against', 'along', 'among',
    'around', 'at', 'before', 'behind', 'between', 'beyond', 'but', 'by',
    'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'into',
    'like', 'near', 'of', 'off', 'on', 'onto', 'out', 'over', 'past',
    'plus', 'since', 'throughout', 'to', 'towards', 'under', 'until', 'up',
    'upon', 'with', 'within', 'without',
]

def extractTriplets(claims):
    """Extract SAO triplets from every sentence of ``claims``.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is passed to ``get_SAO_en``; all non-empty results are concatenated in
    sentence order.

    The previous implementation returned from inside the loop, so only the
    first sentence that produced triplets was ever used, and it returned
    ``None`` when no sentence produced any.

    Args:
        claims: Claim text, possibly containing many sentences.

    Returns:
        A list with the triplets of all sentences; empty when none is found.
    """
    triplets = []
    for sent in sent_tokenize(claims, language="english"):
        saos = get_SAO_en(sent)
        # get_SAO_en may yield None or an empty result for a sentence; skip those.
        if saos:
            triplets.extend(saos)
    return triplets
            
def get_noun_chunks(claims, model=nlp):
    """Collect candidate term spans from the noun chunks of ``claims``.

    For every noun chunk of the parsed text, leading determiners, stop words
    and comma tokens are stripped, then several variants are recorded: the
    chunk root, the chunk itself (unless it looks like a quantity/enumeration),
    the compound-modifier + root sub-span, and the comma-separated pieces of
    very long chunks. Finally, two consecutive recorded terms that are
    separated by exactly one preposition are also recorded as one joined span.

    Args:
        claims: Text to analyse.
        model: Callable that parses text into a document exposing
            ``noun_chunks`` (the module-level ``nlp`` by default). It is now
            actually used; previously the global ``nlp`` was always called and
            this argument was ignored.

    Returns:
        A list of spans of the parsed document. Duplicates are not removed.
    """
    res = []
    doc = model(claims)

    for chunk in doc.noun_chunks:
        # Strip determiners, stop words and comma tokens from the front.
        try:
            while (
                chunk[0].pos_ == "DET"
                or chunk[0].text in stopwords
                or chunk[0].lemma_ in stopwords
                or "," in chunk[0].text
            ):
                chunk = chunk[1:]
        except IndexError:
            # Every token was stripped: nothing left to record for this chunk.
            continue

        if len(chunk) == 1:
            if chunk[0].lemma_ not in stopwords:
                res.append(chunk)
            continue

        # The root is a single token; store it as a one-token span so that
        # every entry of `res` has the same type.
        root = chunk.root
        if root.lemma_ not in stopwords:
            res.append(doc[root.i: root.i + 1])

        # Keep the whole chunk unless it is a quantity ("plurality of", a
        # number), an example ("e.g.") or a long comma-separated enumeration.
        chunk_text = chunk.text
        has_number = any(
            t.replace('.', '', 1).isdigit() for t in chunk_text.split()
        )
        long_with_comma = len(chunk) > 8 and "," in chunk_text
        if not (
            "plurality of" in chunk_text.lower()
            or has_number
            or "e.g." in chunk_text
            or long_with_comma
        ):
            res.append(chunk)

        if len(chunk) > 3:
            # Sub-span running from the first compound modifier of the root
            # to the root itself (or the reverse, whichever order they occur).
            children_compound = [
                child.i for child in root.children if child.dep_ == "compound"
            ]
            if children_compound:
                idx = children_compound + [root.i]
                span = doc[min(idx): max(idx) + 1]

                if span != chunk:
                    res.append(span)

            # Split a very long noun chunk at its commas.
            if long_with_comma:
                last_index = chunk[-1].i
                start_index = chunk[0].i
                curr_index = start_index + 1
                while curr_index < last_index:
                    if doc[curr_index].text == ",":
                        res.append(doc[start_index: curr_index])
                        # Resume after the comma; the piece must have at
                        # least one token before the next comma is looked for.
                        start_index = curr_index + 1
                        curr_index = start_index + 1
                    else:
                        curr_index += 1
                res.append(doc[start_index: last_index + 1])

    # Join neighbouring terms of the form "<term> <preposition> <term>".
    # The pairs are snapshotted first because joined spans are appended to
    # `res` while the pairs are being examined.
    for curr_term, next_term in list(zip(res, res[1:])):
        try:
            if (
                doc[curr_term[-1].i + 1].text in preposition_list
                and next_term[0].i == curr_term[-1].i + 2
            ):
                res.append(doc[curr_term[0].i: next_term[-1].i + 1])
        except IndexError:
            # curr_term ends at the last token of the document.
            continue

    return res

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/merouanebe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def extractListOfClaims(file):
    """Read the claims sections of a raw text file.

    A section starts on the line after one containing ``_____c:`` (the marker
    line itself is not kept) and ends with the first empty line. The lines of
    a section are concatenated with every newline replaced by a space; the
    terminating empty line contributes one trailing space as well.

    Compared with the previous version:
      * an empty line met before any marker no longer raises
        ``UnboundLocalError``;
      * empty lines outside a section no longer re-append the previous
        section (which produced duplicate records);
      * a final section that is not followed by an empty line is returned
        instead of being silently dropped.

    Args:
        file: Path of the text file to read.

    Returns:
        The list of claims strings, in file order.
    """
    listOfClaims = []
    parts = None  # None while outside a claims section
    with open(file) as f:
        for line in f:
            if parts is not None:
                parts.append(line.replace("\n", " "))
            if "_____c:" in line:
                # A new section starts; anything gathered so far is discarded,
                # exactly as the original reset of `claims` did.
                parts = []
            if line == "\n" and parts is not None:
                listOfClaims.append("".join(parts))
                parts = None
    if parts:
        # File ended in the middle of a section: keep what was collected.
        listOfClaims.append("".join(parts))
    return listOfClaims
            


# Load the known verbs: iterating the parsed JSON yields its top-level
# entries (the keys, if the file holds an object).
with open('verb_rel.json') as f:
    data = json.load(f)

# Keep the verbs as a set so that `verb in listOfVerbs` is an exact-match
# test. The previous version concatenated them into one string, which made
# the test a substring search (e.g. "is" was found inside "comprises").
listOfVerbs = set(data)

# Number of occurrences of each triplet verb that is not in listOfVerbs.
verbsDict = {}

listOfClaims = extractListOfClaims('texts_raw_2018.txt')
for claims in listOfClaims:
    try:
        triplets = extractTriplets(claims)
        listOfTerms = get_noun_chunks(claims)
        # Distinct term texts, in order of first appearance.
        listTerms = list(dict.fromkeys(t.text for t in listOfTerms))

        print(claims)
        print("\n\n")
        print("-----------------------LISTE DES TERMES")
        print("\n\n")
        print(listTerms)
        print("\n\n")
        print("-----------------------LISTE DES TRIPLES")
        print("\n\n")

        if triplets is not None:
            for triplet in triplets:
                # Terms used as subject (index 0) or object (index 2) of a
                # triplet are taken out of the "unused terms" list.
                for position in (0, 2):
                    if triplet[position] in listTerms:
                        listTerms.remove(triplet[position])
                print(triplet)
                if triplet[1] not in listOfVerbs:
                    verbsDict[triplet[1]] = verbsDict.get(triplet[1], 0) + 1
    except IndexError:
        # Skip this claim set, including the summary printed below.
        continue

    print("\n\n")
    print("-----------------------LISTE DES TERMES NON UTILISES DANS LES TRIPLETS")
    print(listTerms)
    print("\n\n")
    print("\n\n")
    print("-----------------------AUTRES CLAIMS")


# Final report: one "verb: count" line per verb missing from listOfVerbs.
for key in verbsDict:
    value = verbsDict[key]
    print(f"{key}: {value}")

merouane
merouane
merouane
merouane
merouane
merouane
merouane
