In [6]:
def split_into_clauses(text):
    doc = nlp(text)
    clauses = []
    current_clause = []

    for token in doc:
        current_clause.append(token)
        # Check for coordinating conjunctions (e.g., "and", "but", "or") or subordinating conjunctions
        if token.dep_ == 'cc' or (token.pos_ == 'SCONJ'):
            # Remove the conjunction from the current clause
            current_clause = current_clause[:-1]
            clause_text = ' '.join([t.text for t in current_clause]).strip()
            if clause_text:
                clauses.append(clause_text)
            current_clause = [token]  # Start a new clause with the conjunction if needed

    # Add the last clause
    if current_clause:
        clause_text = ' '.join([t.text for t in current_clause]).strip()
        if clause_text:
            clauses.append(clause_text)

    return clauses
text = ("In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints "
        "causing the release of a hormone that in turn causes pain and swelling.")

clauses = split_into_clauses(text)
for idx, clause in enumerate(clauses, 1):
    print(f"Clause {idx}: {clause}")


Clause 1: In rheumatoid arthritis , the body 's immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain
Clause 2: and swelling .


In [13]:
import spacy
import benepar

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Add Benepar to the spaCy pipeline
if 'benepar' not in nlp.pipe_names:
    nlp.add_pipe('benepar', config={'model': 'benepar_en3'})


  state_dict = torch.load(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
def split_into_clauses_constituency(text):
    doc = nlp(text)
    clauses = []

    for sent in doc.sents:
        if not sent._.constituency:
            clauses.append(sent.text.strip())
            continue

        tree = sent._.constituency

        # Function to recursively extract clauses
        def extract_clauses(tree):
            for subtree in tree.subtrees():
                if subtree.label() in ('S', 'SBAR', 'SINV', 'SQ'):
                    clause = ' '.join([leaf[0] for leaf in subtree.leaves()])
                    clause = clause.strip()
                    if clause and clause not in clauses:
                        clauses.append(clause)

        extract_clauses(tree)

    return clauses


In [18]:
import spacy
import benepar

# Step 1: Download Benepar model (Run only once)
# Uncomment the following line if you haven't downloaded the model yet
benepar.download('benepar_en3')

# Step 2: Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Step 3: Add Benepar to the spaCy pipeline
if 'benepar' not in nlp.pipe_names:
    nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

# Verify that 'benepar' is in the pipeline
print("Pipeline components:", nlp.pipe_names)

# Step 4: Define the clause splitting function
def split_into_clauses_constituency(text):
    doc = nlp(text)
    clauses = []

    for sent in doc.sents:
        if not sent._.constituency:
            clauses.append(sent.text.strip())
            continue

        tree = sent._.constituency

        # Function to recursively extract clauses
        def extract_clauses(tree):
            for subtree in tree.subtrees():
                if subtree.label() in ('S', 'SBAR', 'SINV', 'SQ'):
                    clause = ' '.join([leaf[0] for leaf in subtree.leaves()])
                    clause = clause.strip()
                    if clause and clause not in clauses:
                        clauses.append(clause)

        extract_clauses(tree)

    return clauses

# Step 5: Test the function
if __name__ == "__main__":
    text = ("In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints "
            "causing the release of a hormone that in turn causes pain and swelling.")

    clauses = split_into_clauses_constituency(text)
    for idx, clause in enumerate(clauses, 1):
        print(f"Clause {idx}: {clause}")


[nltk_data] Downloading package benepar_en3 to /Users/log/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'benepar']


AttributeError: [E046] Can't retrieve unregistered extension attribute 'constituency'. Did you forget to call the `set_extension` method?

In [20]:
import spacy

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

def split_into_clauses_dependency(text):
    doc = nlp(text)
    clauses = []
    current_clause = []

    for token in doc:
        current_clause.append(token.text)
        # Identify clause boundaries based on coordinating conjunctions and punctuation
        if token.dep_ == 'cc' and token.text.lower() in {'and', 'but', 'or', 'so', 'yet', 'for', 'nor'}:
            # Remove the conjunction from the current clause
            clause = ' '.join(current_clause[:-1]).strip()
            if clause:
                clauses.append(clause)
            # Start a new clause
            current_clause = [token.text]
        elif token.text in {',', ';', ':'}:
            clause = ' '.join(current_clause[:-1]).strip()
            if clause:
                clauses.append(clause)
            current_clause = []

    # Add any remaining clause
    if current_clause:
        clause = ' '.join(current_clause).strip()
        if clause:
            clauses.append(clause)

    return clauses

# Test the function
if __name__ == "__main__":
    text = ("In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints "
            "causing the release of a hormone that in turn causes pain and swelling.")

    clauses = split_into_clauses_dependency(text)
    for idx, clause in enumerate(clauses, 1):
        print(f"Clause {idx}: {clause}")


Clause 1: In rheumatoid arthritis
Clause 2: the body 's immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain
Clause 3: and swelling .


In [22]:
import spacy
import deplacy
en = spacy.load('en_core_web_sm')

text = "This all encompassing experience wore off for a moment and in that moment, my awareness came gasping to the surface of the hallucination and I was able to consider momentarily that I had killed myself by taking an outrageous dose of an online drug and this was the most pathetic death experience of all time."

doc = en(text)
#deplacy.render(doc)

seen = set() # keep track of covered words

chunks = []
for sent in doc.sents:
    heads = [cc for cc in sent.root.children if cc.dep_ == 'conj']

    for head in heads:
        words = [ww for ww in head.subtree]
        for word in words:
            seen.add(word)
        chunk = (' '.join([ww.text for ww in words]))
        chunks.append( (head.i, chunk) )

    unseen = [ww for ww in sent if ww not in seen]
    chunk = ' '.join([ww.text for ww in unseen])
    chunks.append( (sent.root.i, chunk) )

chunks = sorted(chunks, key=lambda x: x[0])

for ii, chunk in chunks:
    print(chunk)

This all encompassing experience wore off for a moment and in that moment , my awareness came gasping to the surface of the hallucination and
I was able to consider momentarily that I had killed myself by taking an outrageous dose of an online drug and this was the most pathetic death experience of all time .


In [23]:
import spacy
nlp = spacy.load('en_core_web_sm')
# tokenization
doc1 = nlp(u'''Online sales hit a record high.Jewelry sales increased 8.8% and electrics sales grew 10.7% over last year.
Online shopping sales made up 14.6% of total retail spending and 24.5% of all holiday season shopping happened on Cyber Monday. ''')
print([token.text for token in doc1])

['Online', 'sales', 'hit', 'a', 'record', 'high', '.', 'Jewelry', 'sales', 'increased', '8.8', '%', 'and', 'electrics', 'sales', 'grew', '10.7', '%', 'over', 'last', 'year', '.', '\n', 'Online', 'shopping', 'sales', 'made', 'up', '14.6', '%', 'of', 'total', 'retail', 'spending', 'and', '24.5', '%', 'of', 'all', 'holiday', 'season', 'shopping', 'happened', 'on', 'Cyber', 'Monday', '.']


In [26]:
doc1 = nlp(u'''In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain and swelling. This hormone is normally activated only in reaction to injury or infection. A new arthritis medication will contain a protein that inhibits the functioning of the hormone that causes pain and swelling in the joints.
''')
for sentence in doc1.sents:
  print(sentence)

In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain and swelling.
This hormone is normally activated only in reaction to injury or infection.
A new arthritis medication will contain a protein that inhibits the functioning of the hormone that causes pain and swelling in the joints.



In [None]:
from datasets import load_dataset

ds = load_dataset("tasksource/spartqa-mchoice")

In [35]:
import json
import sys
import traceback

input_file = '/Users/log/Github/textual_grounding/data/SPARTQA/test.jsonl'   # Your input file path
output_file = '/Users/log/Github/textual_grounding/data/SPARTQA/test2.jsonl' # Your output file path

try:
    with open(input_file, 'r', encoding='utf-8') as infile:
        print("Input file opened successfully.")
        lines = infile.readlines()
        if not lines:
            print("Input file is empty.")
        else:
            print(f"Number of lines in input file: {len(lines)}")
    with open(output_file, 'w', encoding='utf-8') as outfile:
        print("Output file opened successfully.")
        print("Processing data...")
        for idx_line, line in enumerate(lines):
            print(f"\nProcessing line {idx_line+1}:")
            print(f"Raw line content: {line.strip()}")
            try:
                data = json.loads(line)
                print("JSON parsed successfully.")
            except json.JSONDecodeError as e:
                print(f"JSON decoding failed: {e}")
                continue  # Skip to the next line

            story = data.get('story', '')
            candidate_answers = data.get('candidate_answers', [])

            # Append candidate answers to the story with numbering
            options = ''
            for idx, answer in enumerate(candidate_answers):
                options += f"\n{idx}: {answer}"
            print(f"Options appended to story:{options}")
            data['story'] = story + options

            # Write the modified data to the output file
            json.dump(data, outfile, ensure_ascii=False)
            outfile.write('\n')
        print("Data processing completed.")
except Exception as e:
    print("An error occurred:")
    traceback.print_exc()


Input file opened successfully.
Number of lines in input file: 3594
Output file opened successfully.
Processing data...

Processing line 1:
Raw line content: {"story":"We have three blocks. Lets call them A, B and C. Block B is below A. Block A is below C. Block A contains a medium yellow square. Block B has two medium blue squares. Medium blue square number one is touching the bottom edge of this block. Medium blue square number two is below a medium yellow square. Medium blue square number one is below the square which is below the medium yellow square. It is below the medium yellow square. Block C contains one medium black square. What is below the black thing? a medium yellow square that is in block A or a medium yellow square that is in block B?","answer":2,"candidate_answers":["medium yellow square  that is in block A","medium yellow square  that is in block B","both of them","none of them"]}
JSON parsed successfully.
Options appended to story:
0: medium yellow square  that is in

In [37]:
import json

input_file = '/Users/log/Github/textual_grounding/data/SPARTQA/test.jsonl'
output_file = '/Users/log/Github/textual_grounding/data/SPARTQA/test2.jsonl'

# Open the input file for reading and the output file for writing
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for idx, line in enumerate(infile, start=1):
        # Parse the JSON object from the line
        data = json.loads(line.strip())
        
        # Add the "id" attribute
        data["id"] = idx
        
        # Write the updated JSON object back as a line to the output file
        outfile.write(json.dumps(data) + '\n')

print(f"Updated file with 'id' attribute written to: {output_file}")


Updated file with 'id' attribute written to: /Users/log/Github/textual_grounding/data/SPARTQA/test2.jsonl


In [39]:
import spacy

def extract_concepts(text):
    # Load the English language model
    nlp = spacy.load("en_core_web_sm")
    
    # Process the text
    doc = nlp(text)
    
    # Extract named entities
    entities = [ent.text for ent in doc.ents]
    
    # Extract noun phrases
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    
    # Extract important words (nouns, verbs, adjectives)
    important_words = []
    
    # Combine and deduplicate the results
    concepts = list(set(entities + noun_phrases + important_words))
    
    return concepts

# Example usage
text = """In rheumatoid arthritis, the body's immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain and swelling. This hormone is normally activated only in reaction to injury or infection. A new arthritis medication will contain a protein that inhibits the functioning of the hormone that causes pain and swelling in the joints."""

extracted_concepts = extract_concepts(text)
print("Extracted concepts:")
for concept in extracted_concepts:
    print(f"- {concept}")

Extracted concepts:
- pain
- that
- reaction
- the hormone
- healthy cells
- This hormone
- A new arthritis medication
- a hormone
- injury
- the functioning
- infection
- turn
- the body's immune system misfunctions
- swelling
- the release
- the joints
- rheumatoid arthritis
- a protein
