In [5]:
!pip install transformers
!pip install spacy
!pip install nltk



In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize SpaCy Matcher
matcher = Matcher(nlp.vocab)

# Define legal clause patterns for matching
clause_patterns = [
    [{"LOWER": "section"}, {"IS_DIGIT": True}],
    [{"LOWER": "clause"}, {"IS_DIGIT": True}],
    [{"LOWER": "article"}, {"IS_DIGIT": True}]
]

# Add clause patterns to Matcher
matcher.add("LEGAL_CLAUSE", clause_patterns)

# Load Transformers summarization pipeline
summarization_pipeline = pipeline("summarization")

def extract_legal_clauses(text):
    doc = nlp(text)
    matches = matcher(doc)
    legal_clauses = []
    for match_id, start, end in matches:
        legal_clause = Span(doc, start, end)
        legal_clauses.append(legal_clause)
    return legal_clauses

def generate_summary(text):
    # Split text into sentences
    sentences = sent_tokenize(text)
    # Concatenate sentences into one string
    input_text = " ".join(sentences[:200])  # Limit input to 200 tokens for BERT-based summarization
    # Generate summary
    summary = summarization_pipeline(input_text)[0]['summary_text']
    return summary

# Example usage
legal_document = """
Section 1: Definitions
1.1 "Agreement" means this legally binding document.
1.2 "Party" refers to the entities involved in this Agreement.
Clause 2: Rights and Obligations
2.1 Party A agrees to provide services.
2.2 Party B agrees to pay for the services provided by Party A.
"""

# Extract legal clauses from the document
clauses = extract_legal_clauses(legal_document)

# Process and summarize each legal clause
for clause in clauses:
    clause_text = clause.text
    summary = generate_summary(clause_text)
    print("Original Clause:", clause_text)
    print("Summary:", summary)
    print("=" * 50)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Your max_length is set to 142, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 142, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


Original Clause: Section 1
Summary:  Section 1 of Section 1. Section 1 is the first section of Section 2 . Section 2 is the second section of section 1 . Section 1 focuses on the development of Section 3 of Section 4. Section 2. Section 3 focuses on Section 4 of Section 5 . Section 5 is the only section that focuses on section 2 of Section 7 .
Original Clause: Clause 2
Summary:  Clause 2. Clause 2 . Clause 2 of Clause 1 . Clause 1. Clause 3. Clause 4. Clause 1: Clause 2 is Clause 3 . Clause 4: Clause 1 is Clause 2, Clause 2 and Clause 3 is Clause 4 . Clause 3: Clause 4 is Clause 1, Clause 4, Clause 3, Clause 5. Clause 6. Clause 5 is Clause 5 . Clause 6: Clause 3 of Clause 2
