In [76]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import sys
import nltk
from nltk.tokenize import sent_tokenize
from captum.attr import IntegratedGradients, LayerIntegratedGradients
from captum.attr import visualization as viz
import re
from bs4 import BeautifulSoup

In [88]:
model_directory = "../../model/LegalBert"

model = AutoModelForSequenceClassification.from_pretrained(model_directory) # Don't need attentions
tokenizer = AutoTokenizer.from_pretrained(model_directory)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval() # Ensure in evaluation mode

loaded_data = np.load("../mlb.npz", allow_pickle=True)
loaded_classes = loaded_data["classes"]
mlb = MultiLabelBinarizer()
mlb.classes_ = loaded_classes
num_labels = len(mlb.classes_)

In [89]:
def remove_html_tags(text):
    """Removes HTML tags from text."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

def clean_special_chars(text):
    """
    Removes/replaces special characters and URLs from text.
    """
    # Remove URLs
    text = re.sub(r"http\S+", "", text)  # Removes URLs starting with "http" or "https"

    # Remove characters that are not alphanumeric, whitespace, or basic punctuation
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s.,!?;:'\"-]", "", text)
    
    return cleaned_text

def normalize_whitespace(text):
    """Normalizes whitespace in text."""
    cleaned_text = " ".join(text.split())
    return cleaned_text.strip()

def preprocess_text(text):
    """Applies all preprocessing steps to text."""
    text = remove_html_tags(text)
    text = clean_special_chars(text)
    text = normalize_whitespace(text)
    text = text.lower()  # Lowercasing
    return text


In [90]:
license_text=""" 
Alien Public License v1.0
 
1. DEFINITIONS
"Contribution" means:
 
a) in the case of the initial Contributor, the initial content Distributed under this Agreement, and
b) in the case of each subsequent Contributor:
i) changes to the Program, and
ii) additions to the Program;
where such changes and/or additions to the Program originate from and are Distributed by that particular Contributor. A Contribution "originates" from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include changes or additions to the Program that are not Modified Works.
 
"Contributor" means any person or entity that Distributes the Program.
 
"Licensed Patents" mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program.
 
"Program" means the Contributions Distributed in accordance with this Agreement.
 
"Recipient" means anyone who receives the Program under this Agreement or any Secondary License (as applicable), including Contributors.
 
"Derivative Works" shall mean any work, whether in Source Code or other form, that is based on (or derived from) the Program and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship.
 
"Modified Works" shall mean any work in Source Code or other form that results from an addition to, deletion from, or modification of the contents of the Program, including, for purposes of clarity any new file in Source Code form that contains any contents of the Program. Modified Works shall not include works that contain only declarations, interfaces, types, classes, structures, or files of the Program solely in each case in order to link to, bind by name, or subclass the Program or Modified Works thereof.
 
"Distribute" means the acts of a) distributing or b) making available in any manner that enables the transfer of a copy to a third party.
 
"Source Code" means the form of a Program preferred for making modifications, including but not limited to software source code, documentation source, and configuration files.
 
"Secondary License" means either the GNU General Public License, Version 2.0, or any later versions of that license, or the Affero General Public License, Version 3.0, or any later versions of that license including any exceptions or additional permissions as identified by the initial Contributor.
 
2. GRANT OF RIGHTS
a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, Distribute and sublicense the Contribution of such Contributor, if any, and such Derivative Works.
b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in Source Code or other form. This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The licenses granted in this Section are effective on the date the Contributor first Distributes the Program.
c) Notwithstanding Section 2(b) above, no patent license is granted: 1) for code that is deleted from the Program; 2) separate from the Program; or 3) for infringements caused by: i) the modification of the Program or ii) the combination of the Program with other software or devices.
d) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to Distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program.
e) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement.
f) Notwithstanding the terms of any Secondary License, no Contributor makes additional grants to any Recipient (other than those set forth in this Agreement) as a result of such Recipient's receipt of the Program under the terms of a Secondary License.
3. REQUIREMENTS
3.1 If a Contributor Distributes the Program in any form, then:
a) the Program must also be made available as Source Code, in accordance with section 3.2, and the Contributor must accompany the Program with a statement that the Source Code for the Program is available under this Agreement, and informs Recipients how to obtain it in a reasonable manner on or through a medium customarily used for software exchange; and
b) the source code of any accompanying software used with the Program must also be made available under this Agreement along with information to Recipients on how to obtain the same; and
c) there must not be any further restrictions imposed on Recipient’s use of the Program.
3.2 When the Program is Distributed as Source Code:
a) it must be made available under this Agreement,
b) a copy of this Agreement must be included with each copy of the Program.
3.3 If the Program is combined with other material in a separate file or files made available under a Secondary License, and the initial Contributor attached the notice described in Exhibit A of this Agreement to the Source Code, then the Program may be made available under the terms of such Secondary Licenses.
3.4 If identifiable sections are not derived from the Program and can be reasonably considered independent and separate works in themselves, then this Agreement, and its terms, do not apply to those sections when Distributed as separate works.
3.5 Contributors may not remove or alter any copyright, patent, trademark, attribution notices, disclaimers of warranty, or limitations of liability ("notices") contained within the Program from any copy of the Program which they Distribute, provided that Contributors may add their own appropriate notices.
4. COMMERCIAL DISTRIBUTION
Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like. While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense.
 
For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages.
 
5. NO WARRANTY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations.
 
6. DISCLAIMER OF LIABILITY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 
7. GENERAL
This Agreement is to be construed according to the laws of the State of California and you consent to personal jurisdiction in the State of California in the event it is necessary to enforce the provisions of this License.
If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
If Recipient institutes patent litigation(i) against Contributor with respect to a patent applicable to software or (ii) against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed.
All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive.
Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. No one other than the Agreement Steward has the right to modify this Agreement. The Program (including Contributions) may always be Distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to Distribute the Program (including its Contributions) under the new version.
 
Exhibit A - Form of Secondary Licenses Notice
"This Source Code may also be made available under the following Secondary Licenses when the conditions for such availability set forth in this license are satisfied: {name license(s), version(s), and exceptions or additional permissions here}."
 
Simply including a copy of this Agreement, including this Exhibit A is not sufficient to license the Source Code under Secondary Licenses.
 
If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE file in a relevant directory) where a recipient would be likely to look for such a notice.
 
You may add additional accurate notices of copyright ownership.
"""

In [91]:
def predictor(input_ids, attention_mask=None, token_type_ids=None):
    """
    Wrapper function for Captum compatibility.
    """
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    return torch.sigmoid(outputs.logits)

def construct_input_and_baseline(text):
    """
    Creates input_ids, attention_mask, token_type_ids, and baseline.
    Handles missing token_type_ids.
    """
    encoded_input = tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    if 'token_type_ids' in encoded_input:
        token_type_ids = encoded_input['token_type_ids'].to(device)
    else:
        token_type_ids = None

    baseline_ids = torch.full_like(input_ids, tokenizer.pad_token_id).to(device)
    baseline_ids[0, 0] = tokenizer.cls_token_id

    return input_ids, attention_mask, token_type_ids, baseline_ids

def summarize_attributions(attributions):
    """
    Sums attributions to get a single score per token.
    """
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

def get_token_to_sentence_mapping(text):
    encoded = tokenizer(text, return_offsets_mapping=True, truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    offsets = encoded["offset_mapping"][0].tolist()
    sentences = sent_tokenize(text)
    
    current_index = 0
    sentence_boundaries = []
    for sent in sentences:
        start = text.find(sent, current_index)
        end = start + len(sent)
        sentence_boundaries.append((start, end))
        current_index = end

    token_to_sentence = []
    for start, end in offsets:
        if start == end:
            token_to_sentence.append(-1)
        else:
            sent_idx = next((i for i, (s, e) in enumerate(sentence_boundaries) if s <= start < e), -1)
            token_to_sentence.append(sent_idx)
    return token_to_sentence, sentences

def get_top_sentences(text, attributions, predicted_labels, top_k=3):
    token_to_sentence, sentences = get_token_to_sentence_mapping(text)
    
    result_dict = {}
    # Assume attributions is a list of tensors, one per label.
    for label_index, label in enumerate(predicted_labels[0]):
        sentence_scores = []
        for sent_idx in range(len(sentences)):
            indices = [i for i, x in enumerate(token_to_sentence) if x == sent_idx]
            if indices:
                score = np.sum(attributions[label_index].cpu().numpy()[indices])
            else:
                score = 0.0
            sentence_scores.append((sentences[sent_idx], score))
        sentence_scores.sort(key=lambda x: x[1], reverse=True)
        result_dict[label] = [sentence for sentence, _ in sentence_scores[:top_k]]
    return result_dict

In [92]:
# --- Main Execution ---
input_ids, attention_mask, token_type_ids, baseline_ids = construct_input_and_baseline(license_text)

# Use LayerIntegratedGradients to attribute to the embeddings. This is generally
# preferred for text models.  We use the word embeddings layer.
lig = LayerIntegratedGradients(predictor, model.base_model.embeddings)

# Calculate attributions for each predicted label.
attributions = []
probabilities = predictor(input_ids, attention_mask, token_type_ids)
predictions = (probabilities > 0.5).int()
predicted_labels = mlb.inverse_transform(predictions.cpu().numpy())

In [93]:
print(f"Predicted labels: {predicted_labels}")



In [94]:
for label_index in range(len(predicted_labels[0])):
    attributions_label = lig.attribute(inputs=input_ids,
                                      baselines=baseline_ids,
                                      additional_forward_args=(attention_mask, token_type_ids),
                                      target=label_index,
                                      n_steps=100,  # Number of steps in the approximation. Increase for more accuracy.
                                      internal_batch_size=1 # To avoid any memory issues
                                      )
    attributions.append(summarize_attributions(attributions_label))

In [95]:
result = get_top_sentences(license_text, attributions, predicted_labels, top_k=3)

In [96]:
print(result)



In [97]:
for label, sentences in result.items():
    print(f"\nTop 3 Sentences associated with {label}:")
    for sentence in sentences:
        print(f"- {sentence}")


- Modified Works shall not include works that contain only declarations, interfaces, types, classes, structures, or files of the Program solely in each case in order to link to, bind by name, or subclass the Program or Modified Works thereof.
- 2.
- b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in Source Code or other form.

- "Secondary License" means either the GNU General Public License, Version 2.0, or any later versions of that license, or the Affero General Public License, Version 3.0, or any later versions of that license including any exceptions or additional permissions as identified by the initial Contributor.
- "Source Code" means the form of a Program preferred for making modifications, including but not limited to software source code, documentatio