In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
!pip install groq
!pip install langchain
!pip install langchain_openai
!pip install langchain-community



In [13]:
import transformers
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

In [14]:
# Load the "darrow-ai/LegalLensNLI" dataset through `datasets.load_dataset`.
dataset_load = load_dataset("darrow-ai/LegalLensNLI")
# Bare expression: shows the loaded object as the cell output.
dataset_load

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'legal_act', 'label'],
        num_rows: 312
    })
})

In [15]:
# Pick the 'train' split out of the loaded dataset collection.
dataset = dataset_load['train']
# Bare expression: shows the selected split as the cell output.
dataset

Dataset({
    features: ['premise', 'hypothesis', 'legal_act', 'label'],
    num_rows: 312
})

In [16]:
def expand_dataset(dataset):
    """Attach augmentation-tracking metadata to every row of an NLI dataset.

    Each input row is copied with three extra fields: ``augment_method`` and
    ``from_id`` are both set to the string ``'None'``, and ``id`` is the row's
    zero-based position rendered as a string.

    Args:
        dataset: Iterable of mapping-like rows that provide the keys
            ``'premise'``, ``'hypothesis'``, ``'legal_act'`` and ``'label'``.

    Returns:
        A ``(exp_dataset, list_exp)`` tuple: the expanded rows wrapped with
        ``Dataset.from_pandas`` (via a pandas DataFrame), and the same rows as
        a plain list of dicts.
    """
    # enumerate() supplies the running index, so no manual counter is needed
    # and the builtin ``id`` is no longer shadowed by a local variable.
    list_exp = [
        {
            'premise': row['premise'],
            'hypothesis': row['hypothesis'],
            'legal_act': row['legal_act'],
            'label': row['label'],
            'augment_method': 'None',
            'id': str(row_id),
            'from_id': 'None',
        }
        for row_id, row in enumerate(dataset)
    ]
    exp_dataset = Dataset.from_pandas(pd.DataFrame(list_exp))
    return exp_dataset, list_exp

# Build the expanded dataset and keep the plain list of row dicts alongside it.
expanded_dataset, list_exp = expand_dataset(dataset)
# Bare expression: shows the first expanded record as the cell output.
expanded_dataset[0]

{'premise': "Consumers who used an ADP timeclock in Illinois between June 5, 2013, and Nov. 6, 2020, may be eligible to claim a $250 class action rebate as part of a settlement. ADP, a company that provides human resources tools and services, was sued for violating the Illinois Biometric Information Privacy Act (BIPA) by collecting individuals' biometric information without their consent. BIPA prohibits the collection of biometric information without permission and requires disclosure about its storage and destruction. ADP agreed to pay $25 million to settle the lawsuit and will also provide a written retention policy on its website regarding biometric information. Class Members can submit a Claim Form to participate in the settlement, and the deadline for submission is Feb. 8, 2021.",
 'hypothesis': 'Really been enjoying that ADP timeclock at work in Illinois, makes clocking in and out a breeze!',
 'legal_act': 'privacy',
 'label': 'Neutral',
 'augment_method': 'None',
 'id': '0',
 'f

# LLM

In [17]:
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [18]:
# Structured LLM output: paraphrases of one hypothesis/premise pair at two
# English proficiency levels (IELTS 8.5 and IELTS 6.5).
# NOTE(review): documented with comments instead of a class docstring because
# pydantic is expected to copy a model docstring into the generated schema,
# which would alter the format instructions sent to the LLM — confirm.
class Paraphrase(BaseModel):
    hypo_85_paraphrase: str = Field(
        description="The 8.5 IELTS version paraphrase of the hypothesis"
    )
    pre_85_paraphrase: str = Field(
        description="The 8.5 IELTS version paraphrase of the premise"
    )
    hypo_65_paraphrase: str = Field(
        description="The 6.5 IELTS version paraphrase of the hypothesis"
    )
    pre_65_paraphrase: str = Field(
        description="The 6.5 IELTS version paraphrase of the premise"
    )

    def to_json(self):
        """Return the four paraphrase fields as a plain dict keyed by field name."""
        field_names = (
            "hypo_85_paraphrase",
            "pre_85_paraphrase",
            "hypo_65_paraphrase",
            "pre_65_paraphrase",
        )
        return {name: getattr(self, name) for name in field_names}


In [19]:
import os

# Read the key from the environment so real credentials never have to be typed
# into (and saved with) the notebook; the original placeholder is kept as the
# fallback value when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'your-api-key')


class Paraphraser:
    """Paraphrase a hypothesis/premise pair at two English levels via an LLM.

    The prompt asks for an IELTS 8.5 and an IELTS 6.5 rewrite of both texts and
    the reply is parsed into a ``Paraphrase`` object.
    """

    PARAPHRASE_TEMPLATE = """
        I am doing a Natural Language Inference task and i need you to help me augment my training data for a richer dataset.
        Here is the hypothesis {hypothesis} and here is the premise {premise}
        Given a legal pair of hypothesis and premise. I need you to paraphrase them, both the hypothesis and premise each has 2 versions.
        One version is as if you have the English level of a person with IELTS 8.5.
        One version is as if you have the English level of a person with IELTS 6.5.
        Please read and paraphrase carefully so that it does not lose meaning. 
        
        
        {format_instructions}
    """

    def __init__(self):
        """Create the chat model client; the chain itself is built lazily."""
        self.model = ChatOpenAI(
            temperature=0,
            model='gpt-4o-mini',
            timeout= None,
            openai_api_key=OPENAI_API_KEY,
            streaming=False,
            verbose=True,
            model_kwargs={"seed": 42},
        )
        # Filled in on the first _get_chain() call and reused afterwards.
        self._chain = None
        self._prompt = None

    def _get_chain(self):
        """Return ``(chain, prompt)``, building both on first use.

        The chain pipes the prompt template into the chat model and then into
        a ``PydanticOutputParser`` for ``Paraphrase``. The parser's format
        instructions are injected into the template as a partial variable.
        """
        # getattr() keeps this safe for subclasses that override __init__
        # without initialising the cache attributes.
        if getattr(self, '_chain', None) is None:
            parser = PydanticOutputParser(pydantic_object = Paraphrase)
            self._prompt = PromptTemplate.from_template(
                self.PARAPHRASE_TEMPLATE,
                partial_variables={"format_instructions": parser.get_format_instructions()},
            )
            # Built once per instance instead of once per paraphrase() call.
            self._chain = self._prompt | self.model | parser

        return self._chain, self._prompt

    def paraphrase(self, hypothesis: str, premise: str) -> Paraphrase:
        """Invoke the chain for one pair and return the parsed result.

        Args:
            hypothesis: Text substituted for ``{hypothesis}`` in the template.
            premise: Text substituted for ``{premise}`` in the template.

        Returns:
            The output of the chain's final parser step.
        """
        chain, _ = self._get_chain()
        return chain.invoke({
            "hypothesis": hypothesis,
            "premise": premise
        })

In [None]:
import json
from tqdm import tqdm

# NOTE(review): `json` and `cache_file` are not used anywhere in this cell;
# presumably they were meant for caching paraphrases between runs — confirm
# or remove.
cache_file = 'he.json'

# One client is enough for the whole run; creating a new Paraphraser on every
# iteration only repeated the same setup work.
paraphraser = Paraphraser()

augmented = []
for item in tqdm(expanded_dataset):
    paraphrases = paraphraser.paraphrase(hypothesis = item['hypothesis'], premise = item['premise'])
    # Each source row yields two augmented rows, one per proficiency level.
    variants = (
        (paraphrases.pre_85_paraphrase, paraphrases.hypo_85_paraphrase, 'gpt4_8.5'),
        (paraphrases.pre_65_paraphrase, paraphrases.hypo_65_paraphrase, 'gpt4_6.5'),
    )
    for premise_text, hypothesis_text, method in variants:
        augmented.append({
            'premise': premise_text,
            'hypothesis': hypothesis_text,
            'legal_act': item['legal_act'],
            'label': item['label'],
            'augment_method': method,
            'id': 'None',
            'from_id': item['id']
        })

# Fixed: the class is `pd.DataFrame` (capital F), and `to_csv` is a DataFrame
# method rather than a pandas module function. The frame is no longer
# overwritten with the return value of the export call.
augmented_df = pd.DataFrame(augmented)
# The file name is kept exactly as originally written (including its spelling)
# in case other code reads it.
augmented_df.to_csv('augmnented.csv')