In [9]:
from settings import load_settings
from text_utils import load_text_files
import pandas as pd
import os
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import AIMessage, HumanMessage
from pinecone import Pinecone

settings = load_settings()

In [12]:
text_data = load_text_files(os.path.join("../", settings.TEXT_DIR))
text_data['1985page54.txt'] = text_data['1985page54.txt'].replace("’", "'")

In [13]:
def clean_text(text):
    text_builder = []
    for line in text.split("\n"):
        line = line.lstrip()
        line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
        if line.strip() == "":
            continue
        if line.replace("-", "").strip() == "":
            text_builder.append(f"\n## {text_builder.pop()}")
            continue
        if line.startswith("#"):
            text_builder.append("")
        text_builder.append(line)
        
    return "\n".join(text_builder)

In [14]:
cleaned_text_data = {filename: clean_text(content) for filename, content in text_data.items()}

embed_model = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_amount=95, breakpoint_threshold_type="percentile")
naive_chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

chunked_text_data = []

for filename, content in cleaned_text_data.items():
    sentence_chunks = naive_chunker.split_text(content)
    
    # Step 2: Apply semantic chunking on these pre-chunks
    semantic_chunks = semantic_chunker.create_documents(sentence_chunks)
    
    # Store processed chunks
    chunked_text_data = [chunk.page_content for chunk in semantic_chunks]

In [15]:
chunked_text_data = list(set(chunked_text_data))

In [16]:
# Initialize LLM
llm = ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=settings.OPENAI_API_KEY)

TYPES = [
    "Legal Instrument",  # Acts, statutes, constitutions, regulations (e.g., "Finance Act 1985")
    "Government Body",  # Government agencies, departments, courts (e.g., "Treasury", "Revenue Service")
    "Tax Or Duty",  # Various taxes, duties, and levies (e.g., "Excise Duty", "Capital Gains Tax")
    "Legal Entity",  # Persons, corporations, trusts (e.g., "Taxpayer", "Corporation A")
    "Financial Term",  # Monetary values, rates, allowances (e.g., "£15.77", "10% Tax Rate")
    "Jurisdiction",  # Countries, states, provinces (e.g., "United Kingdom", "State of California")
    "Legal Principle",  # Doctrines, legal concepts (e.g., "Tax Avoidance", "Due Process")
    "Statutory Provision",  # Sections, schedules, articles (e.g., "Section 98(1)(6)", "Schedule 1")
    "Legal Status",  # Active, repealed, prospective, retrospective (e.g., "Repealed", "Effective")
    "Procedure Rule",  # Appeals, penalties, compliance (e.g., "Record-Keeping Requirements")
]

# Prompt template for generating a title
title_prompt = PromptTemplate.from_template("Generate a concise three-word title for the following text:\n\n{text}\n\nTitle:")
type_prompt = PromptTemplate.from_template(f"Select one category that the following text falls under: [{", ".join(TYPES)}]" + " \n\n{text}\n\nCategory:")

chunked_text_data = []

for filename, content in cleaned_text_data.items():
    sentence_chunks = naive_chunker.split_text(content)
    
    # Step 2: Apply semantic chunking
    semantic_chunks = semantic_chunker.create_documents(sentence_chunks)
    
    for chunk in semantic_chunks:
        chunk_text = chunk.page_content
        
        # Generate a three-word title
        title = llm.invoke(title_prompt.format(text=chunk_text)).content.strip()
        type = llm.invoke(type_prompt.format(text=chunk_text)).content.strip()
        
        # Store results as tuples (title, content)
        chunked_text_data.append((title, chunk_text, type))

  llm = ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=settings.OPENAI_API_KEY)


In [17]:
chunked_text_data

[('"Finance Act Overview"',
  "# Finance Act 1985\nAn Act to grant certain duties, to alter other duties, and to amend the law relating to the National Debt and the Public Revenue, and to make further provision in connection with Finance. [25th July 1985]\n[^X1] [^X2] Most Gracious Sovereign,We, Your Majesty's most dutiful and loyal subjects, the Commons of the United Kingdom in Parliament assembled, towards raising the necessary supplies to defray Your Majesty's public expenses, and making an addition to the public revenue, have freely and voluntarily resolved to give and grant unto your Majesty the several duties hereinafter mentioned; and do therefore most humbly beseech Your Majesty that it may be enacted, and be it enacted by the Queen's most Excellent Majesty, by and with the advice and consent of the Lords Spiritual and Temporal, and Commons, in this present Parliament assembled, and by the authority of the same, as follows:--",
  'Statutory Provision'),
 ('"Legislative Provisio

In [18]:
def vectorise(docs, titles, types, settings):
    embeddings = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)
    # vectorize the documents
    vectorized_docs = [embeddings.embed_query(x) for x in docs]
    vectorized_docs = pd.DataFrame(vectorized_docs)

    docs_ = pd.DataFrame(docs, columns=["text"])
    num_embedding_dims = vectorized_docs.shape[1]
    vectorized_docs.columns = [f"e{i}" for i in range(num_embedding_dims)]
    output = pd.concat([docs_, vectorized_docs], axis=1)
    output.insert(0, 'title', titles)
    output.insert(1, 'type', types)

    return output

In [19]:
titles, texts, types = zip(*chunked_text_data)
docs = list(texts)
titles = list(titles)
valid_types = list(map(str.upper, TYPES))
types = [t.upper() if t.upper() in valid_types else t for t in types]
for i, t in enumerate(types):
    if t.upper() in valid_types:
        types[i] = t.upper()
    else:
        for type in valid_types:
            if type in t.upper():
                types[i] = type

docs, titles, types

(["# Finance Act 1985\nAn Act to grant certain duties, to alter other duties, and to amend the law relating to the National Debt and the Public Revenue, and to make further provision in connection with Finance. [25th July 1985]\n[^X1] [^X2] Most Gracious Sovereign,We, Your Majesty's most dutiful and loyal subjects, the Commons of the United Kingdom in Parliament assembled, towards raising the necessary supplies to defray Your Majesty's public expenses, and making an addition to the public revenue, have freely and voluntarily resolved to give and grant unto your Majesty the several duties hereinafter mentioned; and do therefore most humbly beseech Your Majesty that it may be enacted, and be it enacted by the Queen's most Excellent Majesty, by and with the advice and consent of the Lords Spiritual and Temporal, and Commons, in this present Parliament assembled, and by the authority of the same, as follows:--",
  '[^I1]: Act partly in force at Royal Assent, partly prospective, partly retr

In [20]:
output = vectorise(docs, titles, types, settings)

output.to_csv("output_semantic_chunk.csv", index=False)

In [21]:
def init_pinecone(vectorized_docs, settings):
    pc = Pinecone(api_key=settings.PINECONE_API_KEY)
    pinecone_index = pc.Index("irac")
    
    for i in range(len(vectorized_docs)):
        row = vectorized_docs.iloc[i]
        title = row["title"]
        chunk_text = row["text"]  # the chunk
        embedding  = row[3:].values.astype(float)  # everything else in the row
        type = row["type"]

        pinecone_index.upsert(
            vectors=[
                {
                    'id': str(i + 1),
                    'values': embedding.tolist(),  # MUST be a list of floats
                    'metadata': {
                        'source': 'legal-data',
                        'text': chunk_text,
                        'title': title,
                        'type': type
                    }
                }
            ]
        )

df = pd.read_csv("output_semantic_chunk.csv")
init_pinecone(df, settings)

In [22]:
cache = {}

def vectorise_BoW(BoW, settings):
    embeddings = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)
    vectorized_words = []

    for word in BoW:
        if word in cache:
            vectorized_words.append(cache[word])
        else:
            embedding = embeddings.embed_query(word)
            cache[word] = embedding
            vectorized_words.append(embedding)

    vectorized_words = pd.DataFrame(vectorized_words)
    docs_ = pd.DataFrame(BoW, columns=["word"])
    num_embedding_dims = vectorized_words.shape[1]
    vectorized_words.columns = [f"e{i}" for i in range(num_embedding_dims)]
    output = pd.concat([docs_, vectorized_words], axis=1)

    return output

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from torch import cosine_similarity
import torch
from sklearn.feature_extraction.text import CountVectorizer

data_path = 'output_semantic_chunk.csv'
df = pd.read_csv(data_path)
# Extract embeddings only (4th column onwards)
embeddings = df.iloc[:, 3:]

# Preprocess: Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(embeddings)

# Apply PCA
pca = PCA(n_components=3)  # Keep 3D projection
X_pca = pca.fit_transform(X_scaled)

# for each pca.components_ find the top 5 vectors that are most similar to it
top_indices = []
bow = {}
for i in range(3):
    pc = pca.components_[i]
    similarities = cosine_similarity(torch.from_numpy(pc.reshape(1, -1)), torch.from_numpy(X_scaled))
    top_5_indices = torch.topk(similarities, 5).indices.numpy().flatten()
    top_indices.extend(top_5_indices)
    print(f"Top 5 vectors for PCA component {i + 1}:")
    df.insert(i, f'similarity_PCA{i + 1}', similarities.flatten())
    print(df.iloc[top_5_indices, (i):  i + 2])

    # Extract the text of the top 5 vectors for each PCA component
    top_texts = list(set("".join(df.iloc[top_indices]['text'].values).split(" ")))
    top_texts = [re.sub(r'\W+', '', word).lower() for word in top_texts if word.isalpha()]
    top_texts = list(set(top_texts))
    bow[f'PCA{i + 1}'] = top_texts

Top 5 vectors for PCA component 1:
     similarity_PCA1                     title
139         0.448876            Tax Law Repeal
79          0.435369       "Repeal of Section"
146         0.434241  "Tax Legislation Update"
249         0.430677         Tax Law Amendment
111         0.430308    Tax Legislation Update
Top 5 vectors for PCA component 2:
     similarity_PCA2                           title
126         0.485201       Treasury Statutory Orders
127         0.461853          Unitary State Taxation
239         0.452283     Tax Postponement Remittance
121         0.446776         Tax Credit Restrictions
247         0.443719  Treasury Functions Transferred
Top 5 vectors for PCA component 3:
     similarity_PCA3                          title
140         0.623561          "Unfinished Business"
147         0.614900         "71 Unveiled Insights"
98          0.586259    "Uncharted Territory Ahead"
53          0.575670  "Numerical Sequence Analysis"
114         0.559582        "F35 Ov

In [24]:
bow_embedding = []
for bag in bow.values():
    out = vectorise_BoW(bag, settings)
    bow_embedding.append(out)

In [None]:
similar_words_df = pd.DataFrame()

for i, bag in enumerate(bow_embedding):
    pc = pca.components_[i]
    similarities = cosine_similarity(torch.from_numpy(pc.reshape(1, -1)), torch.from_numpy(bag.iloc[:, 1:].values))
    top_10_indices = torch.topk(similarities, 20).indices.numpy().flatten()
    
    similar_words = bag.iloc[top_10_indices]
    similar_words['similarity'] = similarities.flatten()[top_10_indices]
    similar_words['PCA_component'] = f'PCA{i + 1}'
    
    similar_words_df = pd.concat([similar_words_df, similar_words])

similar_words_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_words['similarity'] = similarities.flatten()[top_10_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_words['PCA_component'] = f'PCA{i + 1}'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_words['similarity'] = similarities.flatten()[top_10_indices]
A value is trying to

Unnamed: 0,word,e0,e1,e2,e3,e4,e5,e6,e7,e8,...,e1528,e1529,e1530,e1531,e1532,e1533,e1534,e1535,similarity,PCA_component
0,repealed,-0.039626,-0.002867,0.000836,-0.008338,-0.03435,0.005307,-0.016744,-0.022922,-0.02443,...,0.011878,-0.02443,-0.026422,0.036853,-0.014725,-0.01239,-0.006471,-0.016475,0.024947,PCA1
1,repealing,-0.04094,0.001469,0.002205,-0.001783,-0.035929,0.0113,-0.031133,-0.009083,-0.020188,...,0.023672,-0.013812,-0.040243,0.026056,0.002649,0.000305,-0.008239,-0.012191,6.4e-05,PCA1
2,amending,-0.023317,0.00045,-0.007666,-0.016049,-0.041446,0.023817,-0.029153,-0.011077,-0.011273,...,0.005076,-0.018845,-0.011314,0.026005,0.001677,-0.001374,-0.009429,-0.02175,-0.020923,PCA1
3,taxation,0.001577,-0.018252,-0.000265,-0.044466,-0.020863,-0.000886,-0.019118,-0.015098,-0.009527,...,0.029291,-0.019389,-0.028438,-0.002336,0.00751,0.002346,-0.000356,-0.02328,-0.022264,PCA1
4,corporation,-0.01598,-0.014203,-0.006352,-0.039837,-0.011219,-0.001872,-0.017903,-0.018884,0.008587,...,0.026881,-0.011776,-0.020661,-5.6e-05,0.02033,-0.008441,-0.01663,-0.010556,-0.027096,PCA1
5,tax,0.015503,-0.033749,0.010118,-0.048056,-0.004064,0.001498,-0.017453,-0.037783,0.002533,...,0.023544,-0.028075,-0.025951,0.008726,0.018421,0.004,-0.004666,-0.025332,-0.035655,PCA1
6,chargeable,-0.01114,-0.008063,-0.00729,-0.031088,-0.029159,-0.009311,-0.020692,-0.011544,-0.016466,...,0.032676,-0.015388,-0.011942,-0.00473,0.01131,-0.011417,0.002996,-0.006971,-0.046862,PCA1
7,capital,0.024596,-0.030651,0.006107,-0.049495,-0.01127,0.029079,-0.023133,0.003021,-0.001854,...,0.029741,-0.013657,-0.033852,-0.00101,0.012525,-0.013788,-0.003442,-0.018057,-0.054703,PCA1
8,chapter,0.010119,-0.006788,0.004986,-0.030657,-0.00073,0.010299,-0.015877,-0.016519,-0.010433,...,0.027687,-0.020745,-0.01387,0.004451,0.004391,-0.00331,0.001618,-0.003294,-0.057172,PCA1
9,income,0.005696,-0.047407,0.009457,-0.04104,-0.007573,0.011618,-0.027735,-0.012516,-0.013363,...,0.029969,-0.017526,-0.025544,0.011034,0.005031,-0.008266,-0.003575,-0.042617,-0.06196,PCA1


In [26]:
similar_words_df.to_csv("similar_words.csv", index=False)

In [29]:
data_path = 'output_semantic_chunk.csv'
df = pd.read_csv(data_path)

In [28]:
df = df.iloc[:, 6:]
df.head()

Unnamed: 0,e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,...,e1526,e1527,e1528,e1529,e1530,e1531,e1532,e1533,e1534,e1535
0,-0.0143,-0.010493,-0.009473,-0.03046,-0.00564,0.006295,-0.0139,-0.002836,-0.0242,-0.021322,...,0.006847,-0.006647,0.019863,-0.024574,-0.023671,0.003527,0.038204,0.001141,0.00979,-0.013655
1,-0.021201,-0.02436,-0.003171,-0.014536,-0.015058,0.007388,-0.017842,-0.01299,-0.021536,-0.024829,...,-0.008011,-0.003775,0.024026,-0.013974,0.001914,0.016999,0.015192,0.010775,0.003311,-0.012455
2,0.003974,-0.001056,0.018366,-0.035335,-0.006113,-0.012146,-0.008236,-0.013214,-0.009667,-0.027677,...,0.010997,-0.011145,0.032487,-0.013207,0.009949,-0.018998,-0.017883,0.01717,0.003046,0.00614
3,0.016915,-0.017172,0.000583,-0.030818,-0.035131,0.008824,-0.015952,-0.007962,-0.013212,-0.020034,...,0.018637,-0.020713,0.02036,-0.019763,0.010248,0.004591,0.003567,0.001724,0.025433,-0.01412
4,0.014282,-0.009683,0.009922,-0.020215,-0.015623,0.011535,-0.016871,-0.005751,-0.024383,-0.009398,...,-0.002232,-0.007559,0.014946,-0.02587,-0.012988,0.035732,-0.005097,-0.02798,0.00041,-0.018702


In [31]:
df.insert(3, 'PCA1', X_pca[:, 0])
df.insert(4, 'PCA2', X_pca[:, 1])
df.insert(5, 'PCA3', X_pca[:, 2])

df.head()

Unnamed: 0,title,type,text,PCA1,PCA2,PCA3,e0,e1,e2,e3,...,e1526,e1527,e1528,e1529,e1530,e1531,e1532,e1533,e1534,e1535
0,"""Finance Act Overview""",STATUTORY PROVISION,# Finance Act 1985\nAn Act to grant certain du...,10.238246,12.429076,-1.482664,-0.0143,-0.010493,-0.009473,-0.03046,...,0.006847,-0.006647,0.019863,-0.024574,-0.023671,0.003527,0.038204,0.001141,0.00979,-0.013655
1,"""Legislative Provisions Status""",STATUTORY PROVISION,"[^I1]: Act partly in force at Royal Assent, pa...",10.623093,3.081029,-7.925429,-0.021201,-0.02436,-0.003171,-0.014536,...,-0.008011,-0.003775,0.024026,-0.013974,0.001914,0.016999,0.015192,0.010775,0.003311,-0.012455
2,Timed Provisions Enacted,STATUTORY PROVISION,Some provisions came into force at specific ti...,3.781979,7.784157,-3.336591,0.003974,-0.001056,0.018366,-0.035335,...,0.010997,-0.011145,0.032487,-0.013207,0.009949,-0.018998,-0.017883,0.01717,0.003046,0.00614
3,Customs and Excise,STATUTORY PROVISION,"[^X1]: ss. 1-3 , 5-7 , 10 , 98(1)(6) , Schs. 1...",7.438898,1.029142,4.813829,0.016915,-0.017172,0.000583,-0.030818,...,0.018637,-0.020713,0.02036,-0.019763,0.010248,0.004591,0.003567,0.001724,0.025433,-0.01412
4,"""Traffic Regulations Overview""",STATUTORY PROVISION,"2, 27 Pt. II from Gp 107:2(Road Traffic), ss. ...",2.849653,-3.442231,7.192774,0.014282,-0.009683,0.009922,-0.020215,...,-0.002232,-0.007559,0.014946,-0.02587,-0.012988,0.035732,-0.005097,-0.02798,0.00041,-0.018702


In [1]:
df.to_csv("output.csv", index=False)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/kaede/Desktop/IRACwithRAG/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3549, in run_code
  File "/tmp/ipykernel_160486/100344739.py", line 1, in <module>
    df.to_csv("output.csv", index=False)
    ^^
NameError: name 'df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kaede/Desktop/IRACwithRAG/.venv/lib/python3.12/site-packages/pygments/styles/__init__.py", line 45, in get_style_by_name
ModuleNotFoundError: No module named 'pygments.styles.default'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kaede/Desktop/IRACwithRAG/.venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 2173, in showtraceback
  File "/home/kaede/Desktop/IRACwithRAG/.venv/lib/python3.12/site-packages/IPython/core/ultratb.py", line 1182, in structured_traceback
 