In [22]:
import pandas as pd

In [4]:
!gdown 1k8Ud9xu5VmgtVMdpmNGwrfHOpmFs5c5b

Downloading...
From: https://drive.google.com/uc?id=1k8Ud9xu5VmgtVMdpmNGwrfHOpmFs5c5b
To: /kaggle/working/combined_data.tsv
100%|██████████████████████████████████████| 8.33M/8.33M [00:00<00:00, 63.0MB/s]


In [54]:
df = pd.read_csv('1combined_data.tsv', sep='\t')

In [55]:
import pandas as pd
import numpy as np
import regex
from multiprocessing import Pool, cpu_count
from tqdm.notebook import tqdm  # notebook-friendly

# Your cleaning function
def clean_content_fuzzy_regex(text, short_title=None):
    if pd.isna(text):
        return ""

    # text = text.replace("\n", " ").replace("\r", " ")
    # text = regex.sub(r'\s+', ' ', text)

    boilerplate_patterns = [
        r'PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA',
        r'Printed on the Orders? of Government',
        r'Printed at the Department of Government Printing, SRI LANKA',
        r'TO BE PURCHASED AT THE GOVT\. PUBLICATIONS BUREAU, COLOMBO',
        r'Price\s*:\s*\d+\s*cents',
        r'Postage\s*:\s*\d+\s*cents',
        r'Annual subscription of Bills and Laws of the Parliament.*?respect of the year following\.?'
    ]

    for pattern in boilerplate_patterns:
        text = regex.sub(f'({pattern}){{s<=2}}', '', text, flags=regex.IGNORECASE)

    text = regex.sub(r'(\w+)- (\w+)', r'\1\2', text)
    # text = regex.sub(r'\b\d+[-A-Za-z]*\s*\d+(,\d+)*\b', '', text)
    # text = regex.sub(r'\s+', ' ', text).strip()

    # Remove short_title if provided using fuzzy matching
    if short_title:
        pattern = regex.escape(short_title)
        text = regex.sub(f'({pattern}){{e<=2}}', '', text, flags=regex.IGNORECASE)

    return text

# Top-level function for multiprocessing (row by row)
def clean_row(args):
    text, short_title = args
    return clean_content_fuzzy_regex(text, short_title=short_title)

# Flattened multiprocessing with per-row tqdm
def parallel_clean(series, short_titles, n_jobs=None):
    if n_jobs is None:
        n_jobs = max(cpu_count() - 1, 1)
    
    results = []
    with Pool(n_jobs) as pool:
        for cleaned in tqdm(pool.imap(clean_row, zip(series, short_titles)), total=len(series)):
            results.append(cleaned)
    return pd.Series(results, index=series.index)

# Apply cleaning with short_title removal
df['content'] = parallel_clean(df['content'], df['short_title'])


  0%|          | 0/443 [00:00<?, ?it/s]

In [57]:
df['length'] = df['content'].apply(lambda x: len(x.split()))
df['chars'] = df['content'].apply(len)
df

Unnamed: 0,key,content,short_title,length,chars
0,1981-1-02,"C ACT, No. 2 OF 1981 [Certified on 29th Januar...",Presidential Elections (Special Provisions),2752,15695
1,1981-11-67,"ACT, No. 67 OF 1981 [Certified on 3rd Novembe...",Sri Lanka State Trading Corporations (Amendment),290,1643
2,1981-11-68,"ACT, No. 68 OF 1981 [Certified on 3rd Novemb...",Sir John Kotalawala Defence Academy,3928,22200
3,1981-11-69,"- - ACT, No 69 OE 1981 Certifled nn 12th Nove...",Turnover Tax,14276,80585
4,1981-11-71,"ACT, No. 71 OF 1981 [Certified on 18th Novemb...",Judicature (Amendment),1341,7574
...,...,...,...,...,...
438,2008-8-29,PARLIAMENT OF THE DEMOCRATIC SOCIALIST REPUBLI...,Malwathu Maha Vihariya Tibbatuwawe Sri Siddhar...,1863,11395
439,2008-8-30,"BUDDHIST CULTURAL CENTRE OF NEDIMALA,DEHIWALA...",Buddhist Culture Centre of Nedimala Dehiwela (...,1890,11688
440,2008-8-31,PARLIAMENT OFTHE DEMOCRATIC SOCIALIST REPUBLIC...,University of Vocational Technology,12878,77794
441,2008-9-32,"ACT, No.32 OF 2008 [Certified on 05th Septemb...",School Teachers Pension (Amendment),642,3958


In [58]:
df.describe()

Unnamed: 0,length,chars
count,443.0,443.0
mean,3114.293454,18009.051919
std,4876.739568,28171.043847
min,144.0,779.0
25%,942.0,5610.5
50%,1581.0,9557.0
75%,3019.0,17695.5
max,45267.0,265664.0


In [59]:
chunk_size = 1000
overlap = 200

chunks = []

for idx, row in df.iterrows():
    words = row['content'].split()
    start = 0
    chunk_id = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk_words = words[start:end]
        chunks.append({
            'key': row['key'],
            'short_title': row['short_title'],
            'chunk_id': chunk_id,
            'content': ' '.join(chunk_words),
            'length': len(chunk_words)
        })
        chunk_id += 1
        start += chunk_size - overlap  # move forward with overlap

# Create new DataFrame
df_chunks = pd.DataFrame(chunks)
df_chunks

Unnamed: 0,key,short_title,chunk_id,content,length
0,1981-1-02,Presidential Elections (Special Provisions),0,"C ACT, No. 2 OF 1981 [Certified on 29th Januar...",1000
1,1981-1-02,Presidential Elections (Special Provisions),1,"be visible and, having held up the ballot pape...",1000
2,1981-1-02,Presidential Elections (Special Provisions),2,declare such candidate elected to the office o...,1000
3,1981-1-02,Presidential Elections (Special Provisions),3,days from the date of such publication. 22. A ...,352
4,1981-11-67,Sri Lanka State Trading Corporations (Amendment),0,"ACT, No. 67 OF 1981 [Certified on 3rd November...",290
...,...,...,...,...,...
1930,2008-8-31,University of Vocational Technology,16,Institute of Technical Education of Sri Lanka ...,78
1931,2008-9-32,School Teachers Pension (Amendment),0,"ACT, No.32 OF 2008 [Certified on 05th Septembe...",642
1932,2008-9-33,Information and Communication Technology (Amen...,0,"7 Act, No. 33 of 2008 Annual subscription of E...",1000
1933,2008-9-33,Information and Communication Technology (Amen...,1,determined by the Cabinet of Ministers. Powers...,900


In [62]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure RecursiveCharacterTextSplitter
chunk_size_words = 1000
chunk_overlap_words = 200

# LangChain splits by characters, so approximate characters per word
approx_chars_per_word = 6  # adjust if your words are longer
chunk_size = chunk_size_words * approx_chars_per_word
chunk_overlap = chunk_overlap_words * approx_chars_per_word

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    # separators=["\n\n", "\n"]
)

# Split the documents
all_chunks = []

for idx, row in df.iterrows():
    chunks = text_splitter.split_text(row['content'])
    for chunk_id, chunk in enumerate(chunks):
        all_chunks.append({
            'key': row['key'],
            'short_title': row['short_title'],
            'chunk_id': chunk_id,
            'content': chunk,
            'length': len(chunk.split())  # word count
        })

df_chunks = pd.DataFrame(all_chunks)
df_chunks.to_csv("2 df_chunks.tsv", sep="\t", index=False)
df_chunks

Unnamed: 0,key,short_title,chunk_id,content,length
0,1981-1-02,Presidential Elections (Special Provisions),0,"C ACT, No. 2 OF 1981 [Certified on 29th Januar...",435
1,1981-1-02,Presidential Elections (Special Provisions),1,"Act, No. 2 of 1981 (2) A member who wishes to ...",777
2,1981-1-02,Presidential Elections (Special Provisions),2,"Act, No. 2 of 1981 Declaration 11. Where any c...",780
3,1981-1-02,Presidential Elections (Special Provisions),3,"Act, No. 2 of 1981 (a) that the offence of bri...",760
4,1981-11-67,Sri Lanka State Trading Corporations (Amendment),0,"ACT, No. 67 OF 1981 [Certified on 3rd November...",290
...,...,...,...,...,...
1913,2008-8-31,University of Vocational Technology,19,"4 Act, No. 31 of 2008 (b) exercise the powers ...",981
1914,2008-9-32,School Teachers Pension (Amendment),0,"ACT, No.32 OF 2008 [Certified on 05th Septembe...",642
1915,2008-9-33,Information and Communication Technology (Amen...,0,"7 Act, No. 33 of 2008\n Annual subscription of...",705
1916,2008-9-33,Information and Communication Technology (Amen...,1,"7 Act, No. 33 of 2008\n Annual subscription of...",899
