In [1]:
# Fetch the dataset from Google Drive (addressed by file id) and store it as data.tsv.
!gdown https://drive.google.com/uc?id=1F_WDXotjwawHL2HO4FEAKungLN-rPGN6 -O data.tsv

Downloading...
From: https://drive.google.com/uc?id=1F_WDXotjwawHL2HO4FEAKungLN-rPGN6
To: /kaggle/working/data.tsv
100%|██████████████████████████████████████| 16.7M/16.7M [00:00<00:00, 62.8MB/s]


In [2]:
import pandas as pd
# Display settings: no cap on the number of columns shown and no truncation of cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore') 


In [3]:
# Load the tab-separated metadata table downloaded above.
df = pd.read_csv('data.tsv', sep='\t')

# Trim surrounding whitespace from every text cell; non-string cells
# (numbers, NaN) pass through untouched. This is done column by column with
# Series.map because DataFrame.applymap is deprecated since pandas 2.1, and
# the global warning filter would otherwise hide that notice.
df = df.apply(lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x))

# Parse the date column; values that cannot be parsed become NaT.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Sort by date descending
df = df.sort_values(by='date', ascending=False)

In [4]:
# df

In [5]:
# Count how many rows the table holds for each document type.
df['doc_type_name'].value_counts()

doc_type_name
extra-gazettes    34770
gazettes           6275
acts               1647
bills              1351
Name: count, dtype: int64

In [6]:
import os
import requests
from urllib.parse import urlparse

def raw_download(url, timeout=60):
    """Download one document into ``pdf/<folder>/`` unless it is already there.

    The local path is derived from the part of the URL path that follows
    ``/view/``: the first segment becomes the sub-folder and the remaining
    segments are joined with ``-`` to form the file name, e.g.
    ``.../view/acts/2021/10/24-2021_E.pdf`` -> ``pdf/acts/2021-10-24-2021_E.pdf``.

    The body is streamed to a ``.part`` file next to the target and renamed
    only after the transfer completes, so an interrupted download can never be
    mistaken for a finished one by the "already exists" check on a later run.

    Args:
        url: Source URL of the document.
        timeout: Seconds ``requests`` may wait for the connection and for each
            read from the server before giving up.

    Returns:
        None. The outcome is reported on stdout as ``[SKIPPED]``,
        ``[DOWNLOADED]`` or ``[ERROR]``. Errors are caught and printed rather
        than raised so that a batch of downloads keeps going.
    """
    part_path = None
    try:
        path = urlparse(url).path
        relative_path = path.split('/view/')[-1]  # e.g., "acts/2021/10/24-2021_E.pdf"
        parts = relative_path.strip('/').split('/')

        folder_name = parts[0]  # e.g., 'acts'
        filename = '-'.join(parts[1:])  # e.g., "2021-10-24-2021_E.pdf"

        # Without both pieces the target would be a directory, which the
        # existence check below would wrongly report as an existing download.
        if not folder_name or not filename:
            raise ValueError("cannot derive a folder and file name from the URL path")

        folder_path = os.path.join('pdf', folder_name)
        os.makedirs(folder_path, exist_ok=True)

        filepath = os.path.join(folder_path, filename)

        # Skip if file already exists
        if os.path.exists(filepath):
            print(f"[SKIPPED] Already exists: {filepath}")
            return

        part_path = filepath + '.part'
        # stream=True keeps the body out of memory; the context manager
        # releases the connection even when the transfer fails midway.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Publish the finished file under its final name in one step.
        os.replace(part_path, filepath)
        part_path = None

        print(f"[DOWNLOADED] {filepath}")

    except Exception as e:
        print(f"[ERROR] Failed to download {url} — {type(e).__name__}: {e}")

    finally:
        # Remove a leftover partial file (failed or interrupted transfer).
        if part_path is not None and os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                pass


In [7]:
import time
import pandas as pd

def download_range(doc_type='acts', start=0, end=9):
    """Download a slice of one document type, newest first, timing each file.

    Rows of the notebook-level ``df`` are kept when ``doc_type_name`` equals
    ``doc_type`` and ``source_url_en`` is neither missing nor blank. The
    survivors are ordered by ``date`` (descending) and re-indexed from zero;
    positions ``start`` up to, but not including, ``end`` are then passed to
    ``raw_download`` one by one.

    Args:
        doc_type: Value of ``doc_type_name`` to select.
        start: Zero-based position of the first row to download.
        end: Zero-based position at which to stop (exclusive).

    Returns:
        None. Per-file and total timings are printed.
    """
    # Build the row selection step by step.
    type_matches = df['doc_type_name'] == doc_type
    url_column = df['source_url_en']
    url_present = url_column.notna() & (url_column.str.strip() != '')

    newest_first = (
        df[type_matches & url_present]
        .sort_values(by='date', ascending=False)
        .reset_index(drop=True)
    )

    # Positional window requested by the caller.
    batch = newest_first.iloc[start:end]

    batch_began = time.time()

    for offset, link in enumerate(batch['source_url_en']):
        position = start + 1 + offset  # 1-based label shown in the log
        began = time.time()
        raw_download(link)
        elapsed = time.time() - began
        print(f"  [{position}] Download time: {elapsed:.2f} seconds")

    batch_elapsed = time.time() - batch_began
    print(f"\n✅ Total time for {len(batch)} downloads: {batch_elapsed:.2f} seconds")


In [8]:
# Download every act that has a source_url_en: `end` is far above the 1,647 acts
# counted earlier, so the slice simply runs to the last row.
download_range(doc_type='acts', start=0, end=99999)


[DOWNLOADED] pdf/acts/2025-6-07-2025_E.pdf
  [1] Download time: 4.13 seconds
[DOWNLOADED] pdf/acts/2025-6-08-2025_E.pdf
  [2] Download time: 3.28 seconds
[DOWNLOADED] pdf/acts/2025-5-06-2025_E.pdf
  [3] Download time: 1.65 seconds
[DOWNLOADED] pdf/acts/2025-4-05-2025_E.pdf
  [4] Download time: 4.37 seconds
[DOWNLOADED] pdf/acts/2025-4-04-2025_E.pdf
  [5] Download time: 3.76 seconds
[DOWNLOADED] pdf/acts/2025-3-03-2025_E.pdf
  [6] Download time: 3.51 seconds
[DOWNLOADED] pdf/acts/2025-3-02-2025_E.pdf
  [7] Download time: 1.14 seconds
[DOWNLOADED] pdf/acts/2025-2-01-2025_E.pdf
  [8] Download time: 11.80 seconds
[DOWNLOADED] pdf/acts/2024-6-32-2024_E.pdf
  [9] Download time: 4.28 seconds
[DOWNLOADED] pdf/acts/2024-6-33-2024_E.pdf
  [10] Download time: 5.28 seconds
[DOWNLOADED] pdf/acts/2024-6-30-2024_E.pdf
  [11] Download time: 2.78 seconds
[DOWNLOADED] pdf/acts/2024-6-29-2024_E.pdf
  [12] Download time: 2.19 seconds
[DOWNLOADED] pdf/acts/2024-5-28-2024_E.pdf
  [13] Download time: 0.66 se

In [9]:
# Download every bill that has a source_url_en: `end` is far above the 1,351 bills
# counted earlier, so the slice simply runs to the last row.
download_range(doc_type='bills', start=0, end=99999)


[DOWNLOADED] pdf/bills/2025-7-621-2025_E.pdf
  [1] Download time: 4.28 seconds
[DOWNLOADED] pdf/bills/2025-7-620-2025_E.pdf
  [2] Download time: 5.47 seconds
[ERROR] Failed to download https://documents.gov.lk/view/bills/2025/7/619-2025_E.pdf — HTTPError: 404 Client Error: Not Found for url: https://documents.gov.lk/view/bills/2025/7/619-2025_E.pdf
  [3] Download time: 0.33 seconds
[DOWNLOADED] pdf/bills/2025-7-618-2025_E.pdf
  [4] Download time: 2.18 seconds
[DOWNLOADED] pdf/bills/2025-7-617-2025_E.pdf
  [5] Download time: 3.82 seconds
[DOWNLOADED] pdf/bills/2025-7-616-2025_E.pdf
  [6] Download time: 7.77 seconds
[DOWNLOADED] pdf/bills/2025-6-611-2025_E.pdf
  [7] Download time: 3.42 seconds
[DOWNLOADED] pdf/bills/2025-6-607-2025_E.pdf
  [8] Download time: 3.49 seconds
[DOWNLOADED] pdf/bills/2025-6-615-2025_E.pdf
  [9] Download time: 3.64 seconds
[DOWNLOADED] pdf/bills/2025-6-614-2025_E.pdf
  [10] Download time: 3.06 seconds
[DOWNLOADED] pdf/bills/2025-6-612-2025_E.pdf
  [11] Download 

In [10]:
# Report the total on-disk size of each sub-folder under pdf/.
!du -sh pdf/*/

626M	pdf/acts/
161M	pdf/bills/


In [11]:
import zipfile
import os

def zip_folder(folder_path, output_zip):
    """Write every file under ``folder_path`` into a deflate-compressed zip.

    Entry names are stored relative to ``folder_path``, so the archive does
    not contain the folder's own name as a prefix. Files are added in sorted
    order so repeated runs list entries in the same order.

    If ``output_zip`` is a path that lies inside ``folder_path``, the archive
    itself is skipped; otherwise the half-written zip would be walked and
    added to itself.

    Args:
        folder_path: Directory whose files are archived (recursively).
        output_zip: Destination accepted by ``zipfile.ZipFile``; overwritten
            if it already exists.
    """
    # Resolve the archive location once; only meaningful for path-like targets.
    archive_real = None
    if isinstance(output_zip, (str, bytes, os.PathLike)):
        archive_real = os.path.realpath(output_zip)

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for file in sorted(files):
                full_path = os.path.join(root, file)
                if archive_real is not None and os.path.realpath(full_path) == archive_real:
                    continue  # never add the archive to itself
                rel_path = os.path.relpath(full_path, folder_path)
                zipf.write(full_path, arcname=rel_path)

# Bundle the whole pdf/ tree into pdf.zip; entry names are relative to pdf/.
zip_folder('pdf', 'pdf.zip')