# Sri Lankan Legal AI Dataset for Unsloth Gemma 3

Simple dataset preparation for fine-tuning Gemma 3 with Sri Lankan legal data.

## Dataset Sources:
1. `finetune_data.json` - Constitutional Law data
2. `finetune_penalCode.json` - Penal Code data  
3. `finetuneData_penalcode2.json` - Additional Penal Code data

In [1]:
# Install required packages
!pip install datasets huggingface_hub pandas python-dotenv -q

In [2]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
import random
from typing import List, Dict, Any
import os
from dotenv import load_dotenv

# Read the .env file into the process environment, then pick up the
# Hugging Face access token from it.
load_dotenv()
HF_TOKEN = os.environ.get('HF_TOKEN')

# A missing or empty token is reported here; the upload cell checks
# HF_TOKEN again before pushing anything.
print(
    "✅ Hugging Face token loaded from .env file"
    if HF_TOKEN
    else "❌ No HF_TOKEN found in .env file!"
)

✅ Hugging Face token loaded from .env file


## Load JSON Files

In [3]:
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load a list of entries from a UTF-8 encoded JSON file.

    Loading is best-effort: every failure is reported on stdout and an
    empty list is returned, so the caller can carry on with other files.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed top-level JSON array, or ``[]`` when the file cannot be
        read, does not contain valid JSON, or its top-level value is not
        an array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # malformed JSON (json.JSONDecodeError) and invalid UTF-8
        # (UnicodeDecodeError).
        print(f"❌ Error loading {file_path}: {e}")
        return []

    # Callers iterate over the result as a list of dicts; reject any other
    # top-level shape here instead of failing obscurely later on.
    if not isinstance(data, list):
        print(
            f"❌ Error loading {file_path}: expected a JSON array of entries, "
            f"got {type(data).__name__}"
        )
        return []

    print(f"✅ Loaded {len(data)} entries from {file_path}")
    return data

# Read the three source files; a file that fails to load yields an empty list.
constitutional_data = load_json_file('docs/finetune_data.json')
penal_code_data1 = load_json_file('docs/finetune_penalCode.json')
penal_code_data2 = load_json_file('docs/finetuneData_penalcode2.json')

# Report the per-source entry counts followed by the combined total.
print(
    "\n📊 Dataset Summary:",
    f"Constitutional Law: {len(constitutional_data)} entries",
    f"Penal Code 1: {len(penal_code_data1)} entries",
    f"Penal Code 2: {len(penal_code_data2)} entries",
    f"Total: {sum(map(len, (constitutional_data, penal_code_data1, penal_code_data2)))} entries",
    sep="\n",
)

✅ Loaded 601 entries from docs/finetune_data.json
✅ Loaded 890 entries from docs/finetune_penalCode.json
✅ Loaded 133 entries from docs/finetuneData_penalcode2.json

📊 Dataset Summary:
Constitutional Law: 601 entries
Penal Code 1: 890 entries
Penal Code 2: 133 entries
Total: 1624 entries


## Convert to Conversations Format

In [4]:
def generate_instruction_from_content(content: str) -> str:
    """Choose a canned user instruction that fits the given legal text.

    The text is lower-cased and searched for topic keywords. Rules are
    tried in order and the first one with a matching keyword wins; when no
    rule matches, a generic instruction is returned.
    """
    text = content.lower()

    # Ordered (keywords, instruction) pairs: earlier rules take precedence.
    rules = (
        (('freedom of speech',),
         "Explain the legal framework of freedom of speech in Sri Lankan law"),
        (('murder', 'homicide'),
         "Define and explain homicide and murder under Sri Lankan Penal Code"),
        (('theft',),
         "Explain the legal definition and punishment for theft"),
        (('public servant',),
         "Define public servant according to Sri Lankan Penal Code"),
        (('constitution', 'constitutional'),
         "Explain this constitutional law concept"),
    )

    for keywords, instruction in rules:
        if any(keyword in text for keyword in keywords):
            return instruction

    return "Explain this legal concept under Sri Lankan law"

In [5]:
def convert_to_conversations(data: List[Dict], category: str) -> List[Dict]:
    """Convert raw entries into two-turn conversations for Unsloth.

    Args:
        data: Raw entries. Each entry may carry ``'input'`` (the question),
            ``'output'`` (the answer) and ``'legalDoc'`` (the source cited
            in front of the answer).
        category: Label stored on every produced record.

    Returns:
        One ``{'conversations': [...], 'category': category}`` record per
        entry that has an ``'output'`` key; entries without one are skipped.
        ``'conversations'`` holds a user turn followed by an assistant turn.
    """
    conversations = []

    for item in data:
        # An entry without an answer cannot form a training pair.
        if 'output' not in item:
            continue

        # Use the provided question, or derive one from the answer text.
        if 'input' in item:
            user_message = item['input']
        else:
            user_message = generate_instruction_from_content(item['output'])

        # Cite the source document only when it is a non-blank string.
        # A null or non-string 'legalDoc' is treated as absent rather than
        # crashing on `.strip()`.
        legal_doc = item.get('legalDoc')
        if isinstance(legal_doc, str) and legal_doc.strip():
            assistant_message = f"According to {legal_doc}: {item['output']}"
        else:
            assistant_message = item['output']

        conversations.append({
            'conversations': [
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": assistant_message},
            ],
            'category': category,
        })

    return conversations

# Turn each raw source into conversation records tagged with its category;
# both Penal Code files share the 'penal_code' label.
constitutional_conversations = convert_to_conversations(
    constitutional_data, 'constitutional_law'
)
penal_conversations1 = convert_to_conversations(penal_code_data1, 'penal_code')
penal_conversations2 = convert_to_conversations(penal_code_data2, 'penal_code')

# Report how many conversations each source produced.
print(
    "Converted:",
    f"Constitutional: {len(constitutional_conversations)} conversations",
    f"Penal Code 1: {len(penal_conversations1)} conversations",
    f"Penal Code 2: {len(penal_conversations2)} conversations",
    sep="\n",
)

Converted:
Constitutional: 601 conversations
Penal Code 1: 890 conversations
Penal Code 2: 133 conversations


## Combine and Split Dataset

In [6]:
# Combine all data
all_conversations = constitutional_conversations + penal_conversations1 + penal_conversations2

# Shuffle with a fixed seed so the train/validation split is reproducible
random.seed(42)
random.shuffle(all_conversations)

# Split 80/20: the first 80% (rounded down) is training, the rest validation
split_idx = int(0.8 * len(all_conversations))
train_data = all_conversations[:split_idx]
val_data = all_conversations[split_idx:]

print(f"📊 Dataset created:")
print(f"Total: {len(all_conversations)} conversations")
print(f"Training: {len(train_data)} conversations")
print(f"Validation: {len(val_data)} conversations")

# Show a sample, truncated to the first 100 characters of each turn.
# The training split is empty when no entries were loaded (or only one was),
# so guard the lookup instead of failing with an IndexError.
if train_data:
    print(f"\n🔍 Sample conversation:")
    sample = train_data[0]
    print(f"User: {sample['conversations'][0]['content'][:100]}...")
    print(f"Assistant: {sample['conversations'][1]['content'][:100]}...")
else:
    print("\n⚠️ Training split is empty - no sample conversation to show.")

📊 Dataset created:
Total: 1624 conversations
Training: 1299 conversations
Validation: 325 conversations

🔍 Sample conversation:
User: What is the punishment for importing counterfeit coin?...
Assistant: According to Penal Code of Sri Lanka, Section 234 - Import or export of machine or instrument for co...


## Create Hugging Face Dataset

In [8]:
# Wrap each split's list of records in a Hugging Face Dataset
# (training split first, then validation).
train_dataset, val_dataset = (
    Dataset.from_list(records) for records in (train_data, val_data)
)

# Bundle both splits under the names they will be stored with.
dataset_dict = DatasetDict(
    {
        'train': train_dataset,
        'validation': val_dataset,
    }
)

print(
    "✅ Hugging Face dataset created!",
    f"Features: {train_dataset.features}",
    sep="\n",
)

✅ Hugging Face dataset created!
Features: {'conversations': List({'content': Value('string'), 'role': Value('string')}), 'category': Value('string')}


## Upload to Hugging Face

In [9]:
# Target repository on the Hugging Face Hub
DATASET_NAME = "Nishan726/sri-lankan-legal-conversations"

if not HF_TOKEN:
    # Nothing can be pushed without credentials.
    print("❌ No HF_TOKEN found. Add it to .env file first.")
else:
    try:
        print(f"🚀 Uploading to {DATASET_NAME}...")
        # private=False: the dataset repository is pushed as non-private.
        dataset_dict.push_to_hub(DATASET_NAME, token=HF_TOKEN, private=False)
        print(
            "✅ Upload successful!",
            f"🔗 Dataset: https://huggingface.co/datasets/{DATASET_NAME}",
            "\n📋 Use in Gemma 3 notebook:",
            f'dataset = load_dataset("{DATASET_NAME}", split="train")',
            sep="\n",
        )
    except Exception as upload_error:
        # Report the failure without aborting the notebook.
        print(f"❌ Upload failed: {upload_error}")

🚀 Uploading to Nishan726/sri-lankan-legal-conversations...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

✅ Upload successful!
🔗 Dataset: https://huggingface.co/datasets/Nishan726/sri-lankan-legal-conversations

📋 Use in Gemma 3 notebook:
dataset = load_dataset("Nishan726/sri-lankan-legal-conversations", split="train")


## Save Locally (Optional)

In [11]:
# Write both splits plus their sizes to a single JSON file in the
# current working directory.
with open('sri_lankan_legal_dataset.json', 'w', encoding='utf-8') as out_file:
    split_sizes = {
        'total': len(all_conversations),
        'train': len(train_data),
        'validation': len(val_data),
    }
    payload = {
        'train': train_data,
        'validation': val_data,
        'info': split_sizes,
    }
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \uXXXX escapes.
    json.dump(payload, out_file, indent=2, ensure_ascii=False)

print(
    "💾 Dataset saved locally as sri_lankan_legal_dataset.json",
    "\n🎯 Ready for Gemma 3 fine-tuning!",
    sep="\n",
)

💾 Dataset saved locally as sri_lankan_legal_dataset.json

🎯 Ready for Gemma 3 fine-tuning!
