# Constitutional Law Data Exploration

This notebook explores the constitutional law dataset and provides insights into the data structure and quality.

## Setup

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import numpy as np
from wordcloud import WordCloud
import re

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets with a "seaborn-v0_8"
# prefix; older releases only ship the plain "seaborn" name. An unknown style
# name raises OSError, so fall back instead of failing the whole setup cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

print("Data exploration setup complete!")

## Load Raw Data

Load and examine the raw case data:

In [None]:
def load_all_cases(data_dir):
    """Load every ``*.json`` case file found recursively under ``data_dir``.

    Each file must contain a single JSON object. Two keys are added to that
    object before it is collected:

    * ``source_file`` - path of the file the case was read from.
    * ``amendment_type`` - ``'First Amendment'`` when the path contains
      ``first_amendment``, otherwise ``'Fourth Amendment'``.

    Files are visited in sorted path order so repeated runs produce the same
    case order. A file that cannot be read or parsed, or whose top-level JSON
    value is not an object, is reported with ``print`` and skipped.

    Args:
        data_dir: Directory (``str`` or ``Path``) to search.

    Returns:
        list[dict]: One dict per successfully loaded case file.
    """
    cases = []
    data_path = Path(data_dir)

    # rglob yields entries in file-system order; sort for reproducibility.
    for json_file in sorted(data_path.rglob("*.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                case_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {json_file}: {e}")
            continue

        if not isinstance(case_data, dict):
            # The metadata keys below can only be attached to a JSON object.
            print(f"Error loading {json_file}: expected a JSON object, got {type(case_data).__name__}")
            continue

        source = str(json_file)
        case_data['source_file'] = source
        # NOTE(review): every file whose path lacks "first_amendment" is labelled
        # Fourth Amendment, including files from any other folder - confirm this
        # matches the layout of the raw data directory.
        case_data['amendment_type'] = 'First Amendment' if 'first_amendment' in source else 'Fourth Amendment'
        cases.append(case_data)

    return cases

# Load all cases
raw_cases = load_all_cases("../data/raw")
print(f"Loaded {len(raw_cases)} cases")

# The path is relative to the kernel's working directory. Make an empty load
# visible right here, otherwise it only shows up as a KeyError in a later cell.
if not raw_cases:
    print(f"WARNING: no case files were loaded from {Path('../data/raw').resolve()} - check the working directory.")

# Convert to DataFrame for easier analysis
df = pd.DataFrame(raw_cases)
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

## Data Overview

Get basic statistics about the dataset:

In [None]:
# How many cases fall under each amendment label
amendment_counts = df['amendment_type'].value_counts()
print("Amendment Distribution:")
print(amendment_counts)


def _word_count(value):
    """Return the number of whitespace-separated tokens in value (0 when missing)."""
    return len(str(value).split()) if pd.notna(value) else 0


# Word counts for each free-text field, stored as new columns on df
for text_col, length_col in (
    ('facts_of_the_case', 'facts_length'),
    ('question', 'question_length'),
    ('conclusion', 'conclusion_length'),
):
    df[length_col] = df[text_col].apply(_word_count)

print(f"\nText Length Statistics:")
print(f"Facts - Mean: {df['facts_length'].mean():.1f}, Median: {df['facts_length'].median():.1f}")
print(f"Questions - Mean: {df['question_length'].mean():.1f}, Median: {df['question_length'].median():.1f}")
print(f"Conclusions - Mean: {df['conclusion_length'].mean():.1f}, Median: {df['conclusion_length'].median():.1f}")

# One figure, two panels: amendment counts (left) and text lengths (right)
fig, (ax_counts, ax_lengths) = plt.subplots(1, 2, figsize=(10, 6))

amendment_counts.plot(kind='bar', ax=ax_counts)
ax_counts.set_title('Cases by Amendment Type')
ax_counts.set_ylabel('Number of Cases')
plt.setp(ax_counts.get_xticklabels(), rotation=45)

ax_lengths.hist(df['facts_length'], bins=20, alpha=0.7, label='Facts')
ax_lengths.hist(df['conclusion_length'], bins=20, alpha=0.7, label='Conclusions')
ax_lengths.set_xlabel('Word Count')
ax_lengths.set_ylabel('Frequency')
ax_lengths.set_title('Text Length Distribution')
ax_lengths.legend()

fig.tight_layout()
plt.show()

## Content Analysis

Analyze how the cases are distributed over time, using the year found in each case name:

In [None]:
# Extract years from case names
def extract_year(case_name):
    """Extract the first 19xx/20xx year that appears in a case name.

    A year is a run of exactly four digits starting with 19 or 20 that is not
    part of a longer digit sequence (so docket-style numbers are ignored).
    For a range such as ``1900-1940`` the start year is returned, because it
    is the first match.

    Args:
        case_name: Case name; any non-missing value is converted with ``str``.

    Returns:
        int | None: The year, or ``None`` when the name is missing or holds
        no such year.
    """
    if pd.isna(case_name):
        return None

    # The digit look-arounds stop "12019345" from being read as the year 2019.
    # Ranges need no separate handling: the first year of "1900-1940" is
    # already the first match of this pattern.
    year_match = re.search(r'(?<!\d)(?:19|20)\d{2}(?!\d)', str(case_name))
    if year_match is None:
        return None
    return int(year_match.group())

# extract_year returns None for names without a year. Coerce to a numeric
# column (NaN for missing) so the decade arithmetic below also works when
# no name, or only some names, contain a year.
extracted_years = pd.to_numeric(df['name'].apply(extract_year), errors='coerce')
df['year'] = extracted_years
year_counts = extracted_years.dropna().astype(int).value_counts().sort_index()

print("Cases by decade:")
df['decade'] = (extracted_years // 10) * 10
# Count on integer decades so labels read "1970" rather than "1970.0".
decade_counts = df['decade'].dropna().astype(int).value_counts().sort_index()
print(decade_counts.head(10))

# Plot cases by decade (pandas raises when asked to plot an empty series)
if decade_counts.empty:
    print("No years could be extracted from case names; skipping decade plot.")
else:
    plt.figure(figsize=(12, 6))
    decade_counts.plot(kind='bar')
    plt.title('Constitutional Law Cases by Decade')
    plt.xlabel('Decade')
    plt.ylabel('Number of Cases')
    plt.xticks(rotation=45)
    plt.show()

## Word Analysis

Analyze common terms and concepts:

In [None]:
# Combine all text for analysis.
# Missing facts/conclusions become empty strings and every case contributes
# " <facts> <conclusion>". The columns are converted once and joined in a
# single pass instead of growing a string row by row with iterrows().
facts_text = df['facts_of_the_case'].fillna("").astype(str)
conclusion_text = df['conclusion'].fillna("").astype(str)
all_text = "".join(
    f" {facts} {conclusion}"
    for facts, conclusion in zip(facts_text, conclusion_text)
)

# Clean text for analysis
def clean_text_for_analysis(text):
    """Normalise raw case text into lowercase, space-separated ASCII words.

    Steps, in order:

    1. HTML tags are replaced by a space, so words on either side of a tag
       (e.g. ``</p><p>``) stay separate.
    2. HTML entities are decoded (``&amp;`` -> ``&``, ``&nbsp;`` -> a
       non-breaking space) so entity names do not show up as words.
    3. The text is lowercased.
    4. Apostrophes are dropped (``court's`` -> ``courts``); every other
       character that is not a-z or whitespace becomes a space, so
       ``search-and-seizure`` yields three words.
    5. Whitespace runs collapse to one space and the ends are stripped.

    Args:
        text (str): Raw text, possibly containing HTML markup.

    Returns:
        str: The cleaned text.
    """
    import html  # local import: the notebook's import cell does not provide it

    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    text = text.lower()
    # Drop apostrophes without inserting a space so possessives stay one token.
    text = re.sub(r"['\u2019]", '', text)
    # Punctuation and digits become separators rather than vanishing.
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise the combined corpus and tally every token
cleaned_text = clean_text_for_analysis(all_text)
words = cleaned_text.split()
word_freq = Counter(words)

# Common English stop words to ignore (each word listed once)
stop_words = {
    'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'a', 'an', 'was', 'is', 'are', 'were', 'be', 'been', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    'must', 'can', 'shall', 'that', 'this', 'these', 'those', 'they', 'them',
    'their', 'there', 'where', 'when', 'what', 'who', 'which', 'how', 'why',
    'if', 'as', 'so', 'not', 'no', 'all', 'any', 'some', 'each', 'every',
    'other', 'another', 'such', 'than', 'only', 'own', 'same', 'few', 'more',
    'most', 'less', 'much', 'many', 'little', 'large', 'small', 'good', 'bad',
    'new', 'old', 'long', 'short', 'high', 'low', 'big', 'right', 'left', 'up',
    'down', 'out', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'both', 'too', 'very', 'just', 'now',
}

# Keep a token only if it is not a stop word and is longer than three characters
filtered_freq = {}
for token, occurrences in word_freq.items():
    if token in stop_words or len(token) <= 3:
        continue
    filtered_freq[token] = occurrences

# Twenty most frequent remaining words, highest count first
ranked_words = sorted(filtered_freq.items(), key=lambda item: item[1], reverse=True)
top_words = dict(ranked_words[:20])

print("Top 20 most common words:")
for word, count in top_words.items():
    print(f"{word}: {count}")

# Bar chart of the top words
words_list = list(top_words.keys())
counts_list = list(top_words.values())

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words_list, counts_list)
ax.set_title('Top 20 Most Common Words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Legal Concepts Analysis

Analyze constitutional and legal concepts:

In [None]:
# Vocabulary for each legal concept category
legal_concepts = {
    'First Amendment': [
        'speech', 'religion', 'press', 'assembly',
        'petition', 'expression', 'establishment', 'exercise',
    ],
    'Fourth Amendment': [
        'search', 'seizure', 'warrant', 'privacy',
        'probable', 'reasonable', 'unreasonable',
    ],
    'Court Actions': [
        'held', 'ruled', 'decided', 'concluded',
        'found', 'determined', 'affirmed', 'reversed',
    ],
    'Legal Standards': [
        'constitutional', 'unconstitutional', 'violated', 'protected',
        'prohibited', 'permitted', 'compelling', 'strict',
    ],
    'Parties': [
        'plaintiff', 'defendant', 'petitioner', 'respondent',
        'appellant', 'appellee', 'government', 'state',
    ],
}

# Total occurrences per category: sum of the filtered frequencies of its terms
# (terms absent from filtered_freq count as zero)
concept_counts = {
    category: sum(filtered_freq.get(term, 0) for term in terms)
    for category, terms in legal_concepts.items()
}

print("Legal Concept Frequencies:")
for category, count in concept_counts.items():
    print(f"{category}: {count}")

# Bar chart of the category totals
categories = list(concept_counts.keys())
counts = list(concept_counts.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts)
ax.set_title('Legal Concept Frequencies')
ax.set_xlabel('Concept Categories')
ax.set_ylabel('Total Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Data Quality Assessment

Check for data quality issues:

In [None]:
# Null counts per column
print("Missing Data Analysis:")
missing_data = df.isnull().sum()
print(missing_data)


def _is_blank(value):
    """Return True when value is missing or contains only whitespace."""
    if pd.notna(value):
        return len(str(value).strip()) == 0
    return True


# Fields that are missing or whitespace-only
print("\nEmpty String Analysis:")
empty_facts = df['facts_of_the_case'].apply(_is_blank).sum()
empty_questions = df['question'].apply(_is_blank).sum()
empty_conclusions = df['conclusion'].apply(_is_blank).sum()

print(f"Empty facts: {empty_facts}")
print(f"Empty questions: {empty_questions}")
print(f"Empty conclusions: {empty_conclusions}")

# Word-count outliers, using the length columns computed in the overview cell
print("\nText Length Issues:")
facts_words = df['facts_length']
conclusion_words = df['conclusion_length']
short_facts = facts_words.lt(20).sum()
long_facts = facts_words.gt(200).sum()
short_conclusions = conclusion_words.lt(20).sum()
long_conclusions = conclusion_words.gt(300).sum()

print(f"Very short facts (<20 words): {short_facts}")
print(f"Very long facts (>200 words): {long_facts}")
print(f"Very short conclusions (<20 words): {short_conclusions}")
print(f"Very long conclusions (>300 words): {long_conclusions}")

# Print up to three cases whose conclusion has fewer than 20 words
print("\nExamples of very short conclusions:")
short_conclusion_cases = df.loc[conclusion_words < 20]
for _, case in short_conclusion_cases.head(3).iterrows():
    print(f"Case: {case['name']}")
    print(f"Conclusion: {case['conclusion']}")
    print("---")

## Processed Data Analysis

Analyze the processed training data:

In [None]:
# Load processed data if available
processed_train_file = "../data/processed/train_cleaned.jsonl"
processed_val_file = "../data/processed/validation_cleaned.jsonl"


def _load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _token_count(value):
    """Return the number of whitespace-separated tokens in value (0 when missing)."""
    return len(str(value).split()) if pd.notna(value) else 0


if os.path.exists(processed_train_file):
    # Load processed training data
    train_data = _load_jsonl(processed_train_file)

    # Load processed validation data; report a missing file instead of
    # crashing after the training file was already found.
    if os.path.exists(processed_val_file):
        val_data = _load_jsonl(processed_val_file)
    else:
        val_data = []
        print(f"Validation file not found: {processed_val_file}")

    print(f"Processed training examples: {len(train_data)}")
    print(f"Processed validation examples: {len(val_data)}")

    if train_data:
        # Analyze instruction and response lengths
        # NOTE(review): assumes every record has 'instruction' and 'response'
        # keys - confirm against the preprocessing output.
        train_df = pd.DataFrame(train_data)
        train_df['instruction_length'] = train_df['instruction'].apply(_token_count)
        train_df['response_length'] = train_df['response'].apply(_token_count)

        print("\nProcessed Data Statistics:")
        print(f"Instruction length - Mean: {train_df['instruction_length'].mean():.1f}, Median: {train_df['instruction_length'].median():.1f}")
        print(f"Response length - Mean: {train_df['response_length'].mean():.1f}, Median: {train_df['response_length'].median():.1f}")

        # Plot processed data lengths
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.hist(train_df['instruction_length'], bins=20, alpha=0.7)
        plt.title('Instruction Length Distribution')
        plt.xlabel('Word Count')
        plt.ylabel('Frequency')

        plt.subplot(1, 2, 2)
        plt.hist(train_df['response_length'], bins=20, alpha=0.7)
        plt.title('Response Length Distribution')
        plt.xlabel('Word Count')
        plt.ylabel('Frequency')

        plt.tight_layout()
        plt.show()

        # Show example processed data
        print("\nExample processed training instance:")
        example = train_data[0]
        # 'name' is shown for context only, so tolerate records without it.
        print(f"Case: {example.get('name', 'N/A')}")
        print(f"Instruction: {example['instruction'][:200]}...")
        print(f"Response: {example['response'][:200]}...")
    else:
        print("Processed training file contains no examples; skipping analysis.")

else:
    print("Processed data not found. Run preprocessing first.")

## Summary

Data exploration summary and recommendations:

In [None]:
print("=== DATA EXPLORATION SUMMARY ===")
print(f"Total cases loaded: {len(df)}")
print(f"First Amendment cases: {amendment_counts.get('First Amendment', 0)}")
print(f"Fourth Amendment cases: {amendment_counts.get('Fourth Amendment', 0)}")

# Cases without an extracted year are missing values in df['year'], which makes
# the column non-integer. Report the span as whole years and state it explicitly
# when no year is available, instead of printing "1954.0" or "nan".
known_years = pd.to_numeric(df['year'], errors='coerce').dropna()
if known_years.empty:
    print("Time span: unknown (no years extracted)")
else:
    print(f"Time span: {int(known_years.min())} - {int(known_years.max())}")

print(f"Average facts length: {df['facts_length'].mean():.1f} words")
print(f"Average conclusion length: {df['conclusion_length'].mean():.1f} words")

# NOTE: the recommendations and next steps below are fixed text; they are not
# derived from the statistics computed in this notebook.
print("\n=== RECOMMENDATIONS ===")
print("1. Data quality is generally good with comprehensive case coverage")
print("2. Both amendment types are well represented")
print("3. Text lengths are appropriate for training")
print("4. Consider additional preprocessing for HTML tags and citations")
print("5. Monitor for potential overfitting on specific time periods")

print("\n=== NEXT STEPS ===")
print("1. Proceed with model training using the processed data")
print("2. Consider data augmentation if needed")
print("3. Evaluate model performance on both amendment types")
print("4. Monitor for bias towards specific legal concepts")

print("\nData exploration completed!")