In [1]:
import requests 
import PyPDF2
from io import BytesIO
import re

In [3]:
from chunk_scripts import *

In [4]:
# Sample transcript PDF used to exercise the parsing helpers below.
test_url = "https://nystateassembly.granicus.com/DocumentViewer.php?file=nystateassembly_3c8d9e7f365d22f77436d2add9220ecc.pdf&view=1"

# requests has no default timeout, so set one to keep the cell from hanging
# on a stalled connection; raise_for_status() stops here on a 4xx/5xx reply
# instead of passing an error page to the PDF reader.
test = requests.get(test_url, timeout=30)
test.raise_for_status()

# Wrap the response bytes in an in-memory buffer so the PDF is read without
# saving it to disk.
pdf = BytesIO(test.content)
file = PyPDF2.PdfReader(pdf)

In [5]:
# Join the extracted text of every page, terminating each page with a newline.
full_text = "".join(page.extract_text() + "\n" for page in file.pages)


# Chunking 

### test

In [None]:
# Compiled regular expressions used to parse the transcript text.
PATTERNS = {
    # Member speaking: group 1 is the title, group 2 the upper-case name and
    # group 3 the speech. The speech runs up to the next line that starts with
    # a title, "(" or "[" -- or to the end of the text, so the final turn of a
    # transcript is captured as well.
    'speaker': re.compile(
        r'^(MR\.|MRS\.|MS\.|ACTING SPEAKER|THE CLERK)\s+([A-Z\-\']+):\s+(.+?)'
        r'(?=^(?:MR\.|MRS\.|MS\.|ACTING SPEAKER|THE CLERK|\(|\[)|\Z)',
        re.MULTILINE | re.DOTALL
    ),

    # Bill information
    'bill_number': re.compile(r'(?:Assembly|Senate) No\.\s+([AS]\d{5}(?:-[A-Z])?)', re.IGNORECASE),
    'calendar_number': re.compile(r'Calendar No\.\s+(\d+)', re.IGNORECASE),
    'rules_report': re.compile(r'Rules Report No\.\s+(\d+)', re.IGNORECASE),

    # Session metadata
    'session_date': re.compile(r'^[\d]*([A-Z]+,\s+[A-Z]+\s+\d{1,2},\s+\d{4})', re.MULTILINE),
    # Page header followed by the page number; any upper-case month name is
    # accepted, not only JUNE.
    'page_number': re.compile(r'NYS ASSEMBLY\s+[A-Z]+ \d{1,2}, \d{4}\s*\n\s*(\d+)', re.MULTILINE),

    # Interaction patterns
    # group 1 is the named member, or None when "the sponsor" is used
    'yield_question': re.compile(
        r'Will\s+(?:the\s+sponsor|(?:Mr\.|Mrs\.|Ms\.)\s+([A-Z\-\']+))\s+yield',
        re.IGNORECASE
    ),
    'direct_address': re.compile(
        r'((?:Mr\.|Mrs\.|Ms\.)\s+[A-Z\-\']+)',
        re.IGNORECASE
    ),
    'thank_response': re.compile(
        r'Thank you,?\s+((?:Mr\.|Mrs\.|Ms\.)\s+[A-Z\-\']+)',
        re.IGNORECASE
    ),

    # Questions (for counting)
    'question_mark': re.compile(r'\?'),

    # Procedural
    'motion': re.compile(r'\bmove\b', re.IGNORECASE),
    'call_committee': re.compile(r'call.*?committee', re.IGNORECASE),
}



In [7]:
# Look for the session date line in the extracted transcript text.
date_match = PATTERNS['session_date'].search(full_text)

# Always bind session_date: without a match it is None rather than undefined
# (or a stale value left over from an earlier run of this cell).
session_date = date_match.group(1) if date_match else None

session_date

'WEDNESDAY, JUNE 11, 2025'

In [8]:
# Walk every speaker turn in the transcript, collecting the cleaned speech
# per speaker and the list of speaker labels to use as known member names.
speaker_text = []
all_member_names = []

for match in PATTERNS['speaker'].finditer(full_text):
    title, name, content = match.groups()
    normalized_name = f"{title} {name}"

    speaker_text.append({normalized_name: clean_speech_text(content)})

    # Clerk turns stay in speaker_text but are left out of the name list.
    if title.startswith("THE CLERK"):
        continue
    all_member_names.append(normalized_name)



In [18]:
# Spot-check the first parsed speaker turn.
speaker_text[0]

{'ACTING SPEAKER HUNTER': 'The House will \ncome to order. \nGood morning, colleagues. \nIn the absence of clergy, let us pause for a moment of \nsilence.'}

In [15]:
# Run interaction extraction over the parsed turns, passing the collected
# speaker labels as the known member names.
check = extract_interactions(speaker_text, all_member_names)

# Bare name as the last expression so the notebook displays the result.
check

[{'from_member': 'MRS. PEOPLES-STOKES',
  'to_member': 'MS. ROZIC',
  'interaction_type': 'address',
  'text_snippet': 'Ms. Rozic',
  'position': 3},
 {'from_member': 'MR. JENSEN',
  'to_member': 'MS. ROZIC',
  'interaction_type': 'address',
  'text_snippet': 'Ms. Rozic',
  'position': 31},
 {'from_member': 'MR. JENSEN',
  'to_member': 'MS. ROZIC',
  'interaction_type': 'address',
  'text_snippet': 'MS. ROZIC',
  'position': 57},
 {'from_member': 'MR. JENSEN',
  'to_member': 'MS. ROZIC',
  'interaction_type': 'address',
  'text_snippet': 'MS. ROZIC',
  'position': 66},
 {'from_member': 'MS. ROZIC',
  'to_member': 'MR. JENSEN',
  'interaction_type': 'address',
  'text_snippet': 'MR. JENSEN',
  'position': 70},
 {'from_member': 'MR. ANGELINO',
  'to_member': 'MS. WOERNER',
  'interaction_type': 'address',
  'text_snippet': 'MS. WOERNER',
  'position': 113},
 {'from_member': 'MR. ANGELINO',
  'to_member': 'MS. WOERNER',
  'interaction_type': 'address',
  'text_snippet': 'MS. WOERNER',
  '

### scripts 

In [14]:
# Compiled regular expressions shared by the transcript-parsing helpers.
PATTERNS = {
    # Member speaking: group 1 is the title, group 2 the upper-case name and
    # group 3 the speech. The speech runs up to the next line that starts with
    # a title, "(" or "[" -- or to the end of the text, so the final turn of a
    # transcript is captured as well.
    'speaker': re.compile(
        r'^(MR\.|MRS\.|MS\.|ACTING SPEAKER|THE CLERK)\s+([A-Z\-\']+):\s+(.+?)'
        r'(?=^(?:MR\.|MRS\.|MS\.|ACTING SPEAKER|THE CLERK|\(|\[)|\Z)',
        re.MULTILINE | re.DOTALL
    ),

    # Bill information
    'bill_number': re.compile(r'(?:Assembly|Senate) No\.\s+([AS]\d{5}(?:-[A-Z])?)', re.IGNORECASE),
    'calendar_number': re.compile(r'Calendar No\.\s+(\d+)', re.IGNORECASE),
    'rules_report': re.compile(r'Rules Report No\.\s+(\d+)', re.IGNORECASE),

    # Session metadata
    'session_date': re.compile(r'^[\d]*([A-Z]+,\s+[A-Z]+\s+\d{1,2},\s+\d{4})', re.MULTILINE),
    # Page header followed by the page number; any upper-case month name is
    # accepted, matching the header pattern used by clean_speech_text.
    'page_number': re.compile(r'NYS ASSEMBLY\s+[A-Z]+ \d{1,2}, \d{4}\s*\n\s*(\d+)', re.MULTILINE),

    # Interaction patterns
    # group 1 is the named member, or None when "the sponsor" is used
    'yield_question': re.compile(
        r'Will\s+(?:the\s+sponsor|(?:Mr\.|Mrs\.|Ms\.)\s+([A-Z\-\']+))\s+yield',
        re.IGNORECASE
    ),
    'direct_address': re.compile(
        r'((?:Mr\.|Mrs\.|Ms\.)\s+[A-Z\-\']+)',
        re.IGNORECASE
    ),
    'thank_response': re.compile(
        r'Thank you,?\s+((?:Mr\.|Mrs\.|Ms\.)\s+[A-Z\-\']+)',
        re.IGNORECASE
    ),

    # Questions (for counting)
    'question_mark': re.compile(r'\?'),

    # Procedural
    'motion': re.compile(r'\bmove\b', re.IGNORECASE),
    'call_committee': re.compile(r'call.*?committee', re.IGNORECASE),
}


def clean_speech_text(text: str) -> str:
    """
    Strip page-layout noise from a block of speech text.

    The clean-up steps run in a fixed order:
    1. "NYS ASSEMBLY <MONTH> <day>, <year>" header lines become a single newline
    2. lines holding only a 1-4 digit number are dropped
    3. 1-3 digits glued to the front of a lower-case word at the start of a
       line are dropped (e.g. "2to dispense" becomes "to dispense")
    4. runs of three or more newlines collapse to two
    5. runs of two or more spaces collapse to one
    6. leading and trailing whitespace is stripped

    Args:
        text: Raw speech text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; later steps rely on the output of earlier
    # ones, so the order must not change.
    substitutions = (
        (r'\n?NYS ASSEMBLY\s+[A-Z]+\s+\d{1,2},\s+\d{4}\s*\n?', '\n'),
        (r'\n\d{1,4}\n', '\n'),
        (r'\n\d{1,3}([a-z])', r'\n\1'),
        (r'\n{3,}', '\n\n'),
        (r' {2,}', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def extract_bill_context(text: str) -> dict:
    """
    Collect bill identifiers mentioned in ``text``.

    Searches for a bill number, a calendar number and a rules report number
    using the matching entries of ``PATTERNS``; only the first occurrence of
    each is used.

    Args:
        text: Transcript text to search.

    Returns:
        A dict containing whichever of the keys 'bill_number',
        'calendar_number' and 'rules_report_number' were found, each mapped
        to the captured string. Empty when nothing matches.
    """
    # (key in the returned dict, key in PATTERNS), in output order
    fields = (
        ('bill_number', 'bill_number'),
        ('calendar_number', 'calendar_number'),
        ('rules_report_number', 'rules_report'),
    )

    context = {}
    for context_key, pattern_key in fields:
        found = PATTERNS[pattern_key].search(text)
        if found:
            context[context_key] = found.group(1)
    return context
    
    
def extract_interactions(
    speaker_data: list[dict[str, str]],
    all_member_names: list[str] = None
) -> list[dict]:
    """
    Extract member-to-member interactions from speaker data.

    Each entry of ``speaker_data`` maps a speaker label to the text of that
    turn; only the first key of an entry is read and empty entries are
    skipped. Turns whose label contains "ACTING SPEAKER" or "CLERK" are never
    used as the source of an interaction.

    For every other turn, three checks run in this order:
    - question: a "Will X yield" phrase. A named member is resolved through
      ``_find_matching_member``; "the sponsor" is resolved through
      ``_find_sponsor_from_context``.
    - address: the first "Mr./Mrs./Ms. NAME" mention that names a known
      member other than the speaker.
    - response: a "Thank you, Mr./Mrs./Ms. NAME" phrase naming a known member
      other than the speaker.

    A given target is recorded at most once per turn, so a later check is
    dropped when an earlier one already recorded the same target.

    Args:
        speaker_data: Ordered list of ``{speaker_label: speech_text}`` dicts.
        all_member_names: Known member labels in "TITLE NAME" form. When
            None, the labels found in ``speaker_data`` are used.

    Returns:
        List of interaction dictionaries with format:
            {
                'from_member': 'MR. WALSH',
                'to_member': 'MS. ROZIC',
                'interaction_type': 'question',
                'text_snippet': 'Will the sponsor yield?',
                'position': 42  # Index in speaker_data
            },
    """
    # Derive the member list from the data when the caller supplied none;
    # dict.fromkeys removes duplicates while keeping first-seen order.
    if all_member_names is None:
        all_member_names = list(dict.fromkeys(
            speaker_key.upper().strip()
            for entry in speaker_data
            for speaker_key in entry
        ))

    # Set for O(1) membership tests
    member_name_set = {name.upper() for name in all_member_names}

    interactions = []

    def _record(from_member, to_member, interaction_type, snippet, position):
        # Append one interaction in the documented output format.
        interactions.append({
            'from_member': from_member,
            'to_member': to_member,
            'interaction_type': interaction_type,
            'text_snippet': snippet,
            'position': position
        })

    for idx, entry in enumerate(speaker_data):
        # An empty entry has no speaker to read; skip it rather than crash.
        if not entry:
            continue

        speaker = next(iter(entry))
        speech_text = entry[speaker]
        from_member = speaker.upper().strip()

        # Skip if not a member (e.g., ACTING SPEAKER, THE CLERK)
        if 'ACTING SPEAKER' in from_member or 'CLERK' in from_member:
            continue

        # Targets already recorded for this turn. Duplicates can only come
        # from the same turn (same position), so a per-turn set replaces a
        # scan over the whole interactions list.
        targets_this_turn = set()

        # --- yield / question ---
        yield_match = PATTERNS['yield_question'].search(speech_text)
        if yield_match:
            named_member = yield_match.group(1)
            if named_member:
                to_member = _find_matching_member(named_member.upper(), member_name_set)
            else:
                # "the sponsor": fall back to the surrounding turns
                to_member = _find_sponsor_from_context(speaker_data, idx)

            if to_member and to_member != from_member:
                _record(from_member, to_member, 'question', yield_match.group(0), idx)
                targets_this_turn.add(to_member)

        # --- direct address ---
        for match in PATTERNS['direct_address'].finditer(speech_text):
            # Normalize the full address (e.g., "Ms. Rozic" -> "MS. ROZIC")
            addressed_name = re.sub(r'\s+', ' ', match.group(1).upper().strip())

            # Only consider known members other than the speaker
            if addressed_name in member_name_set and addressed_name != from_member:
                if addressed_name not in targets_this_turn:
                    _record(from_member, addressed_name, 'address', match.group(0), idx)
                    targets_this_turn.add(addressed_name)
                break  # Only the first qualifying address per speech counts

        # --- thank-you response ---
        # NOTE(review): "Thank you, Ms. X" also matches the direct-address
        # pattern above, so it is usually recorded as 'address' and dropped
        # here as a duplicate. Confirm whether 'response' should take priority.
        thank_match = PATTERNS['thank_response'].search(speech_text)
        if thank_match:
            addressed_name = re.sub(r'\s+', ' ', thank_match.group(1).upper().strip())

            if (
                addressed_name in member_name_set
                and addressed_name != from_member
                and addressed_name not in targets_this_turn
            ):
                _record(from_member, addressed_name, 'response', thank_match.group(0), idx)
                targets_this_turn.add(addressed_name)

    return interactions


def _find_matching_member(last_name: str, member_name_set: set) -> str:
    
    last_name_upper = last_name.upper()
    for full_name in member_name_set:
        # Check if this full name ends with the last name
        if full_name.endswith(last_name_upper):
            return full_name
    return None


def _find_sponsor_from_context(speaker_data: list[dict[str, str]], current_idx: int) -> str:
    '''
    
    '''
    # Look back up to 5 entries
    for i in range(current_idx - 1, max(0, current_idx - 6), -1):
        entry = speaker_data[i]
        speaker = list(entry.keys())[0]
        normalized_key = speaker.upper().strip()
        
        # Skip acting speaker entries
        if 'ACTING SPEAKER' in normalized_key or 'CLERK' in normalized_key:
            continue
        
        # Return the normalized name
        return normalized_key
    
    return None

In [None]:

# extract all participation chunks
for match in PATTERNS['speaker'].finditer(full_text):
    title, name, content = match.groups()

    # Skip clerk entries
    if title.startswith("THE CLERK"):
        continue

    # Find line numbers
    match_start = match.start()
    match_end = match.end()

    # Count lines up to this match
    text_before = full_text[:match_start]
    line_start = text_before.count('\n') + 1

    text_in_match = full_text[match_start:match_end]
    line_end = line_start + text_in_match.count('\n')

    # Extract bill context from surrounding text
    context_text = full_text[max(0, match_start - 500):match_end]
    bill_context = extract_bill_context(context_text)

    # Update persistent context
    #current_bill_context.update(bill_context)

    # Determine participation type
    normalized_name = f"{title} {name}"