In [50]:
from chunk_scripts import *
from scrape_scripts import *
from dotenv import load_dotenv
import os
import requests
load_dotenv()

True

In [31]:
from sqlalchemy import create_engine, Column, Integer, String, Text, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker

Base = declarative_base()

class Member(Base):
    __tablename__ = 'members'
    
    member_id = Column(Integer, primary_key=True)
    name = Column(String)
    district = Column(Integer)
    session_year = Column(Integer)

class Transcript(Base):
    __tablename__ = 'transcripts'
    
    date = Column(String, primary_key=True)
    text = Column(Text)

class TranscriptSegment(Base):
    __tablename__ = 'transcript_segments'
    
    segment_id = Column(Integer, primary_key=True, autoincrement=True)
    date = Column(String, ForeignKey('transcripts.date'))
    sequence_number = Column(Integer)
    member_id = Column(Integer, ForeignKey('members.member_id'))
    text = Column(Text)

class Activity(Base):
    __tablename__ = 'activity'
    
    activity_id = Column(Integer, primary_key=True, autoincrement=True)
    date = Column(String, ForeignKey('transcripts.date'))
    segment_id = Column(Integer, ForeignKey('transcript_segments.segment_id'))
    member_from = Column(Integer, ForeignKey('members.member_id'))
    member_to = Column(Integer, ForeignKey('members.member_id'))
    interaction = Column(String)
    sentiment = Column(String, default='neutral')
    text_snippet = Column(Text)  
    
    


In [56]:
server = os.getenv('DATABASE_URL')

engine = create_engine(server)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)  
session = Session() 

print('connected')


connected


# Download

In [7]:
n = 320
links = scrape_links(n)
trans = scrape_transcript_pdfs(links)

Processing 6-17-25 (1 parts)
  6-17-25...
    Extracted 659195 chars
  Total: 659195 chars

Processing 6-16-25 (1 parts)
  6-16-25...
    Extracted 560421 chars
  Total: 560421 chars

Processing 6-13-25 (1 parts)
  6-13-25...
    Extracted 128093 chars
  Total: 128093 chars

Processing 6-12-25 (1 parts)
  6-12-25-...
    Extracted 557032 chars
  Total: 557032 chars

Processing 6-11-25 (1 parts)
  6-11-25-...
    Extracted 484818 chars
  Total: 484818 chars

Processing 6-10-25 (2 parts)
  6-10-25-Part-1...
    Extracted 387018 chars
  6-10-25-Part-2...
    Extracted 387018 chars
  Total: 774072 chars

Processing 6-9-25 (2 parts)
  6-9-25-Part-1...
    Extracted 220880 chars
  6-9-25-Part-2...
    Extracted 220880 chars
  Total: 441796 chars

Processing 6-6-25 (1 parts)
  6-6-25-...
    Extracted 148669 chars
  Total: 148669 chars

Processing 6-5-25 (1 parts)
  6-5-25-...
    Extracted 233645 chars
  Total: 233645 chars

Processing 6-4-25 (1 parts)
  6-4-25-Part-3...
    Extracted 102116

In [8]:
file_path = "transcipts.json" # Specify the desired filename
with open(file_path, 'w') as json_file:
    json.dump(trans, json_file, indent=4) 

In [36]:
with open('transcipts.json', 'r') as f:
    transcripts_data = json.load(f)

for date, text in transcripts_data.items():
    transcript = Transcript(date=date, text=text)
    session.add(transcript)

session.commit()

# member list

In [13]:
import requests

all_members = []
seen_member_ids = set()

# Query each session year
for year in [2019, 2021, 2023, 2025]:
    url = f"https://legislation.nysenate.gov/api/3/members/{year}/assembly?limit=400&key={key}"
    response = requests.get(url).json()
    
    for member in response['result']['items']:
        member_id = member['memberId']
        
        # Only add if we haven't seen this member_id before
        if member_id not in seen_member_ids:
            all_members.append(member)
            seen_member_ids.add(member_id)

print(f"Total unique members: {len(all_members)}")

NameError: name 'key' is not defined

In [36]:
file_path = "members.json" # Specify the desired filename
with open(file_path, 'w') as json_file:
    json.dump(all_members, json_file, indent=4) 

In [37]:
with open('members.json', 'r') as f:
    members_data = json.load(f)

seen_ids = set()
for member_data in members_data:
    member_id = member_data['memberId']
    if member_id not in seen_ids:
        member = Member(
            member_id=member_id,
            name=member_data['shortName'],
            district=member_data['districtCode'],
            session_year=member_data['sessionYear']
        )
        session.add(member)
        seen_ids.add(member_id)

session.commit()
print(f"Loaded {len(seen_ids)} unique members from {len(members_data)} records")

Loaded 236 unique members from 236 records


# chunk

In [38]:
import re
from chunk_scripts import PATTERNS, clean_speech_text, extract_interactions

# Build name lookup dictionary from database
members = session.query(Member).all()
name_to_id = {}
for member in members:
    # Extract surname (last word of name)
    surname = member.name.split()[-1]
    name_to_id[surname] = member.member_id

# Process all transcripts
transcripts = session.query(Transcript).all()
print(f"Processing {len(transcripts)} transcripts...")

segments_created = 0
interactions_created = 0

for transcript in transcripts:
    date = transcript.date
    text = transcript.text
    seq_count = 0
    
    # Store segments for this transcript (needed for extract_interactions)
    transcript_segments = []
    
    # Find all speaker segments in this transcript
    for match in PATTERNS['speaker'].finditer(text):
        title, name, content = match.groups()
        normalized_name = f"{title} {name}"
        surname = normalized_name.split()[-1]
        
        # Look up member_id
        mem_id = name_to_id.get(surname, None)
        cleaned_text = clean_speech_text(content)
        
        # Create TranscriptSegment
        segment = TranscriptSegment(
            date=date,
            sequence_number=seq_count,
            member_id=mem_id,
            text=cleaned_text
        )
        session.add(segment)
        
        # Store for interaction extraction
        transcript_segments.append({
            "name": normalized_name,
            "member_id": mem_id,
            "text": cleaned_text,
            "date": date,
            "sequence": seq_count
        })
        
        seq_count += 1
        segments_created += 1
    
    # Commit segments so they get segment_ids
    session.commit()
    
    # Extract interactions for this transcript
    interactions = extract_interactions(transcript_segments)
    
    # Create Activity records
    for interaction in interactions:
        # Query for the segment_id we just created
        segment = session.query(TranscriptSegment).filter_by(
            date=interaction['date'],
            sequence_number=interaction['sequence']
        ).first()
        
        if segment is None:
            print(f"Warning: No segment found for date={interaction['date']}, sequence={interaction['sequence']}")
            continue
        
        activity = Activity(
            date=interaction['date'],
            segment_id=segment.segment_id,
            member_from=interaction['from_member_id'],
            member_to=interaction['to_member_id'],
            interaction=interaction['interaction_type']
        )
        session.add(activity)
        interactions_created += 1
    
    # Commit activities
    session.commit()
    print(f"Processed {date}: {seq_count} segments, {len(interactions)} interactions")

print(f"\nTotal segments created: {segments_created}")
print(f"Total interactions created: {interactions_created}")

Processing 270 transcripts...
Processed 6-17-25: 2383 segments, 128 interactions
Processed 6-16-25: 1682 segments, 97 interactions
Processed 6-13-25: 273 segments, 2 interactions
Processed 6-12-25: 1767 segments, 123 interactions
Processed 6-11-25: 1933 segments, 138 interactions
Processed 6-10-25: 2754 segments, 168 interactions
Processed 6-9-25: 1236 segments, 64 interactions
Processed 6-6-25: 405 segments, 14 interactions
Processed 6-5-25: 840 segments, 59 interactions
Processed 6-4-25: 274 segments, 8 interactions
Processed 6-4-25-Part--2: 274 segments, 8 interactions
Processed 6-4-25-Part--1: 274 segments, 8 interactions
Processed 5-29-25: 257 segments, 0 interactions
Processed 5-28-25: 558 segments, 30 interactions
Processed 5-27-25: 1400 segments, 86 interactions
Processed 5-22-25: 79 segments, 6 interactions
Processed 5-21-25: 189 segments, 1 interactions
Processed 5-20-25: 445 segments, 20 interactions
Processed 5-19-25: 101 segments, 0 interactions
Processed 5-15-25: 440 segm

# check / test

In [39]:
# Quick checks after running
print(f"Members: {session.query(Member).count()}")
print(f"Transcripts: {session.query(Transcript).count()}")
print(f"Segments: {session.query(TranscriptSegment).count()}")
print(f"Activities: {session.query(Activity).count()}")

# Spot check  data
print(session.query(Member).limit(5).all())
print(session.query(Activity).limit(5).all())

Members: 236
Transcripts: 270
Segments: 151870
Activities: 8484
[<__main__.Member object at 0x10feca650>, <__main__.Member object at 0x119e97d90>, <__main__.Member object at 0x119e97250>, <__main__.Member object at 0x119e95550>, <__main__.Member object at 0x119e95010>]
[<__main__.Activity object at 0x10cc15c50>, <__main__.Activity object at 0x10cc14dd0>, <__main__.Activity object at 0x10cc15ad0>, <__main__.Activity object at 0x10cc168d0>, <__main__.Activity object at 0x119d6f450>]


In [17]:
import secrets
def generate_api_key():
    
    return secrets.token_urlsafe(32)

keys = {
    "demo_key": generate_api_key(),
    "admin_key": generate_api_key(),
    "user_key": generate_api_key()
}

# Save to file
with open('api_keys.json', 'w') as f:
    json.dump(keys, f, indent=2)

print("API keys generated and saved to api_keys.json\n")

API keys generated and saved to api_keys.json

