### Import necessary packages

In [None]:
import json
from pathlib import Path
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline


### Setup variables and file paths

In [None]:
# Directory (relative to the notebook's working directory) that the
# processing loop scans for *.json profile files.
json_dir = Path("data") / "extracted" / "BioguideProfiles"

# Collected results: the processing loop appends one dict per profile here.
tenure_data = list()

### Process each JSON file

In [None]:
# Build one tenure record per profile file found in json_dir.
#
# Each record appended to tenure_data has the keys:
#   id            - the profile's 'usCongressBioId' ('Unknown' if absent)
#   tenure_years  - span from earliest start date to latest end date, in years
#   start_date    - earliest position start date (datetime)
#   end_date      - latest position end date, or today's date if the most
#                   recent position has no end date (datetime)
#   congresses    - congress numbers of all dated positions, in start order
#   chamber       - job name of the earliest position ('Unknown' if absent)

# Start from an empty list so that re-running this cell does not append
# duplicate records on top of the results of a previous run.
tenure_data = []

# Single reference date for every open-ended position, so all members who
# have no end date are measured against the same day.
today = datetime.now().strftime("%Y-%m-%d")

# sorted() makes the processing (and therefore record) order reproducible;
# the order in which glob yields paths is not guaranteed.
for json_file in sorted(json_dir.glob("*.json")):
    try:
        # Only hold the file open while parsing it.
        with json_file.open('r', encoding="utf-8") as file:
            data = json.load(file)

        # Check if 'data' key exists; if not, use the root object
        member = data.get('data', data)
        usCongressBioId = member.get('usCongressBioId', 'Unknown')
        job_positions = member.get('jobPositions', [])

        if not job_positions:
            print(f"No job positions found in file: {json_file}")
            continue

        # Extract (start, end, congress number, chamber) for every position
        # that has a start date.
        dates = []
        for position in job_positions:
            try:
                congress_aff = position.get('congressAffiliation', {})
                congress = congress_aff.get('congress', {})

                start_date = congress.get('startDate')
                end_date = congress.get('endDate')
                congress_num = congress.get('congressNumber')
                # A missing or null 'job' only costs the chamber label; it
                # must not discard the position's dates.
                chamber = (position.get('job') or {}).get('name')

                if start_date:
                    dates.append((start_date, end_date, congress_num, chamber))
            except AttributeError as e:
                print(f"Structure error in position data in file {json_file}: {e}")
                continue

        if not dates:
            print(f"No valid dates found in file: {json_file}")
            continue

        # Dates are "%Y-%m-%d" strings, so sorting/comparing them as text
        # orders them chronologically.
        dates.sort(key=lambda x: x[0])
        start = datetime.strptime(dates[0][0], "%Y-%m-%d")

        if dates[-1][1]:
            # Use the latest end date over all positions rather than the end
            # of the last-started one: positions may overlap or share a
            # start date, in which case the last-started position is not
            # necessarily the last to end.
            last_end = max(e for _, e, _, _ in dates if e)
        else:
            # The most recent position has no end date: treat it as ongoing.
            last_end = today
        end = datetime.strptime(last_end, "%Y-%m-%d")

        # NOTE: this is the first-start-to-last-end span, so any gaps between
        # positions are included. 365.25 approximates the length of a year;
        # could replace with an exact years calc func.
        tenure_years = (end - start).days / 365.25

        tenure_data.append({
            'id': usCongressBioId,
            'tenure_years': tenure_years,
            'start_date': start,
            'end_date': end,
            'congresses': [d[2] for d in dates if d[2] is not None],
            'chamber': dates[0][3] if dates[0][3] is not None else 'Unknown'
        })

    # A bad file is reported and skipped so that one broken profile does not
    # abort the whole run. JSONDecodeError is listed before ValueError
    # because it is a subclass of it.
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {json_file}")
    except KeyError as e:
        print(f"KeyError: {e} in file: {json_file}")
    except ValueError as e:
        print(f"ValueError: {e} in file: {json_file}")
    except FileNotFoundError:
        print(f"File not found: {json_file}")
    except Exception as e:
        print(f"An unexpected error occurred in file {json_file}: {e}")

### Load into pandas


In [None]:
# Turn the list of per-member record dicts into a table: one row per record,
# one column per dict key.
df = pd.DataFrame(data=tenure_data)

# Render the table with the notebook's rich display.
display(df)

### Calculate average tenure per Congress

In [None]:
# Not used in the cells visible here; kept in case a later cell refers to it.
congress_tenure = {}

# Each member must count once per Congress. A member with more than one job
# position in the same Congress has that Congress number repeated in their
# 'congresses' list, which would weight them several times in that Congress's
# mean. dict.fromkeys drops the repeats while keeping the original order.
# assign() returns a new frame, so df itself is left untouched.
df_copy = df.assign(
    congresses=df['congresses'].map(lambda nums: list(dict.fromkeys(nums)))
)

# One row per (member, Congress). A member whose list is empty gets a single
# row with a missing Congress value, which groupby leaves out of the result.
exploded_df = df_copy.explode('congresses')

# NOTE(review): tenure_years is each member's whole first-start-to-last-end
# span, so every Congress they sat in is credited with their full career
# length, not their seniority at the time of that Congress.
avg_tenure_by_congress = exploded_df.groupby('congresses')['tenure_years'].mean()

### Plot the average tenure

In [None]:
# Order the per-Congress averages by Congress number, then draw them as a
# line chart with a marker on every data point.
tenure_by_congress_sorted = avg_tenure_by_congress.sort_index()
tenure_by_congress_sorted.plot(
    title="Average Tenure of Congress Members Over Time",
    xlabel="Congress Number",
    ylabel="Average Tenure (Years)",
    figsize=(12, 6),
    marker='o',
    grid=True,
)