In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from pathlib import Path
from ds100_utils import fetch_and_cache
from datetime import datetime
from IPython.display import display

import yaml

# Notebook-wide plot defaults: 6x6-inch figures rendered at 100 dpi.
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'figure.dpi': 100,
})
# Apply seaborn's default styling to the figures drawn below.
sns.set()

# Data

In [None]:
# From Lecture 4: current members of congress
base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'
current_path = 'legislators-current.yaml'
f = fetch_and_cache(base_url + current_path, current_path)
# Parse with safe_load: a bare yaml.load(stream) call is rejected by PyYAML >= 6
# (the Loader argument is required) and on older releases it may construct
# arbitrary Python objects from a downloaded file. safe_load only builds plain
# data (dicts, lists, strings, numbers, dates).
# The `with` block closes the file once parsing is done.
# NOTE(review): assumes the downloaded file is UTF-8 encoded -- confirm.
with open(f, encoding='utf-8') as yaml_file:
    current_data = yaml.safe_load(yaml_file)

def birthdays(data):
    """Tabulate one row per legislator record.

    Parameters
    ----------
    data : iterable of dict
        Nested records; each must provide ``id.bioguide``, ``name.first``,
        ``name.last``, ``bio.gender`` and a ``terms`` sequence.
        ``bio.birthday`` is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``leg_id``, ``first``, ``last``, ``gender``, ``terms`` (the
        number of entries in the record's ``terms``) and ``birthday`` (parsed
        with ``pd.to_datetime``; missing when the record has no birthday).
    """
    header = ['leg_id', 'first', 'last', 'gender', 'terms', 'birthday']

    def to_row(legislator):
        # Field order here must match `header`.
        return (
            legislator['id']['bioguide'],
            legislator['name']['first'],
            legislator['name']['last'],
            legislator['bio']['gender'],
            len(legislator['terms']),
            pd.to_datetime(legislator['bio'].get('birthday'), yearfirst=True),
        )

    return pd.DataFrame([to_row(entry) for entry in data], columns=header)

# Tabulate the current legislators and preview the first three rows.
current_legs = birthdays(current_data)
current_legs.head(3)

In [None]:
# Past members of congress
past_path = 'legislators-historical.yaml'
f = fetch_and_cache(base_url + past_path, past_path)
# Parse with safe_load: a bare yaml.load(stream) call is rejected by PyYAML >= 6
# (the Loader argument is required) and on older releases it may construct
# arbitrary Python objects from a downloaded file.
# The `with` block closes the file once parsing is done.
# NOTE(review): assumes the downloaded file is UTF-8 encoded -- confirm.
with open(f, encoding='utf-8') as yaml_file:
    past_data = yaml.safe_load(yaml_file)

# Same tabulation as for the current members; preview the first three rows.
past_legs = birthdays(past_data)
past_legs.head(3)

In [None]:
# Stack past and current legislators into one table with a fresh 0..n-1 index.
legs = pd.concat([past_legs, current_legs], ignore_index=True)
# Every legislator id must appear exactly once across the two sources.
assert legs['leg_id'].nunique(dropna=False) == len(legs) # No repeats
# Bar chart: how many legislators served each number of terms.
legs['terms'].value_counts().plot.bar();

# Study Design

In [None]:
# 16 edges from 1700 to 2000, i.e. fifteen 20-year bins.
bins = np.linspace(1700, 2000, 16)
# Birth year of every legislator whose birthday is known.
birth_years = legs['birthday'].dropna().dt.year
# NOTE(review): distplot is deprecated in seaborn >= 0.11; kept because the
# notebook otherwise targets the older seaborn API.
sns.distplot(birth_years, bins=bins)
# Put a tick on every bin edge so bar heights can be read against exact years.
plt.xticks(bins, rotation=90)
plt.xlabel('Year');
plt.ylabel('Density');

## Question
According to the histogram, about what proportion of all members of congress were born between 1840 and 1880?

## Question
How do you compute the exact proportion using Pandas?

## Question
How do you think the missing birthdays bias the birthyear distribution?

```
(a) Low bins have too little density
(b) High bins have too little density
(c) All bins have too little density
(d) Impossible to tell
```

Could inspecting the data further help answer this question?

## Question

How would you better visualize the rate of missing values?

# Data Cleaning

In [None]:
# Show the first historical record in full to see the raw nested structure.
past_data[0]

## Question

What will be the granularity of a table that includes term information?

In [None]:
# One row per (legislator, term). `term` numbers each legislator's terms
# 1, 2, ... in the order they appear in that legislator's `terms` list.
columns = ['leg_id', 'term', 'type', 'start', 'end', 'state', 'party']
data = [
    [
        legislator['id']['bioguide'],
        term_number,
        term['type'],
        pd.to_datetime(term['start'], yearfirst=True),
        pd.to_datetime(term['end'], yearfirst=True),
        term['state'],
        term.get('party'),  # None when the term record has no party
    ]
    for legislator in past_data + current_data
    for term_number, term in enumerate(legislator['terms'], start=1)
]

terms = pd.DataFrame(data, columns=columns)
terms.head()

In [None]:
# (number of term rows, number of columns)
terms.shape

In [None]:
# Number of term rows for each (legislator, start date) pair. A count above 1
# means several term records share the same legislator and start date.
terms_per_start = terms.groupby(['leg_id', 'start']).size()
# Show the 15 largest counts.
terms_per_start.sort_values(ascending=False).head(15)

In [None]:
# Number of term rows recorded under each party label, most frequent first.
terms['party'].value_counts()

In [None]:
# Same counts, restricted to the eight most frequent party labels.
terms['party'].value_counts().head(8)

## Question

How many terms were under the American party and smaller parties?

## Question

What proportion of all members of congress ever changed parties?

# Visualization

In [None]:
# For every distinct term start date, count the distinct values in each column
# (so `leg_id` becomes the number of different legislators starting that day).
starts = terms.groupby('start').nunique()
starts.head(10)

## Question

What visualization will help determine which term starts follow from national elections?

```
(a) A histogram of start values in terms
(b) A line plot of start values in terms
(c) A histogram of leg_id values in starts
(d) A line plot of leg_id values in starts
```

How would you use this visualization to focus future analysis on the composition of congress after each national election?

In [None]:
# January 7 of every odd-numbered year from 1941 through 2019.
# The explicit format pins the month-first reading of the '01-07-YYYY' strings.
biannual = pd.to_datetime(
    [f'01-07-{year}' for year in range(1941, 2020, 2)],
    format='%m-%d-%Y',
)
biannual

## Question

What values/distributions should be visualized to determine whether the members of congress are older now than they used to be?

## Age at Start Date

Next up, we'll build a table of (term, election_date) pairs that contains the age of the person serving each term.

In [None]:
# a bit of a hack inspired by https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
#
# a better tool for this will be SQL (to be discussed later)

# One row per term, carrying the legislator's attributes (notably `birthday`).
# No cell shown before this one defines `legs_with_terms`, so build it here;
# `leg_id` is the only column that `legs` and `terms` have in common.
legs_with_terms = pd.merge(legs, terms, on='leg_id')

dates_of_interest = biannual.values
term_endings = legs_with_terms["end"].values
term_beginnings = legs_with_terms["start"].values

# Broadcast every date (rows) against every term (columns) and keep the
# (date, term) index pairs where start <= date <= end.
date_idx, term_idx = np.where((dates_of_interest[:, None] >= term_beginnings)
                              & (dates_of_interest[:, None] <= term_endings))

# Select the matching term rows by position so each column keeps its dtype,
# then prepend the date that matched each row as the `election` column.
df = legs_with_terms.iloc[term_idx].reset_index(drop=True)
df.insert(0, "election", biannual[date_idx])

# Age in years at the matched date, rounded to 2 decimals. Dividing by 365.0
# ignores leap days, so this is a close approximation; a missing birthday
# gives a missing age.
df["age"] = ((df["election"] - df["birthday"]).dt.days / 365.0).round(2)
# Spell out the chamber codes for readable legends.
df["type"] = df["type"].str.replace("sen", "Senate", regex=False)
df["type"] = df["type"].str.replace("rep", "House", regex=False)
df = df.rename(columns={"type": "chamber"})

df.head(10)

In [None]:
# (number of (date, term) rows, number of columns)
df.shape

In [None]:
# Line plot of the number of rows in `df` for each `election` date.
df.groupby('election').size().plot();

## Question

What is a good way to visualize age change over time?

In [None]:
# Mean age at each election date, one line per chamber.
# Select `age` before aggregating: averaging the whole frame also tries to
# average text columns such as `party`, which pandas >= 2.0 rejects with a
# TypeError (and which is wasted work on older versions).
for chamber in ['Senate', 'House']:
    df[df['chamber']==chamber].groupby('election')['age'].mean().plot(label=chamber);
plt.ylabel('Mean age (years)')
plt.legend();

In [None]:
# Mean age at each election date, one line per major party.
# Select `age` before aggregating: averaging the whole frame also tries to
# average text columns such as `chamber`, which pandas >= 2.0 rejects with a
# TypeError (and which is wasted work on older versions).
for party in ['Democrat', 'Republican']:
    df[df['party']==party].groupby('election')['age'].mean().plot(label=party);
plt.ylabel('Mean age (years)')
plt.legend();

In [None]:
# Histogram of `age` for the rows whose `election` date falls in 2019.
df[df['election'].dt.year == 2019].hist('age');

In [None]:
# Box plot of `age` grouped by `election` date, with the x tick labels rotated 90 degrees.
df.boxplot('age', 'election', rot=90);

## Question

What does the boxplot reveal that was not obvious from the line plot?

Why does this boxplot have diagonal lines of outliers?