# Setup

In [None]:
import pandas as pd

# Read data

In [None]:
def read_data():
    """Fetch the state/territory population table from Wikipedia.

    Parses the HTML tables on the Wikipedia page, takes the table at
    position 1, renames the columns of interest to short names, drops the
    aggregate rows, and keeps only the renamed columns.

    Returns:
        DataFrame with the columns ``state``, ``house_seats`` and
        ``pct_of_pop``, still holding the raw values scraped from the page.
    """
    source_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population'
    # Summary rows in the table that do not describe a single state/territory.
    aggregate_rows = ('The fifty states', 'Fifty states + D.C.', 'Total U.S. territory')
    # Wikipedia column header -> short column name used downstream.
    rename_map = {
        'State or territory': 'state',
        'Total seats in House of Representatives, 2013–2023': 'house_seats',
        'Percent of total U.S. pop., 2015[note 1]': 'pct_of_pop'
    }

    tables = pd.read_html(source_url, header=0)
    population = tables[1].rename(columns=rename_map)
    is_real_row = ~population.state.isin(aggregate_rows)
    return population.loc[is_real_row, list(rename_map.values())]

# Clean data

In [None]:
def clean_pct_of_pop(df):
    """Convert the ``pct_of_pop`` column from percent strings to floats.

    Args:
        df: DataFrame with a string column ``pct_of_pop`` (e.g. ``'12.5%'``).

    Returns:
        Series of floats with any leading/trailing ``%`` characters removed.
    """
    raw_pct = df['pct_of_pop']
    without_sign = raw_pct.str.strip('%')
    return without_sign.astype(float)

In [None]:
def clean_house_seats(df):
    """Extract the House seat count from the raw ``house_seats`` strings.

    Each raw value is matched against ``<digits>♠<digits>``; the digits after
    the ``♠`` are taken as the seat count. Values that are missing or do not
    match the pattern count as zero seats.

    Args:
        df: DataFrame with a string column ``house_seats``.

    Returns:
        Series of integers, one seat count per row.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    seats = df.house_seats.str.extract(r'^\d+♠(\d+)', expand=False)
    # Fill with the string '0' (not the int 0) so the column stays
    # homogeneous until the integer conversion.
    return seats.fillna('0').astype(int)

In [None]:
def clean(df):
    """Return a copy of ``df`` with its raw columns converted to numbers.

    ``pct_of_pop`` is replaced by the output of ``clean_pct_of_pop`` and
    ``house_seats`` by the output of ``clean_house_seats``.
    """
    # Column name -> function that computes the cleaned column from the frame.
    cleaners = {
        'pct_of_pop': clean_pct_of_pop,
        'house_seats': clean_house_seats,
    }
    return df.assign(**cleaners)

# Add columns

In [None]:
def assign_pct_of_house(df):
    """Compute each row's share of all House seats, as a percentage.

    Args:
        df: DataFrame with a numeric column ``house_seats``.

    Returns:
        Series holding ``house_seats`` divided by the column total, times
        100, rounded to two decimal places.
    """
    seats = df['house_seats']
    total_seats = seats.sum()
    share = seats / total_seats
    return (share * 100).round(2)

In [None]:
def assign(df):
    """Return a copy of ``df`` with the derived ``pct_of_house`` column added.

    The new column is computed by ``assign_pct_of_house``.
    """
    # Column name -> function that derives the new column from the frame.
    derived = {'pct_of_house': assign_pct_of_house}
    return df.assign(**derived)

# Remove columns

In [None]:
def remove(df):
    """Return a copy of ``df`` without the ``house_seats`` column."""
    return df.drop(columns=['house_seats'])

# Putting it together

In [None]:
# Full pipeline: download, clean, derive the House share, drop the raw seat
# counts, then index the result by state name in sorted order.
df = (
    read_data()
    .pipe(clean)
    .pipe(assign)
    .pipe(remove)
    .set_index('state')
    .sort_index()
)

# To CSV

In [None]:
# Write the final table to disk; the path is relative to the working directory.
df.to_csv(path_or_buf='../data/pop.csv')