# Inter-State Regions
This notebook lists all of the inter-state regions implied by the U.S. Census Bureau's data defining Metropolitan and Micropolitan areas (I only include Metropolitan areas). The dataset is called: [U.S. Census Bureau - Metropolitan and Micropolitan Statistical Areas](https://www.census.gov/programs-surveys/metro-micro.html). 

## Final List

This is the final result of my processing. It's in the form of a dictionary where the name of the metropolitan area is the key, and a list of the included states is the value. 

In [1]:
# Inter-state regions: each key is an area title whose trailing
# "XX-YY[-ZZ...]" suffix names the states it spans; each value lists those
# same postal abbreviations in alphabetical order.
regions_USCB = {
    'Allentown-Bethlehem-East Stroudsburg, PA-NJ': ['NJ', 'PA'],
    'Atlanta--Athens-Clarke County--Sandy Springs, GA-AL': ['AL', 'GA'],
    'Boise City-Mountain Home-Ontario, ID-OR': ['ID', 'OR'],
    'Boston-Worcester-Providence, MA-RI-NH': ['MA', 'NH', 'RI'],
    'Brookings-Crescent City, OR-CA': ['CA', 'OR'],
    'Burlington-Fort Madison, IA-IL': ['IA', 'IL'],
    'Cape Girardeau-Sikeston, MO-IL': ['IL', 'MO'],
    'Charleston-Huntington-Ashland, WV-OH-KY': ['KY', 'OH', 'WV'],
    'Charlotte-Concord, NC-SC': ['NC', 'SC'],
    'Chattanooga-Cleveland-Dalton, TN-GA-AL': ['AL', 'GA', 'TN'],
    'Chicago-Naperville, IL-IN-WI': ['IL', 'IN', 'WI'],
    'Cincinnati-Wilmington, OH-KY-IN': ['IN', 'KY', 'OH'],
    'Columbus-Auburn-Opelika, GA-AL': ['AL', 'GA'],
    'Dallas-Fort Worth, TX-OK': ['OK', 'TX'],
    'Davenport-Moline, IA-IL': ['IA', 'IL'],
    'Duluth-Grand Rapids, MN-WI': ['MN', 'WI'],
    'El Paso-Las Cruces, TX-NM': ['NM', 'TX'],
    'Evansville-Henderson, IN-KY': ['IN', 'KY'],
    'Fargo-Wahpeton, ND-MN': ['MN', 'ND'],
    'Huntsville-Decatur-Albertville, AL-TN': ['AL', 'TN'],
    'Jacksonville-Kingsland-Palatka, FL-GA': ['FL', 'GA'],
    'Johnson City-Kingsport-Bristol, TN-VA': ['TN', 'VA'],
    'Joplin-Miami, MO-OK-KS': ['KS', 'MO', 'OK'],
    'Kansas City-Overland Park-Kansas City, MO-KS': ['KS', 'MO'],
    'Keene-Brattleboro, NH-VT': ['NH', 'VT'],
    'La Crosse-Onalaska-Sparta, WI-MN': ['MN', 'WI'],
    'Louisville/Jefferson County--Elizabethtown, KY-IN': ['IN', 'KY'],
    'Marinette-Iron Mountain, WI-MI': ['MI', 'WI'],
    'Memphis-Clarksdale-Forrest City, TN-MS-AR': ['AR', 'MS', 'TN'],
    'Minneapolis-St. Paul, MN-WI': ['MN', 'WI'],
    'New Orleans-Metairie-Slidell, LA-MS': ['LA', 'MS'],
    'New York-Newark, NY-NJ-CT-PA': ['CT', 'NJ', 'NY', 'PA'],
    'Omaha-Fremont, NE-IA': ['IA', 'NE'],
    'Paducah-Mayfield, KY-IL': ['IL', 'KY'],
    'Parkersburg-Marietta-Vienna, WV-OH': ['OH', 'WV'],
    'Philadelphia-Reading-Camden, PA-NJ-DE-MD': ['DE', 'MD', 'NJ', 'PA'],
    'Pittsburgh-Weirton-Steubenville, PA-OH-WV': ['OH', 'PA', 'WV'],
    'Portland-Vancouver-Salem, OR-WA': ['OR', 'WA'],
    'Pullman-Moscow, WA-ID': ['ID', 'WA'],
    'Quincy-Hannibal, IL-MO': ['IL', 'MO'],
    'Reno-Carson City-Gardnerville Ranchos, NV-CA': ['CA', 'NV'],
    'Salt Lake City-Provo-Orem, UT-ID': ['ID', 'UT'],
    'Sioux City-Le Mars, IA-NE-SD': ['IA', 'NE', 'SD'],
    'South Bend-Elkhart-Mishawaka, IN-MI': ['IN', 'MI'],
    "Spokane-Spokane Valley-Coeur d'Alene, WA-ID": ['ID', 'WA'],
    'St. Louis-St. Charles-Farmington, MO-IL': ['IL', 'MO'],
    'Tallahassee-Bainbridge, FL-GA': ['FL', 'GA'],
    'Virginia Beach-Chesapeake, VA-NC': ['NC', 'VA'],
    'Washington-Baltimore-Arlington, DC-MD-VA-WV-PA': ['DC', 'MD', 'PA', 'VA', 'WV']
}

## The Work

This is how I produced the above list.

In [2]:
import pandas as pd
import numpy as np

# Only these three columns are used, so let the parser skip every other
# column (usecols) instead of loading the whole file and discarding most of it.
# usecols does not honour the order it is given, so the trailing selection
# restores the intended column order.
_politan_columns = ['CBSA Title', 'Metropolitan/Micropolitan Statistical Area', 'CSA Title']
politans_df = pd.read_csv('../personal_data/metro_micro.csv', usecols=_politan_columns)[_politan_columns]

In [3]:
# The states are encoded after the last ", " of the title (e.g. "..., PA-NJ");
# keep that suffix and break it on "-" into a list stored in a new 'State' column
title_suffix = politans_df['CSA Title'].str.split(', ').str[-1]
politans_df['State'] = title_suffix.str.split('-')

# One group per CSA Title, exposing only the per-row state lists
grouped = politans_df['State'].groupby(politans_df['CSA Title'])

# Helper that merges the state lists belonging to one group
def aggregate_states(states_series):
    """Return the distinct states found across ``states_series``, sorted.

    ``states_series`` is an iterable whose items are themselves iterables of
    state abbreviations; duplicates across items are collapsed.
    """
    unique_states = {
        state for state_list in states_series for state in state_list
    }
    return sorted(unique_states)

# Collapse every group to its sorted, de-duplicated list of states
regions_series = grouped.apply(aggregate_states)

# Drop the regions whose list contains fewer than two states
regions_series_multi_state = regions_series.loc[lambda s: s.map(len) > 1]

# Plain {title: [states]} mapping
regions_dict = regions_series_multi_state.to_dict()

# Distinct state combinations, each rendered as a string such as "[AL, GA]"
regions = sorted(
    {str(states).replace("'", "") for states in regions_dict.values()}
)

In [4]:
# Display the title -> states mapping as the cell output
regions_dict

{'Allentown-Bethlehem-East Stroudsburg, PA-NJ': ['NJ', 'PA'],
 'Atlanta--Athens-Clarke County--Sandy Springs, GA-AL': ['AL', 'GA'],
 'Boise City-Mountain Home-Ontario, ID-OR': ['ID', 'OR'],
 'Boston-Worcester-Providence, MA-RI-NH': ['MA', 'NH', 'RI'],
 'Brookings-Crescent City, OR-CA': ['CA', 'OR'],
 'Burlington-Fort Madison, IA-IL': ['IA', 'IL'],
 'Cape Girardeau-Sikeston, MO-IL': ['IL', 'MO'],
 'Charleston-Huntington-Ashland, WV-OH-KY': ['KY', 'OH', 'WV'],
 'Charlotte-Concord, NC-SC': ['NC', 'SC'],
 'Chattanooga-Cleveland-Dalton, TN-GA-AL': ['AL', 'GA', 'TN'],
 'Chicago-Naperville, IL-IN-WI': ['IL', 'IN', 'WI'],
 'Cincinnati-Wilmington, OH-KY-IN': ['IN', 'KY', 'OH'],
 'Columbus-Auburn-Opelika, GA-AL': ['AL', 'GA'],
 'Dallas-Fort Worth, TX-OK': ['OK', 'TX'],
 'Davenport-Moline, IA-IL': ['IA', 'IL'],
 'Duluth-Grand Rapids, MN-WI': ['MN', 'WI'],
 'El Paso-Las Cruces, TX-NM': ['NM', 'TX'],
 'Evansville-Henderson, IN-KY': ['IN', 'KY'],
 'Fargo-Wahpeton, ND-MN': ['MN', 'ND'],
 'Huntsville-

In [5]:
# Display the sorted list of distinct state combinations as the cell output
regions

['[AL, GA, TN]',
 '[AL, GA]',
 '[AL, TN]',
 '[AR, MS, TN]',
 '[CA, NV]',
 '[CA, OR]',
 '[CT, NJ, NY, PA]',
 '[DC, MD, PA, VA, WV]',
 '[DE, MD, NJ, PA]',
 '[FL, GA]',
 '[IA, IL]',
 '[IA, NE, SD]',
 '[IA, NE]',
 '[ID, OR]',
 '[ID, UT]',
 '[ID, WA]',
 '[IL, IN, WI]',
 '[IL, KY]',
 '[IL, MO]',
 '[IN, KY, OH]',
 '[IN, KY]',
 '[IN, MI]',
 '[KS, MO, OK]',
 '[KS, MO]',
 '[KY, OH, WV]',
 '[LA, MS]',
 '[MA, NH, RI]',
 '[MI, WI]',
 '[MN, ND]',
 '[MN, WI]',
 '[NC, SC]',
 '[NC, VA]',
 '[NH, VT]',
 '[NJ, PA]',
 '[NM, TX]',
 '[OH, PA, WV]',
 '[OH, WV]',
 '[OK, TX]',
 '[OR, WA]',
 '[TN, VA]']

## Create `interstate_regions.csv`

In [9]:
# Persist the multi-state regions to disk. Each value is a Python list, so it
# is stored as text and has to be parsed back (e.g. with ast.literal_eval)
# when the file is read again.
regions_series_multi_state.to_csv('interstate_regions.csv')

## Read `interstate_regions.csv`

In [25]:
import ast
# Load the saved regions; the first CSV column (the title) becomes the index
df = pd.read_csv("interstate_regions.csv", index_col=0)

# Pair every title with its row of cell values
d = dict(zip(df.index, df.values.tolist()))

# The first cell of each row holds the state list as text; literal_eval turns
# it back into a real list without executing arbitrary code
regions_dict = {title: ast.literal_eval(cells[0]) for title, cells in d.items()}

regions_dict


{'Allentown-Bethlehem-East Stroudsburg, PA-NJ': ['NJ', 'PA'],
 'Atlanta--Athens-Clarke County--Sandy Springs, GA-AL': ['AL', 'GA'],
 'Boise City-Mountain Home-Ontario, ID-OR': ['ID', 'OR'],
 'Boston-Worcester-Providence, MA-RI-NH': ['MA', 'NH', 'RI'],
 'Brookings-Crescent City, OR-CA': ['CA', 'OR'],
 'Burlington-Fort Madison, IA-IL': ['IA', 'IL'],
 'Cape Girardeau-Sikeston, MO-IL': ['IL', 'MO'],
 'Charleston-Huntington-Ashland, WV-OH-KY': ['KY', 'OH', 'WV'],
 'Charlotte-Concord, NC-SC': ['NC', 'SC'],
 'Chattanooga-Cleveland-Dalton, TN-GA-AL': ['AL', 'GA', 'TN'],
 'Chicago-Naperville, IL-IN-WI': ['IL', 'IN', 'WI'],
 'Cincinnati-Wilmington, OH-KY-IN': ['IN', 'KY', 'OH'],
 'Columbus-Auburn-Opelika, GA-AL': ['AL', 'GA'],
 'Dallas-Fort Worth, TX-OK': ['OK', 'TX'],
 'Davenport-Moline, IA-IL': ['IA', 'IL'],
 'Duluth-Grand Rapids, MN-WI': ['MN', 'WI'],
 'El Paso-Las Cruces, TX-NM': ['NM', 'TX'],
 'Evansville-Henderson, IN-KY': ['IN', 'KY'],
 'Fargo-Wahpeton, ND-MN': ['MN', 'ND'],
 'Huntsville-