In [25]:
import pandas as pd
import pickle

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Metropolitan Statistical Area (MSA) Data (population, GDP, lat/lng)

In [26]:
# Compiled population, lat/lng and GDP data for each MSA.
# pd.read_pickle opens and closes the file itself, so no file handle is left
# bound in the notebook namespace.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
msa_pickle = '../data/pickled/msa_df.pickle'
msa_df = pd.read_pickle(msa_pickle)

In [27]:
# Preview the first five rows of the MSA table.
msa_df.head(n=5)

Unnamed: 0,MetroArea,Population,MainCity,lat,lng,GDP_thousands_dollars
0,"Abilene, TX",181591,"Abilene, TX",32.448736,-99.733144,9468978
1,"Akron, OH",698398,"Akron, OH",41.081199,-81.518838,44562456
2,"Albany, GA",145508,"Albany, GA",31.578507,-84.155741,7312400
3,"Albany, OR",131496,"Albany, OR",44.636511,-123.105928,6107649
4,"Albany-Schenectady-Troy, NY",904682,"Albany, NY",42.652579,-73.756232,80302855


In [28]:
# MetroArea names come from the 2023 population data, whereas the GDP data is from 2022.
# A few MSAs are new since the GDP data was published, so they have no GDP value;
# list those rows here.
missing_gdp_mask = msa_df['GDP_thousands_dollars'].isna()
msa_df[missing_gdp_mask]

Unnamed: 0,MetroArea,Population,MainCity,lat,lng,GDP_thousands_dollars
11,"Amherst Town-Northampton, MA",162502,"Amherst Town, MA",42.373222,-72.519854,
45,"Bozeman, MT",126409,"Bozeman, MT",45.679312,-111.037259,
100,"Eagle Pass, TX",57762,"Eagle Pass, TX",28.709143,-100.499521,
150,"Helena, MT",96091,"Helena, MT",46.589145,-112.039106,
180,"Kenosha, WI",167488,"Kenosha, WI",42.584742,-87.821185,
231,"Minot, ND",75742,"Minot, ND",48.232967,-101.292291,
263,"Paducah, KY-IL",102267,"Paducah, KY",37.083389,-88.600048,
271,"Pinehurst-Southern Pines, NC",106898,"Pinehurst, NC",35.195434,-79.469477,
310,"Sandusky, OH",113838,"Sandusky, OH",41.456175,-82.711682,
329,"Slidell-Mandeville-Covington, LA",275583,"Slidell, LA",30.275195,-89.781174,


# Distances between MSAs

In [3]:
# Driving distance between MSAs, based on averages from Google Maps.
# pd.read_pickle opens and closes the file itself, so no file handle is left
# bound in the notebook namespace.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
dist_pickle = '../data/pickled/distance_df.pickle'
dist_df = pd.read_pickle(dist_pickle)

In [7]:
# Both directions of every city pair are kept for now
# (for example, Chicago -> New York and New York -> Chicago),
# because the driving distance and duration differ by direction;
# the two measures may later be averaged.
dist_df.head(n=5)

Unnamed: 0,Origin,Destination,Distance_meters,Duration_seconds,Distance_miles
0,"Abilene, TX","Akron, OH",2137508.0,69643.0,1328.185483
1,"Abilene, TX","Albany, GA",1582948.0,54532.0,983.597982
2,"Abilene, TX","Albany, OR",3077199.0,104836.0,1912.08222
3,"Abilene, TX","Albany, NY",2917051.0,94703.0,1812.570897
4,"Abilene, TX","Albuquerque, NM",783883.0,26964.0,487.082164


# Flights between MSAs

In [22]:
# Flight data between MSAs, covering Jan-Apr 2024.
# Each row represents the sum for a specific flight route for the month.
# NOTE(review): the preview below also carries carrier and aircraft-type columns,
# so rows may be per carrier/aircraft/route/month rather than per route - confirm.
# pd.read_pickle opens and closes the file itself, so no file handle is left
# bound in the notebook namespace.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
flight_pickle = '../data/pickled/flights_df.pickle'
flights_df = pd.read_pickle(flight_pickle)

In [23]:
# Preview the first five rows of the flight table.
flights_df.head(n=5)

Unnamed: 0,DEPARTURES_SCHEDULED,DEPARTURES_PERFORMED,SEATS,PASSENGERS,DISTANCE,RAMP_TO_RAMP,AIR_TIME,UNIQUE_CARRIER,AIRLINE_ID,UNIQUE_CARRIER_NAME,REGION,CARRIER_GROUP_NEW,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_CITY_MARKET_ID,AIRCRAFT_GROUP,AIRCRAFT_TYPE,MONTH,YEAR,DISTANCE_GROUP,CLASS
215,0.0,1.0,5.0,4.0,233.0,60.0,42.0,3EQ,21974,"Scott Aviation, LLC d/b/a Silver Air",D,2,LAX,"Los Angeles, CA",32575,HSH,"Las Vegas, NV",32211,6,685,4,2024,1,L
539,0.0,1.0,6.0,1.0,247.0,60.0,42.0,3EQ,21974,"Scott Aviation, LLC d/b/a Silver Air",D,2,HOU,"Houston, TX",31453,ADS,"Dallas, TX",30194,0,94,4,2024,1,L
540,0.0,1.0,6.0,1.0,247.0,60.0,48.0,3EQ,21974,"Scott Aviation, LLC d/b/a Silver Air",D,2,ADS,"Dallas, TX",30194,HOU,"Houston, TX",31453,0,94,4,2024,1,L
541,0.0,1.0,6.0,1.0,268.0,54.0,48.0,3EQ,21974,"Scott Aviation, LLC d/b/a Silver Air",D,2,RNO,"Reno, NV",34570,PRB,"Paso Robles, CA",34236,6,639,4,2024,1,L
548,0.0,1.0,6.0,1.0,510.0,96.0,84.0,3EQ,21974,"Scott Aviation, LLC d/b/a Silver Air",D,2,SAF,"Santa Fe, NM",34674,LAS,"Las Vegas, NV",32211,0,94,4,2024,2,L
