In [1]:
import numpy as np
import pandas as pd
from sklearn import manifold, datasets, decomposition
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from ipywidgets import interact, fixed
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from matplotlib.patches import Ellipse
from scipy import linalg
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_percentage_error, mean_squared_error
import os
from sklearn.cluster import KMeans
import osmnx as ox
import geopandas as gpd
#import plotly.express as px
import urllib.request
import zipfile

import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
from sklearn.decomposition import PCA, FactorAnalysis
from factor_analyzer import FactorAnalyzer, calculate_kmo, calculate_bartlett_sphericity

  from pandas.core import (


In [2]:
osmdf = pd.read_csv('../../data/raw/msa-osm-stats.csv')
msa_emissions = pd.read_csv('../../data/tidy/msa-emissions-by-gas.csv')
msa_acs = pd.read_csv('../../data/tidy/msa-modeshare-employment-salary.csv')
msa_gdp = pd.read_csv('../../data/tidy/msa-gdp.csv')
msa_transit = pd.read_csv('../../data/tidy/msa-transit-vmt-fuel.csv')
land_cover = pd.read_csv('../../data/tidy/msa-land-cover-area.csv')
CDD = pd.read_csv('../../data/tidy/cooling-degree-days-average.csv')
HDD = pd.read_csv('../../data/tidy/heating-degree-days-average.csv')
climate = pd.read_csv('../../data/tidy/other-climate-data.csv')
climate

Unnamed: 0,MSA,Palmer Z Index,Average Temperature (F),Precipitation (inches)
0,"Abilene, TX",0.340000,65.000000,27.356667
1,"Akron, OH",0.165000,52.500000,40.550000
2,"Albany, GA",0.547500,67.725000,58.845000
3,"Albany-Lebanon, OR",-0.350000,50.200000,65.670000
4,"Albany-Schenectady-Troy, NY",-0.396000,48.860000,37.660000
...,...,...,...,...
364,"Yakima, WA",-0.860000,47.900000,22.810000
365,"York-Hanover, PA",-0.220000,55.000000,42.820000
366,"Youngstown-Warren-Boardman, OH-PA",0.783333,51.033333,45.596667
367,"Yuba City, CA",-1.580000,63.950000,12.295000


In [3]:
msa_emissions  = msa_emissions.drop(msa_emissions.index[0])

# Merge the DataFrames on the "msa" and "MSA" columns
merged_df = pd.merge(osmdf, msa_emissions , how='inner', left_on='msa', right_on='MSA')
merged_msa = merged_df.drop(columns=['MSA'])

In [4]:
merged_msa.edge_length_total = merged_msa.edge_length_total/1000 #from m to km
merged_msa.edge_length_avg = merged_msa.edge_length_avg/1000 #from m to km
merged_msa.street_length_total = merged_msa.street_length_total/1000 #m -> km
merged_msa.edge_density_km = merged_msa.edge_density_km/1000
merged_msa.street_density_km = merged_msa.street_density_km/1000

osm_stat_dict = {
'n': 'Node count',
'm': 'Edge count',
'k_avg': 'Degree average',
'edge_length_total': 'Edge length total (km)',
'edge_length_avg': 'Edge length average (km)',
'streets_per_node_avg': 'Streets per node average',
'intersection_count': 'Intersection count',
'street_length_total': 'Street length total (km)',
'street_segment_count': 'Street segment count',
'street_length_avg': 'Street length average (km)',
'circuity_avg': 'Circuity average',
'self_loop_proportion': 'Self-loop proportion',
'node_density_km': 'Node density (per sq km)',
'intersection_density_km': 'Intersection density (per sq km)',
'edge_density_km': 'Edge density (km/sq km)',
'street_density_km': 'Street density (km/sq km)',
'builing_area_sqkm': 'Building area (sq km)',
'area_sqkm': 'Area (sq km)',
'Carbon Dioxide':'Carbon Dioxide (Ton)',
'Methane':'Methane (Ton)',
'Nitrous Oxide':'Nitrous Oxide (Ton)'
}

merged_msa=merged_msa.rename(columns=osm_stat_dict)
merged_msa

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,Self-loop proportion,Node density (per sq km),Intersection density (per sq km),Edge density (km/sq km),Street density (km/sq km),Building area (sq km),Area (sq km),Carbon Dioxide (Ton),Methane (Ton),Nitrous Oxide (Ton)
0,"Abilene, TX",14357,39401,5.488751,17168.965935,0.435749,2.881452,11912,8925.436120,20606,...,0.004465,2.012523,1.669790,2.406697,1.251142,7.048505,7133.829924,532679.00283612,27.9820482395,5.0614236844
1,"Albany, GA",8466,22838,5.395228,9545.102272,0.417948,2.837468,6957,4989.592979,11954,...,0.002928,2.028965,1.667318,2.287583,1.195808,6.979580,4172.570196,263831.32031775423,20.83018452565225,4.532439606957801
2,"Akron, OH",26865,70900,5.278243,15191.985931,0.214273,2.756896,21200,8039.294972,36980,...,0.015089,11.231303,8.862967,6.351230,3.360944,14.307097,2391.975398,2007386.7393773547,167.52808604555003,25.02054581165
3,"Albany-Lebanon, OR",7132,17906,5.021312,8280.294430,0.462431,2.568144,5101,4225.438910,9147,...,0.005466,1.193324,0.853498,1.385456,0.706999,7.519705,5976.582645,955233.1995377,98.0240557692,21.2062527474
4,"Albany-Schenectady-Troy, NY",36977,96867,5.239311,27872.441864,0.287739,2.755794,30254,14555.424691,50900,...,0.033320,4.962653,4.060365,3.740738,1.953472,41.455466,7451.054339,878381.4813259091,57.6102337069432,11.738693121229677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,"Worcester, MA-CT",38037,95787,5.036517,24604.031211,0.256862,2.657938,29828,12894.683514,50471,...,0.017416,6.986970,5.479069,4.519485,2.368609,74.415961,5443.990594,2333313.226772765,169.85238166675,31.5916402819
364,"York-Hanover, PA",16622,44476,5.351462,13291.255851,0.298841,2.781194,13507,6831.204674,23064,...,0.006287,7.049714,5.728582,5.637080,2.897247,18.432839,2357.826232,2067360.2114782073,219.9754925947,47.809417099600005
365,"Yuba City, CA",8419,21665,5.146692,7925.868395,0.365837,2.678465,6360,4094.273240,11260,...,0.002220,2.597365,1.962138,2.445227,1.263133,1.594076,3241.362503,288696.43436585995,7.997357688249999,3.84967807065
366,"Youngstown-Warren-Boardman, OH-PA",24885,66650,5.356641,19738.540791,0.296152,2.794736,19982,10381.621775,34674,...,0.006085,5.511548,4.425628,4.371706,2.299329,15.897550,4515.065516,1022715.3651003597,100.22097520127433,15.492663858217666


In [5]:
df = pd.merge(merged_msa, msa_acs , how='inner', left_on='msa', right_on='MSA')


df['msa'] = df['msa'].str.strip()
msa_gdp['MSA'] = msa_gdp['MSA'].str.strip()
final_df = pd.merge(df, msa_gdp , how='inner', left_on='msa', right_on='MSA')
# final_df.drop(columns=['MSA'], inplace=True)
# final_df.to_csv('../../data/tidy/msa-emissions-road-network-socioeconomic-merged.csv', index=False)
final_df

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,"Transportation and warehousing, and utilities",Information,"Finance and insurance, and real estate and rental and leasing","Professional, scientific, and management, and administrative and waste management services","Educational services, and health care and social assistance","Arts, entertainment, and recreation, and accommodation and food services","Other services, except public administration",Public administration,MSA_y,GDP
0,"Abilene, TX",14357,39401,5.488751,17168.965935,0.435749,2.881452,11912,8925.436120,20606,...,4445,812,5328,5003,21134,6295,4044,4989,"Abilene, TX",9448299
1,"Albany, GA",8466,22838,5.395228,9545.102272,0.417948,2.837468,6957,4989.592979,11954,...,3482,801,2878,4978,15223,4942,3729,5113,"Albany, GA",7309594
2,"Akron, OH",26865,70900,5.278243,15191.985931,0.214273,2.756896,21200,8039.294972,36980,...,17816,5610,21305,34973,83021,32632,16613,10873,"Akron, OH",42289042
3,"Albany-Lebanon, OR",7132,17906,5.021312,8280.294430,0.462431,2.568144,5101,4225.438910,9147,...,2828,398,2248,4135,13943,4020,3034,3670,"Albany-Lebanon, OR",5671303
4,"Albany-Schenectady-Troy, NY",36977,96867,5.239311,27872.441864,0.287739,2.755794,30254,14555.424691,50900,...,18280,7811,30577,51511,121009,37259,20784,44192,"Albany-Schenectady-Troy, NY",78294954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,"Worcester, MA-CT",38037,95787,5.036517,24604.031211,0.256862,2.657938,29828,12894.683514,50471,...,21374,8750,29588,52199,135238,36367,21256,18038,"Worcester, MA-CT",57736191
351,"York-Hanover, PA",16622,44476,5.351462,13291.255851,0.298841,2.781194,13507,6831.204674,23064,...,15527,2906,13026,20653,49522,16491,11242,11345,"York-Hanover, PA",24082525
352,"Yuba City, CA",8419,21665,5.146692,7925.868395,0.365837,2.678465,6360,4094.273240,11260,...,4301,762,3060,5963,15148,6370,3223,5095,"Yuba City, CA",9258388
353,"Youngstown-Warren-Boardman, OH-PA",24885,66650,5.356641,19738.540791,0.296152,2.794736,19982,10381.621775,34674,...,12297,3461,10634,16956,63029,22487,11181,9146,"Youngstown-Warren-Boardman, OH-PA",23410595


In [6]:
final_df = pd.merge(final_df, msa_transit , how='outer', left_on='msa', right_on='MSA')
# merged_df_new = pd.merge(final_df, land_cover, left_on='msa', right_on='MSA', how='inner')
# # final_df = pd.merge(final_df, land_cover , how='outer', left_on='msa', right_on='MSA')
# merged_df_new
final_df

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,Other Fuel (gal/gal equivalent),Electric Propulsion (kwh),Electric Battery (kwh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles)
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,,,,,,,,,,
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,0.0,0.0,0.0,2619706.0,560570.0,0.0,3166425.0,0.0,0.0,0.0
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,0.0,0.0,0.0,148053.0,12499.0,0.0,439611.0,0.0,0.0,0.0
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,,,,,,,,,,
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,0.0,0.0,42126.0,8632669.0,1343264.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,"Yakima, WA",10920.0,28902.0,5.293407,14155.026874,0.489759,2.709982,8275.0,7249.500452,14773.0,...,0.0,0.0,0.0,730634.0,275527.0,0.0,0.0,0.0,0.0,0.0
366,"York-Hanover, PA",16622.0,44476.0,5.351462,13291.255851,0.298841,2.781194,13507.0,6831.204674,23064.0,...,0.0,0.0,0.0,515734.0,6299147.0,0.0,553503.0,0.0,0.0,0.0
367,"Youngstown-Warren-Boardman, OH-PA",24885.0,66650.0,5.356641,19738.540791,0.296152,2.794736,19982.0,10381.621775,34674.0,...,0.0,0.0,0.0,1467533.0,486469.0,0.0,0.0,0.0,0.0,0.0
368,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,0.0,0.0,0.0,926082.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
final_df = final_df.drop(columns=['MSA_x', 'MSA_y','MSA'])
final_df = final_df.fillna(0)
final_df = final_df[final_df.iloc[:, 0] != 0]
final_df = final_df.reset_index(drop=True)
final_df.to_csv('../../data/tidy/completed-dataset_1.csv', index=False)
final_df

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,Other Fuel (gal/gal equivalent),Electric Propulsion (kwh),Electric Battery (kwh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles)
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,0.0,0.0,0.0,2619706.0,560570.0,0.0,3166425.0,0.0,0.0,0.0
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,0.0,0.0,0.0,148053.0,12499.0,0.0,439611.0,0.0,0.0,0.0
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,0.0,0.0,42126.0,8632669.0,1343264.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,"Yakima, WA",10920.0,28902.0,5.293407,14155.026874,0.489759,2.709982,8275.0,7249.500452,14773.0,...,0.0,0.0,0.0,730634.0,275527.0,0.0,0.0,0.0,0.0,0.0
351,"York-Hanover, PA",16622.0,44476.0,5.351462,13291.255851,0.298841,2.781194,13507.0,6831.204674,23064.0,...,0.0,0.0,0.0,515734.0,6299147.0,0.0,553503.0,0.0,0.0,0.0
352,"Youngstown-Warren-Boardman, OH-PA",24885.0,66650.0,5.356641,19738.540791,0.296152,2.794736,19982.0,10381.621775,34674.0,...,0.0,0.0,0.0,1467533.0,486469.0,0.0,0.0,0.0,0.0,0.0
353,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,0.0,0.0,0.0,926082.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
DD = pd.merge(CDD, HDD , how='outer', left_on='MSA', right_on='MSA')


In [9]:
DD =DD.merge(climate, on="MSA")


In [10]:
DD = DD.rename(columns={'MSA': 'msa_CDD'})
final_df = pd.merge(final_df, DD , how='inner', left_on='msa', right_on='msa_CDD')

final_df

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,Compressed Natural Gas (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),msa_CDD,Cooling Degree Days,Heating Degree Days,Palmer Z Index,Average Temperature (F),Precipitation (inches)
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,0.0,0.0,0.0,0.0,"Abilene, TX",2383.00,2332.000000,0.340000,65.000000,27.356667
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,3166425.0,0.0,0.0,0.0,"Akron, OH",823.00,5338.000000,0.165000,52.500000,40.550000
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,439611.0,0.0,0.0,0.0,"Albany, GA",2546.75,1526.000000,0.547500,67.725000,58.845000
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,0.0,0.0,0.0,0.0,"Albany-Lebanon, OR",163.00,5550.000000,-0.350000,50.200000,65.670000
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,0.0,0.0,0.0,0.0,"Albany-Schenectady-Troy, NY",529.40,6367.200000,-0.396000,48.860000,37.660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,"Yakima, WA",10920.0,28902.0,5.293407,14155.026874,0.489759,2.709982,8275.0,7249.500452,14773.0,...,0.0,0.0,0.0,0.0,"Yakima, WA",254.00,6455.000000,-0.860000,47.900000,22.810000
348,"York-Hanover, PA",16622.0,44476.0,5.351462,13291.255851,0.298841,2.781194,13507.0,6831.204674,23064.0,...,553503.0,0.0,0.0,0.0,"York-Hanover, PA",1050.00,4662.000000,-0.220000,55.000000,42.820000
349,"Youngstown-Warren-Boardman, OH-PA",24885.0,66650.0,5.356641,19738.540791,0.296152,2.794736,19982.0,10381.621775,34674.0,...,0.0,0.0,0.0,0.0,"Youngstown-Warren-Boardman, OH-PA",643.00,5695.333333,0.783333,51.033333,45.596667
350,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,0.0,0.0,0.0,0.0,"Yuba City, CA",1969.50,2338.000000,-1.580000,63.950000,12.295000


In [11]:
final_df = final_df.dropna(subset=['msa'])

In [12]:
# Example index list to delete, replace it with your actual index list
indices_to_delete = [6,25,36,42,51,56,64,94,138,145,148,156,180,185,187,193,210,223,226,245,286,288,317,333,355,360,371]  # Example indices to delete

# Delete rows from land_cover dataframe based on the indices
land_cover_cleaned = land_cover.drop(indices_to_delete)

# Reindex the dataframe after deleting the rows
land_cover_cleaned = land_cover_cleaned.reset_index(drop=True)
land_cover_cleaned

Unnamed: 0,MSA,Barren Land (Rock/Sand/Clay),Cultivated Crops,Deciduous Forest,"Developed, High Intensity","Developed, Low Intensity","Developed, Medium Intensity","Developed, Open Space",Emergent Herbaceous Wetlands,Evergreen Forest,Grassland/Herbaceous,Mixed Forest,Open Water,Pasture/Hay,Shrubland,Woody Wetlands
0,"Abilene, TX",6728.0,2099812.0,153889.0,27003.0,108973.0,76541.0,258626.0,2087.0,473501.0,638685.0,23295.0,47490.0,28297.0,3980466.0,10880.0
1,"Akron, OH",5972.0,173140.0,811485.0,65804.0,368917.0,181242.0,354969.0,22025.0,7673.0,25169.0,66296.0,64747.0,384584.0,8219.0,119363.0
2,"Albany, GA",2013.0,2102623.0,68475.0,19771.0,143836.0,46683.0,208229.0,53826.0,1247126.0,159669.0,253748.0,58219.0,39741.0,79667.0,1151533.0
3,"Albany, OR",45972.0,363901.0,6858.0,22567.0,76593.0,55590.0,86436.0,42935.0,3811640.0,284662.0,76560.0,52290.0,1167763.0,491627.0,51173.0
4,"Albany-Schenectady-Troy, NY",27105.0,226497.0,1921802.0,90837.0,363813.0,227610.0,500605.0,68212.0,952523.0,75086.0,1835704.0,172390.0,1126778.0,57824.0,636018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,"Yakima, WA",104233.0,1657160.0,8093.0,26821.0,211223.0,104022.0,268798.0,88246.0,3773414.0,3621831.0,5845.0,53340.0,75506.0,2269078.0,120144.0
348,"York-Hanover, PA",5516.0,769578.0,643266.0,39546.0,173571.0,79688.0,285870.0,2865.0,9908.0,7046.0,176414.0,17956.0,370022.0,24673.0,14710.0
349,"Youngstown-Warren-Boardman, OH-PA",15195.0,720646.0,1809599.0,57322.0,364040.0,139347.0,439246.0,28271.0,14428.0,34521.0,120829.0,106522.0,852683.0,19317.0,298084.0
350,"Yuba City, CA",31366.0,1869042.0,51807.0,14763.0,69990.0,62000.0,127618.0,45932.0,558235.0,433681.0,25995.0,55545.0,3786.0,235328.0,19142.0


In [13]:
# Example index list to delete, replace it with your actual index list
indices_to_delete = [163,256,328]  # Example indices to delete

# Delete rows from land_cover dataframe based on the indices
final_df_cleaned = final_df.drop(indices_to_delete)

# Reindex the dataframe after deleting the rows
final_df_cleaned = final_df_cleaned.reset_index(drop=True)
final_df_cleaned

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,Compressed Natural Gas (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),msa_CDD,Cooling Degree Days,Heating Degree Days,Palmer Z Index,Average Temperature (F),Precipitation (inches)
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,0.0,0.0,0.0,0.0,"Abilene, TX",2383.00,2332.000000,0.340000,65.000000,27.356667
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,3166425.0,0.0,0.0,0.0,"Akron, OH",823.00,5338.000000,0.165000,52.500000,40.550000
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,439611.0,0.0,0.0,0.0,"Albany, GA",2546.75,1526.000000,0.547500,67.725000,58.845000
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,0.0,0.0,0.0,0.0,"Albany-Lebanon, OR",163.00,5550.000000,-0.350000,50.200000,65.670000
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,0.0,0.0,0.0,0.0,"Albany-Schenectady-Troy, NY",529.40,6367.200000,-0.396000,48.860000,37.660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,"Yakima, WA",10920.0,28902.0,5.293407,14155.026874,0.489759,2.709982,8275.0,7249.500452,14773.0,...,0.0,0.0,0.0,0.0,"Yakima, WA",254.00,6455.000000,-0.860000,47.900000,22.810000
345,"York-Hanover, PA",16622.0,44476.0,5.351462,13291.255851,0.298841,2.781194,13507.0,6831.204674,23064.0,...,553503.0,0.0,0.0,0.0,"York-Hanover, PA",1050.00,4662.000000,-0.220000,55.000000,42.820000
346,"Youngstown-Warren-Boardman, OH-PA",24885.0,66650.0,5.356641,19738.540791,0.296152,2.794736,19982.0,10381.621775,34674.0,...,0.0,0.0,0.0,0.0,"Youngstown-Warren-Boardman, OH-PA",643.00,5695.333333,0.783333,51.033333,45.596667
347,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,0.0,0.0,0.0,0.0,"Yuba City, CA",1969.50,2338.000000,-1.580000,63.950000,12.295000


In [14]:
# Concatenate final_df and land_cover_df along columns
merged_df = pd.concat([final_df_cleaned, land_cover_cleaned], axis=1)

# Display the merged dataframe
merged_df


Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,"Developed, Medium Intensity","Developed, Open Space",Emergent Herbaceous Wetlands,Evergreen Forest,Grassland/Herbaceous,Mixed Forest,Open Water,Pasture/Hay,Shrubland,Woody Wetlands
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,76541.0,258626.0,2087.0,473501.0,638685.0,23295.0,47490.0,28297.0,3980466.0,10880.0
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,181242.0,354969.0,22025.0,7673.0,25169.0,66296.0,64747.0,384584.0,8219.0,119363.0
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,46683.0,208229.0,53826.0,1247126.0,159669.0,253748.0,58219.0,39741.0,79667.0,1151533.0
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,55590.0,86436.0,42935.0,3811640.0,284662.0,76560.0,52290.0,1167763.0,491627.0,51173.0
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,227610.0,500605.0,68212.0,952523.0,75086.0,1835704.0,172390.0,1126778.0,57824.0,636018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,104022.0,268798.0,88246.0,3773414.0,3621831.0,5845.0,53340.0,75506.0,2269078.0,120144.0
348,"Yuma, AZ",9291.0,25210.0,5.426757,8925.887915,0.354061,2.840491,7505.0,4715.715695,13203.0,...,79688.0,285870.0,2865.0,9908.0,7046.0,176414.0,17956.0,370022.0,24673.0,14710.0
349,,,,,,,,,,,...,139347.0,439246.0,28271.0,14428.0,34521.0,120829.0,106522.0,852683.0,19317.0,298084.0
350,,,,,,,,,,,...,62000.0,127618.0,45932.0,558235.0,433681.0,25995.0,55545.0,3786.0,235328.0,19142.0


In [15]:
merged_df = merged_df.drop(columns=['MSA', merged_df.columns[-1]])

# Display the final dataframe
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,msa,Node count,Edge count,Degree average,Edge length total (km),Edge length average (km),Streets per node average,Intersection count,Street length total (km),Street segment count,...,"Developed, Low Intensity","Developed, Medium Intensity","Developed, Open Space",Emergent Herbaceous Wetlands,Evergreen Forest,Grassland/Herbaceous,Mixed Forest,Open Water,Pasture/Hay,Shrubland
0,"Abilene, TX",14357.0,39401.0,5.488751,17168.965935,0.435749,2.881452,11912.0,8925.436120,20606.0,...,108973.0,76541.0,258626.0,2087.0,473501.0,638685.0,23295.0,47490.0,28297.0,3980466.0
1,"Akron, OH",26865.0,70900.0,5.278243,15191.985931,0.214273,2.756896,21200.0,8039.294972,36980.0,...,368917.0,181242.0,354969.0,22025.0,7673.0,25169.0,66296.0,64747.0,384584.0,8219.0
2,"Albany, GA",8466.0,22838.0,5.395228,9545.102272,0.417948,2.837468,6957.0,4989.592979,11954.0,...,143836.0,46683.0,208229.0,53826.0,1247126.0,159669.0,253748.0,58219.0,39741.0,79667.0
3,"Albany-Lebanon, OR",7132.0,17906.0,5.021312,8280.294430,0.462431,2.568144,5101.0,4225.438910,9147.0,...,76593.0,55590.0,86436.0,42935.0,3811640.0,284662.0,76560.0,52290.0,1167763.0,491627.0
4,"Albany-Schenectady-Troy, NY",36977.0,96867.0,5.239311,27872.441864,0.287739,2.755794,30254.0,14555.424691,50900.0,...,363813.0,227610.0,500605.0,68212.0,952523.0,75086.0,1835704.0,172390.0,1126778.0,57824.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,"Yakima, WA",10920.0,28902.0,5.293407,14155.026874,0.489759,2.709982,8275.0,7249.500452,14773.0,...,141547.0,79800.0,153208.0,134320.0,718608.0,75251.0,28921.0,125193.0,4793.0,123171.0
345,"York-Hanover, PA",16622.0,44476.0,5.351462,13291.255851,0.298841,2.781194,13507.0,6831.204674,23064.0,...,324414.0,127264.0,643575.0,4211.0,347841.0,117114.0,475535.0,79155.0,1106545.0,106990.0
346,"Youngstown-Warren-Boardman, OH-PA",24885.0,66650.0,5.356641,19738.540791,0.296152,2.794736,19982.0,10381.621775,34674.0,...,337495.0,296595.0,347027.0,50227.0,437405.0,66256.0,1253599.0,198166.0,271529.0,52538.0
347,"Yuba City, CA",8419.0,21665.0,5.146692,7925.868395,0.365837,2.678465,6360.0,4094.273240,11260.0,...,211223.0,104022.0,268798.0,88246.0,3773414.0,3621831.0,5845.0,53340.0,75506.0,2269078.0


In [16]:

merged_df.to_csv('../../data/tidy/completed-dataset.csv', index=False)