# Name: Margaret Nguyen

# Data Manipulation: Merging BNA data and Pennsylvania municipality data


Assignment: Merge the 51 Pennsylvania municipalities for which you obtained the PeopleForBikes BNA score with the data from the CSV file. They should all be present in the CSV, along with roughly 2,530 other municipalities. You will likely have to merge them using the name of the municipality, but be careful, as there are municipalities in Pennsylvania that have very similar names, e.g., Lancaster Township and Lancaster City.

In [105]:
# Import packages
import pandas as pd
import re

In [106]:
# Load the two input tables from the local checkout of the study repository.
# low_memory=False makes pandas parse each file in a single pass instead of
# in chunks, which avoids mixed-dtype inference within a column.
crash_data = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/2017_TO_2021_MUNI_CRASH_DATA.csv",
    low_memory=False,
)
bna = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/BNA_score.csv",
    low_memory=False,
)

In [107]:
# Clean datasets: discard the "Unnamed: 0" column from both raw tables
df_bna = bna.drop("Unnamed: 0", axis=1)
df_crash = crash_data.drop("Unnamed: 0", axis=1)

# Build the Pennsylvania-only BNA table in one pass:
#   1. keep rows whose State is "PA"
#   2. remove the Country and State columns
#   3. renumber the remaining rows from 0
df_pa = (
    df_bna.loc[df_bna["State"] == "PA"]
    .drop(columns=["Country", "State"])
    .reset_index(drop=True)
)

# View Pennsylvania municipalities data
df_pa.head(10)

Unnamed: 0,City,BNA Score
0,Allentown,29
1,Altoona,36
2,Ardmore,41
3,Athens,23
4,Beaver,36
5,Bellwood,31
6,Bethlehem,27
7,Birdsboro,59
8,Camp Hill,28
9,Carlisle,38


In [108]:
# Put both name columns in lower case so the merge key compares case-insensitively
df_pa["City"] = df_pa["City"].str.lower()
df_crash["Location"] = df_crash["MUNI_NAME"].str.lower()

# Split the lower-cased location on single spaces and keep three tokens:
#   last token   -> "Municipal Divisions" (e.g. borough, township, city)
#   first token  -> "First Municipality"
#   second token -> "Second Municipality" (missing when there is only one token)
df_crash["Municipal Divisions"] = df_crash["Location"].str.split(" ").str.get(-1)
df_crash["First Municipality"] = df_crash["Location"].str.split(" ").str.get(0)
df_crash["Second Municipality"] = df_crash["Location"].str.split(" ").str.get(1)

# Define a function to conditionally concatenate columns
def concatenate_municipalities(row):
    """Build the merge key from the first two tokens of a location.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            "First Municipality" and "Second Municipality" entries.

    Returns:
        The first token alone when the second token is missing (not a
        string) or contains "borough", "city" or "township"; otherwise
        the first and second tokens joined by a single space.
    """
    first = row["First Municipality"]
    second = row["Second Municipality"]

    # A location with a single token has no second token (pandas stores NaN
    # there). Substring tests against a non-string raise TypeError, so fall
    # back to the first token alone.
    if not isinstance(second, str):
        return first

    # NOTE(review): only the first two tokens are visible here, so a name
    # with three or more words before the division would be cut to two
    # words -- confirm against the municipality names being merged.
    if any(division in second for division in ("borough", "city", "township")):
        return first
    return first + " " + second

# Derive the "City" merge key row by row from the token columns
df_crash["City"] = df_crash.apply(concatenate_municipalities, axis="columns")

# The helper columns are no longer needed once "City" exists
df_crash = df_crash.drop(
    ["Location", "First Municipality", "Second Municipality"],
    axis="columns",
)

In [109]:
# Inner-join the crash rows to the BNA rows on the lower-cased "City" key;
# only keys present in both tables survive
df_pa_crash = pd.merge(df_crash, df_pa, on="City", how="inner")

# View dataframe
df_pa_crash

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,Municipal Divisions,City,BNA Score
0,"Philadelphia city, Philadelphia County",67301,42,101,60000,1596865,134.1,14172,940,54269,...,214.0,505.0,10.0,24.0,2655.0,83.0,227.0,city,philadelphia,57
1,"Pittsburgh city, Allegheny County",2301,42,3,61000,303207,55.4,1920,290,15005,...,75.0,54.0,1.0,8.0,332.0,16.0,43.0,city,pittsburgh,28
2,"State College borough, Centre County",14410,42,27,73808,40352,4.6,729,297,3963,...,3.0,12.0,0.0,3.0,42.0,0.0,3.0,borough,state college,50
3,"Erie city, Erie County",25302,42,49,24000,95536,19.1,329,191,2003,...,10.0,52.0,0.0,1.0,94.0,5.0,12.0,city,erie,41
4,"Carlisle borough, Cumberland County",21402,42,41,11272,19869,5.5,235,104,1052,...,3.0,12.0,0.0,2.0,25.0,1.0,3.0,borough,carlisle,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,"Bellwood borough, Blair County",7401,42,13,5384,1600,0.4,0,11,25,...,,,,,,,,borough,bellwood,31
77,"Verona borough, Allegheny County",2468,42,3,80032,2731,0.5,0,11,28,...,,0.0,0.0,0.0,1.0,0.0,0.0,borough,verona,26
78,"Birdsboro borough, Berks County",6404,42,11,6504,5119,1.3,0,16,101,...,,3.0,0.0,0.0,3.0,0.0,1.0,borough,birdsboro,59
79,"West Grove borough, Chester County",15415,42,29,83104,2782,0.7,0,11,50,...,,0.0,0.0,0.0,1.0,0.0,0.0,borough,west grove,15


In [112]:
# Preference order applied when several rows share one "City" key:
# entries earlier in the list win over later ones
municipal_division_hierarchy = ['city', 'borough', 'township']


def filter_by_hierarchy(group):
    """Keep only the rows of the highest-ranked division present in a group.

    Args:
        group: DataFrame with a 'Municipal Divisions' column.

    Returns:
        The rows of ``group`` whose division equals the first entry of
        ``municipal_division_hierarchy`` that occurs in the group, or
        ``group`` itself when none of the listed divisions occurs.
    """
    divisions = group['Municipal Divisions']
    for preferred in municipal_division_hierarchy:
        matches = divisions == preferred
        if matches.any():
            return group[matches]
    # None of the ranked divisions is present: hand the group back untouched
    return group

# For each "City" key keep only the preferred municipal division,
# then renumber the resulting rows from 0
filtered_df = (
    df_pa_crash
    .groupby('City', group_keys=False)
    .apply(filter_by_hierarchy)
    .reset_index(drop=True)
)

# Display the DataFrame
filtered_df.head(10)

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,Municipal Divisions,City,BNA Score
0,"Allentown city, Lehigh County",39301,42,77,2000,125250,17.6,79,51,2396,...,29.0,28.0,0.0,0.0,154.0,1.0,20.0,city,allentown,29
1,"Altoona city, Blair County",7301,42,13,2184,44114,9.9,4,6,761,...,5.0,19.0,0.0,2.0,33.0,0.0,6.0,city,altoona,36
2,"Athens borough, Bradford County",8402,42,15,3392,3268,1.8,0,11,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,borough,athens,23
3,"Beaver borough, Beaver County",4404,42,7,4688,4438,0.9,0,11,75,...,,0.0,0.0,0.0,1.0,1.0,0.0,borough,beaver,36
4,"Bellwood borough, Blair County",7401,42,13,5384,1600,0.4,0,11,25,...,,,,,,,,borough,bellwood,31
5,"Bethlehem city, Northampton County",48301,42,95,6088,55816,19.1,129,156,1194,...,7.0,26.0,0.0,1.0,59.0,3.0,7.0,city,bethlehem,27
6,"Bethlehem city, Lehigh County",39302,42,77,6088,19783,4.4,0,18,122,...,0.0,6.0,0.0,0.0,7.0,0.0,0.0,city,bethlehem,27
7,"Birdsboro borough, Berks County",6404,42,11,6504,5119,1.3,0,16,101,...,,3.0,0.0,0.0,3.0,0.0,1.0,borough,birdsboro,59
8,"Camp Hill borough, Cumberland County",21401,42,41,11000,8115,2.1,21,23,137,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,borough,camp hill,28
9,"Carlisle borough, Cumberland County",21402,42,41,11272,19869,5.5,235,104,1052,...,3.0,12.0,0.0,2.0,25.0,1.0,3.0,borough,carlisle,38


In [111]:
# Output location, relative to the current working directory
# (assumes a 'data' subdirectory exists there)
folder_path = 'data/'
file_name = 'df_pa_crash.csv'

# folder_path already ends with "/", so interpolating the two parts
# yields the full file path
full_file_path = f"{folder_path}{file_name}"

# Write the filtered table to CSV, keeping the row index as a column
filtered_df.to_csv(full_file_path, index=True)