# Name: Margaret Nguyen

# Data Manipulation: Merging BNA data and Pennsylvania municipality data


Assignment: Merge the 51 Pennsylvania municipalities for which you obtained the PeopleForBikes BNA score with the data from the CSV file. They should all be present in the CSV, along with some 2,530-odd other municipalities. You will likely have to merge them using the name of the municipality, but be careful, as there are municipalities in Pennsylvania that have very similar names, e.g., Lancaster Township and Lancaster City.

In [42]:
# Import packages
import pandas as pd
import re

In [43]:
# Load the two raw inputs: municipal crash data and the BNA scores.
# low_memory=False makes pandas read each file in one pass instead of in chunks.
crash_data = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/2017_TO_2021_MUNI_CRASH_DATA.csv",
    low_memory=False,
)
bna = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/BNA_score.csv",
    low_memory=False,
)

In [44]:
# Remove the leftover index column that both CSV files carry
df_bna = bna.drop("Unnamed: 0", axis=1)
df_crash = crash_data.drop("Unnamed: 0", axis=1)

# Build the Pennsylvania-only BNA table in one pass:
# filter to State == "PA", drop the Country column, then renumber the rows from 0
df_pa = (
    df_bna.loc[df_bna["State"] == "PA"]
    .drop(columns=["Country"])
    .reset_index(drop=True)
)

# Preview the first ten Pennsylvania rows
df_pa.head(10)

Unnamed: 0,City,State,BNA Score
0,Allentown,PA,29
1,Altoona,PA,36
2,Ardmore,PA,41
3,Athens,PA,23
4,Beaver,PA,36
5,Bellwood,PA,31
6,Bethlehem,PA,27
7,Birdsboro,PA,59
8,Camp Hill,PA,28
9,Carlisle,PA,38


In [45]:
# Display the Pennsylvania subset of the BNA data
df_pa

Unnamed: 0,City,State,BNA Score
0,Allentown,PA,29
1,Altoona,PA,36
2,Ardmore,PA,41
3,Athens,PA,23
4,Beaver,PA,36
5,Bellwood,PA,31
6,Bethlehem,PA,27
7,Birdsboro,PA,59
8,Camp Hill,PA,28
9,Carlisle,PA,38


In [47]:
# Lower-case the names in both dataframes so they compare equal when merging
df_pa["City"] = df_pa["City"].str.lower()
df_crash["Location"] = df_crash["MUNI_NAME"].str.lower()

# Split each location on single spaces once and reuse the token lists below
location_tokens = df_crash["Location"].str.split(' ')

# Last token: the municipal division, such as borough, township, or city
df_crash["Municipal Divisions"] = location_tokens.str[-1]

# First and second tokens: candidate parts of the municipality name itself
df_crash["First Municipality"] = location_tokens.str[0]
df_crash["Second Municipality"] = location_tokens.str[1]

# Define a function to conditionally concatenate columns
def concatenate_municipalities(row):
    """Build the municipality name used as the merge key for one crash row.

    Joins the first two tokens of the location unless the second token is a
    municipal division word, in which case only the first token is the name.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the
            keys "First Municipality" and "Second Municipality".

    Returns:
        "<first> <second>" when the second token is text that contains none
        of "borough", "city" or "township"; otherwise the first token alone.
        When the second token is missing, the first token is returned as-is.
    """
    first = row["First Municipality"]
    second = row["Second Municipality"]

    # A location with fewer than two tokens has no second token (NaN/None).
    # A substring test against a non-string raises TypeError, so fall back
    # to the first token instead of crashing the whole apply().
    if not isinstance(second, str):
        return first

    # Substring containment (not equality) is kept from the original logic.
    if any(division in second for division in ("borough", "city", "township")):
        return first
    return first + " " + second

# Compute the merge key row by row and store it in a new "City" column
df_crash["City"] = df_crash.apply(concatenate_municipalities, axis="columns")

# The intermediate name-splitting columns are no longer needed once "City" exists
df_crash = df_crash.drop(
    labels=["Location", "First Municipality", "Second Municipality"],
    axis="columns",
)

In [49]:
# Display the crash dataframe, now carrying the "Municipal Divisions" and "City" columns
df_crash

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_DEATH_BY_AUTO_COUNT,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,Municipal Divisions,City
0,"Philadelphia city, Philadelphia County",67301,42,101,60000,1596865,134.1,14172,940,54269,...,82.0,214.0,505.0,10.0,24.0,2655.0,83.0,227.0,city,philadelphia
1,"Pittsburgh city, Allegheny County",2301,42,3,61000,303207,55.4,1920,290,15005,...,6.0,75.0,54.0,1.0,8.0,332.0,16.0,43.0,city,pittsburgh
2,"State College borough, Centre County",14410,42,27,73808,40352,4.6,729,297,3963,...,0.0,3.0,12.0,0.0,3.0,42.0,0.0,3.0,borough,state college
3,"Erie city, Erie County",25302,42,49,24000,95536,19.1,329,191,2003,...,3.0,10.0,52.0,0.0,1.0,94.0,5.0,12.0,city,erie
4,"Ferguson township, Centre County",14206,42,27,25624,19236,47.7,259,116,335,...,0.0,0.0,3.0,1.0,0.0,3.0,0.0,1.0,township,ferguson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,"Pine township, Crawford County",20215,42,39,60312,429,6.5,0,11,2,...,,,,,,,,,township,pine
2565,"Richmond township, Crawford County",20217,42,39,64600,1359,36.7,0,11,0,...,,,,,,,,,township,richmond
2566,"Rockdale township, Crawford County",20218,42,39,65440,1258,36.0,0,11,11,...,,,,,,,,,township,rockdale
2567,"Rome township, Crawford County",20219,42,39,65960,1879,41.3,0,11,37,...,,,,,,,,,township,rome


In [None]:
# Merge dataframes
# Inner join on the lower-cased "City" key: only crash rows whose derived name
# matches a BNA city are kept.
# NOTE(review): the key is the bare name with the municipal division removed,
# so a same-named city and township (e.g. Lancaster) would both match one BNA
# row. Confirm whether "Municipal Divisions" should be used to filter these.
df_pa_crash = df_crash.merge(df_pa, how="inner", on="City")

# View dataframe
df_pa_crash.head(10)

In [None]:
# Split each "City" value on newline characters and store the resulting lists in "NAME".
# NOTE(review): this replaces the existing "NAME" values and runs on the already
# merged dataframe rather than before the merge. Confirm this is the intended step.
df_pa_crash["NAME"] = df_pa_crash["City"].str.split('\n')

In [6]:
# Destination: a 'data' subdirectory relative to the current working directory
folder_path = 'data/'
file_name = 'df_pa_crash.csv'

# Join the directory and the file name into the path that will be written
full_file_path = f"{folder_path}{file_name}"

# Write the merged dataframe to CSV, keeping the index as the first column
df_pa_crash.to_csv(path_or_buf=full_file_path, index=True)