# Name: Margaret Nguyen

# Data Manipulation: Merging BNA data and Pennsylvania municipality data


Assignment: Merge the 51 Pennsylvania municipalities for which you obtained the PeopleForBikes BNA score with the data from the CSV file. They should all be present in the CSV, along with roughly 2,530 other municipalities. You will likely have to merge them using the name of the municipality, but be careful: some municipalities in Pennsylvania have very similar names, e.g., Lancaster Township and Lancaster City.

In [201]:
# Import packages
import pandas as pd
import re

In [202]:
# Load the two input tables: the 2017-2021 municipal crash data and the BNA scores.
# low_memory=False makes pandas read each file in one pass instead of in chunks.
crash_data = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/2017_TO_2021_MUNI_CRASH_DATA.csv",
    low_memory=False,
)
bna = pd.read_csv(
    "/Users/margaret06/Documents/GitHub/Carlisle_Borough_Transportation_Study/data/BNA_score.csv",
    low_memory=False,
)

In [203]:
# Clean datasets: remove the 'Unnamed: 0' column from both inputs
df_bna = bna.drop(columns=['Unnamed: 0'])
df_crash = crash_data.drop(columns=['Unnamed: 0'])

# Keep only the Pennsylvania rows of the BNA data, drop the Country and State
# columns (no longer needed after the filter), and renumber the rows from 0
df_pa = (
    df_bna.loc[df_bna["State"] == "PA"]
    .drop(columns=["Country", "State"])
    .reset_index(drop=True)
)

# View Pennsylvania municipalities data
df_pa.head(10)

Unnamed: 0,City,BNA Score
0,Allentown,29
1,Altoona,36
2,Ardmore,41
3,Athens,23
4,Beaver,36
5,Bellwood,31
6,Bethlehem,27
7,Birdsboro,59
8,Camp Hill,28
9,Carlisle,38


In [204]:
# Lower-case the names on both sides so the merge key is case-insensitive
df_pa["City"] = df_pa["City"].str.lower()
df_crash["Location"] = df_crash["MUNI_NAME"].str.lower()

# Split each crash location into space-separated words once, then pick out
# the pieces needed for merging
location_words = df_crash["Location"].str.split(' ')
df_crash["Municipal Divisions"] = location_words.str[-1]  # Last word: municipal division such as city, borough, or township
df_crash["First Municipality"] = location_words.str[0]    # First word of the name
df_crash["Second Municipality"] = location_words.str[1]   # Second word (NaN when the location has only one word)

# Define a function to conditionally concatenate columns
def concatenate_municipalities(row):
    """Build the municipality name used as the merge key for one row.

    Only the first two words of the location are considered. When the second
    word contains a municipal division ("borough", "city" or "township"), the
    name is just the first word; otherwise the name is the first two words
    joined by a space (e.g. "camp" + "hill" -> "camp hill").

    Args:
        row: A mapping (such as a DataFrame row) with the keys
            "First Municipality" and "Second Municipality".

    Returns:
        The first word alone, or the first and second words joined by a space.
        If the second word is missing (not a string), the first word is
        returned unchanged.
    """
    first = row["First Municipality"]
    second = row["Second Municipality"]

    # A one-word location has no second word; pandas stores NaN (a float) there,
    # and a substring test against a float would raise TypeError.
    if not isinstance(second, str):
        return first

    # The second word is the division itself, so it is not part of the name
    if any(division in second for division in ("borough", "city", "township")):
        return first

    return first + " " + second

# Build the "City" merge key row by row from the split name parts
df_crash["City"] = df_crash.apply(concatenate_municipalities, axis=1)

# The helper columns are not needed once "City" exists
df_crash = df_crash.drop(["Location", "First Municipality", "Second Municipality"], axis=1)

# Inner merge: keep only crash rows whose "City" also appears in the Pennsylvania BNA data
df_pa_crash = pd.merge(df_crash, df_pa, how="inner", on="City")

In [205]:
# Municipal divisions in order of preference (most preferred first)
municipal_division_hierarchy = ['city', 'borough', 'township']


# Filter one group of same-named rows down to its most preferred division
def filter_by_hierarchy(group):
    """Keep only the rows of `group` that belong to the most preferred division.

    The divisions in `municipal_division_hierarchy` are tried in order; the
    first one that occurs in the group's 'Municipal Divisions' column wins and
    only its rows are returned.

    Args:
        group: DataFrame with a 'Municipal Divisions' column.

    Returns:
        The rows of `group` for the winning division, or `group` itself when
        none of the preferred divisions occurs in it.
    """
    divisions = group['Municipal Divisions']
    preferred = next(
        (name for name in municipal_division_hierarchy if (divisions == name).any()),
        None,
    )
    if preferred is None:
        # None of the preferred divisions is present: leave the group untouched
        return group
    return group[divisions == preferred]

# For every "City", keep only the rows of its most preferred municipal division,
# then renumber the rows from 0
filtered_df = (
    df_pa_crash
    .groupby('City', group_keys=False)
    .apply(filter_by_hierarchy)
    .reset_index(drop=True)
)

# Display the DataFrame
filtered_df.head(10)

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,Municipal Divisions,City,BNA Score
0,"Allentown city, Lehigh County",39301,42,77,2000,125250,17.6,79,51,2396,...,29.0,28.0,0.0,0.0,154.0,1.0,20.0,city,allentown,29
1,"Altoona city, Blair County",7301,42,13,2184,44114,9.9,4,6,761,...,5.0,19.0,0.0,2.0,33.0,0.0,6.0,city,altoona,36
2,"Athens borough, Bradford County",8402,42,15,3392,3268,1.8,0,11,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,borough,athens,23
3,"Beaver borough, Beaver County",4404,42,7,4688,4438,0.9,0,11,75,...,,0.0,0.0,0.0,1.0,1.0,0.0,borough,beaver,36
4,"Bellwood borough, Blair County",7401,42,13,5384,1600,0.4,0,11,25,...,,,,,,,,borough,bellwood,31
5,"Bethlehem city, Northampton County",48301,42,95,6088,55816,19.1,129,156,1194,...,7.0,26.0,0.0,1.0,59.0,3.0,7.0,city,bethlehem,27
6,"Bethlehem city, Lehigh County",39302,42,77,6088,19783,4.4,0,18,122,...,0.0,6.0,0.0,0.0,7.0,0.0,0.0,city,bethlehem,27
7,"Birdsboro borough, Berks County",6404,42,11,6504,5119,1.3,0,16,101,...,,3.0,0.0,0.0,3.0,0.0,1.0,borough,birdsboro,59
8,"Camp Hill borough, Cumberland County",21401,42,41,11000,8115,2.1,21,23,137,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,borough,camp hill,28
9,"Carlisle borough, Cumberland County",21402,42,41,11272,19869,5.5,235,104,1052,...,3.0,12.0,0.0,2.0,25.0,1.0,3.0,borough,carlisle,38


In [206]:
# Collect every row whose "City" occurs more than once
# (keep=False marks all occurrences, not just the repeats)
duplicateRows = filtered_df.loc[filtered_df['City'].duplicated(keep=False)]

# Show the first few duplicated rows
duplicateRows.head(5)

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,Municipal Divisions,City,BNA Score
5,"Bethlehem city, Northampton County",48301,42,95,6088,55816,19.1,129,156,1194,...,7.0,26.0,0.0,1.0,59.0,3.0,7.0,city,bethlehem,27
6,"Bethlehem city, Lehigh County",39302,42,77,6088,19783,4.4,0,18,122,...,0.0,6.0,0.0,0.0,7.0,0.0,0.0,city,bethlehem,27
10,"Cranberry township, Venango County",60206,42,121,16944,6333,69.8,0,16,51,...,,,,,,,,township,cranberry,24
11,"Cranberry township, Butler County",10212,42,19,16920,32594,22.8,0,24,150,...,3.0,1.0,1.0,0.0,6.0,1.0,0.0,township,cranberry,24
27,"Peters township, Washington County",62222,42,125,59608,22728,19.6,0,21,44,...,0.0,1.0,0.0,0.0,4.0,1.0,1.0,township,peters,21


In [207]:
# Import packages
from selenium import webdriver # Imports the webdriver module from the Selenium library.
import time 
import random
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException # Handle situations where an element is not found on a web page.
from selenium.common.exceptions import StaleElementReferenceException # Handle the stale element exception by re-finding the element or taking appropriate action
from selenium.webdriver.common.keys import Keys # Be used for various keyboard interactions when automating web testing or interactions with web elements.

In [1]:
# Purpose: The PeopleForBikes (BNA) data does not include county information.
# Therefore, the population shown on the BNA page is scraped so it can be
# compared with the crash-data population to tell same-named municipalities apart.
# Scrape single page
def get_population():
    """Read the place name and total population from the page open in `driver`.

    Uses the module-level Selenium `driver`; the page with the place details
    must already be loaded when this is called.

    Returns:
        A one-element list ``[[place, population]]`` containing the stripped
        text of the place heading and of the population element.
    """
    # Absolute XPaths of the two elements on the place page
    place_xpath = '/html/body/div/ui-view/div[1]/div[1]/div/div[1]/h2'
    population_xpath = '/html/body/div/ui-view/div[1]/div[2]/ul/li[1]/span'

    # Place name first, then total population
    place = driver.find_element('xpath', place_xpath).text.strip()
    population = driver.find_element('xpath', population_xpath).text.strip()

    return [[place, population]]

In [209]:
# Webscraping to get the population for the accuracy in matching the data because People for Bike does not have county data

# The Service object is the supported way to point Selenium at a local chromedriver
# binary; passing the path positionally to webdriver.Chrome is deprecated.
from selenium.webdriver.chrome.service import Service

# Create the dataframe of cities that need to get the population information
df_city = pd.DataFrame(duplicateRows['City'].unique(), columns=['City'])

# Scraped [place, population] pairs, collected here and turned into a dataframe once at the end
population_rows = []

# Initialize the webdriver with a context manager; leaving the block quits the
# browser, so no separate driver.quit() call is needed afterwards
with webdriver.Chrome(service=Service("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver")) as driver:
    for city in df_city['City']:
        # Go to the website
        driver.get('https://bna.peopleforbikes.org/#/places////')

        # Input the place name to scrape the missing data
        place_input = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[1]/div/div/input')
        place_input.send_keys(f"{city}")
        place_input.send_keys(Keys.ENTER)

        # Wait between 2-3 seconds
        time.sleep(random.uniform(2, 3))

        # Filter the country (option[25] is presumably the United States - verify against the site)
        country_search = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[2]/div[1]/div/select/option[25]')
        country_search.click()

        # Wait between 2-3 seconds
        time.sleep(random.uniform(2, 3))

        # Filter the state (option[40] is presumably Pennsylvania - verify against the site)
        state_search = driver.find_element('xpath', '/html/body/div/ui-view/div[1]/div[1]/div[2]/div[2]/div/select/option[40]')
        state_search.click()

        # The first entry of the result list
        further_inf = driver.find_element('xpath', '//*[@id="scrollHeaders"]/section[1]/div[1]')

        # Double-click the entry to open its detailed information
        ActionChains(driver).double_click(further_inf).perform()

        # Wait between 2-3 seconds
        time.sleep(random.uniform(2, 3))

        # Scrape the data from the page that is now open
        population_rows.extend(get_population())

# Build the dataframe once (instead of concatenating inside the loop). Naming the
# columns explicitly keeps 'City' and 'Total Population' present even when no
# city had to be scraped, and the rows are numbered from 0.
df_population = pd.DataFrame(population_rows, columns=['City', 'Total Population'])

  with webdriver.Chrome("/Users/margaret06/Downloads/chromedriver-mac-arm64/chromedriver") as driver:


In [210]:
# Work on the duplicated rows as their own dataframe
df_duplicated = pd.DataFrame(duplicateRows)

# Lower-case the scraped place names so they match the lower-cased "City" key
df_population['City'] = df_population['City'].str.lower()

# Attach the scraped population to every candidate row of the same city
df_merged = pd.merge(df_duplicated, df_population, how="inner", on="City")

# Make both population columns nullable integers; values that cannot be parsed become <NA>
df_merged['POPULATION'] = pd.to_numeric(df_merged['POPULATION'], errors='coerce').astype('Int64')
df_merged['Total Population'] = pd.to_numeric(df_merged['Total Population'], errors='coerce').astype('Int64')

# Distance between the crash-data population and the scraped population
df_merged['Population_Difference'] = (df_merged['POPULATION'] - df_merged['Total Population']).abs()

# Row label of the closest candidate for each "City"
min_diff_indices = df_merged.groupby('City')['Population_Difference'].idxmin()

# Keep only those closest candidates and drop the helper column again
df_merged = df_merged.loc[min_diff_indices].drop(columns=['Population_Difference'])

In [219]:
# Assemble the final dataframe:
#   1. take the cities that appear exactly once (keep=False removes every duplicated city),
#   2. add back the single chosen row for each duplicated city,
#   3. drop the helper columns, sort by city and renumber the rows from 0
df_pa_crash = (
    pd.concat([filtered_df.drop_duplicates(subset=['City'], keep=False), df_merged])
    .drop(columns=['Total Population', 'Municipal Divisions'])
    .sort_values(by=['City'])
    .reset_index(drop=True)
)

# Show data
df_pa_crash.head(10)

Unnamed: 0,NAME,PENN_DOT_MUNI_ID,state,county,county_subdivision,POPULATION,LAND_AREA,BIKE_TO_WORK_EST,BIKE_TO_WORK_MARG,WALK_TO_WORK_EST,...,PED_DEATH_BY_AUTO_COUNT,PED_SUSP_SERIOUS_INJ_BY_AUTO_COUNT,BICYCLE_SOLO_COUNT,BICYCLE_DEATH_SOLO_COUNT,BICYCLE_SUSP_SERIOUS_INJ_SOLO_COUNT,PED_SOLO_COUNT,PED_DEATH_SOLO_COUNT,PED_SUSP_SERIOUS_INJ_SOLO_COUNT,City,BNA Score
0,"Allentown city, Lehigh County",39301,42,77,2000,125250,17.6,79,51,2396,...,2.0,29.0,28.0,0.0,0.0,154.0,1.0,20.0,allentown,29
1,"Altoona city, Blair County",7301,42,13,2184,44114,9.9,4,6,761,...,1.0,5.0,19.0,0.0,2.0,33.0,0.0,6.0,altoona,36
2,"Athens borough, Bradford County",8402,42,15,3392,3268,1.8,0,11,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,athens,23
3,"Beaver borough, Beaver County",4404,42,7,4688,4438,0.9,0,11,75,...,,,0.0,0.0,0.0,1.0,1.0,0.0,beaver,36
4,"Bellwood borough, Blair County",7401,42,13,5384,1600,0.4,0,11,25,...,,,,,,,,,bellwood,31
5,"Bethlehem city, Northampton County",48301,42,95,6088,55816,19.1,129,156,1194,...,0.0,7.0,26.0,0.0,1.0,59.0,3.0,7.0,bethlehem,27
6,"Birdsboro borough, Berks County",6404,42,11,6504,5119,1.3,0,16,101,...,,,3.0,0.0,0.0,3.0,0.0,1.0,birdsboro,59
7,"Camp Hill borough, Cumberland County",21401,42,41,11000,8115,2.1,21,23,137,...,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,camp hill,28
8,"Carlisle borough, Cumberland County",21402,42,41,11272,19869,5.5,235,104,1052,...,0.0,3.0,12.0,0.0,2.0,25.0,1.0,3.0,carlisle,38
9,"Cranberry township, Butler County",10212,42,19,16920,32594,22.8,0,24,150,...,0.0,3.0,1.0,1.0,0.0,6.0,1.0,0.0,cranberry,24


In [220]:
# Output location: a 'data' subdirectory of the current working directory
folder_path = 'data/'
file_name = 'df_pa_crash.csv'

# Full path = folder followed directly by the file name
full_file_path = f"{folder_path}{file_name}"

# Write the merged dataframe to CSV; index=True also writes the row index as the first column
df_pa_crash.to_csv(full_file_path, index=True)