# Name: Margaret Nguyen

# Data Manipulation: Merging BNA Data and Massachusetts Crash Data (including ACS Data)

**Assignment: Merge the data for the municipalities in Massachusetts, including the BNA score from the PeopleForBikes dataset, with the Massachusetts Crash Data sourced from [this gzip-compressed CSV file](https://github.com/margaret613/Carlisle_Borough_Transportation_Study/blob/main/data/df_mass_acs.csv.gz).**

In [78]:
# Import packages
import pandas as pd
import re

In [79]:
# Load the two input tables from the local data folder.
# The municipality crash data covers 2017 to 2021.
# low_memory=False has pandas parse each file in a single pass instead of
# in chunks, which avoids mixed-dtype guesses within a column.
bna_csv_path = "./data/BNA_score.csv"
crash_csv_path = "./data/df_mass_acs.csv"

bna = pd.read_csv(bna_csv_path, low_memory=False)
crash_data = pd.read_csv(crash_csv_path, low_memory=False)

In [80]:
# Remove the 'Unnamed: 0' column from both tables
# (presumably a row index saved by an earlier to_csv call -- confirm).
df_bna = bna.drop(columns=["Unnamed: 0"])
mass_crash = crash_data.drop(columns=["Unnamed: 0"])

# Restrict the BNA table to Massachusetts rows, discard the Country and
# State columns (constant after the filter), and renumber the rows from 0.
df_ma = (
    df_bna.loc[df_bna["State"] == "MA"]
    .drop(columns=["Country", "State"])
    .reset_index(drop=True)
)

# Report (rows, columns) of the Massachusetts BNA table
print(df_ma.shape)

# Preview the first rows of the crash data
mass_crash.head()

(46, 2)


Unnamed: 0,OBJECTID,CRASH_NUMB,CITY_TOWN_NAME,CRASH_DATE_TEXT,CRASH_TIME,CRASH_DATETIME,CRASH_HOUR,CRASH_STATUS,CRASH_SEVERITY_DESCR,MAX_INJR_SVRTY_CL,...,DRIVE_SOLO_TO_WORK_MARG,CARPOOL_TO_WORK_EST,CARPOOL_TO_WORK_MARG,PUBTRANS_TO_WORK_EST,PUBTRANS_TO_WORK_MARG,state,county,county_subdivision,MUNI_NAME,COUNTY_NAME
0,118,4323882,BOSTON,01/18/2017,6:38 PM,2017/01/18 18:37:59+00,06:00PM to 06:59PM,Closed,Property damage only (none injured),No injury,...,2632,19973,1210,102376,2474,25,25,7000,Boston city,Suffolk County
1,1583,4323882,BOSTON,01/18/2017,6:38 PM,2017/01/18 18:37:59+00,06:00PM to 06:59PM,Closed,Property damage only (none injured),No injury,...,2632,19973,1210,102376,2474,25,25,7000,Boston city,Suffolk County
2,2196,4313687,BOSTON,01/03/2017,5:32 PM,2017/01/03 17:32:00+00,05:00PM to 05:59PM,Closed,Non-fatal injury,Non-fatal injury - Non-incapacitating,...,2632,19973,1210,102376,2474,25,25,7000,Boston city,Suffolk County
3,2197,4313687,BOSTON,01/03/2017,5:32 PM,2017/01/03 17:32:00+00,05:00PM to 05:59PM,Closed,Non-fatal injury,Non-fatal injury - Non-incapacitating,...,2632,19973,1210,102376,2474,25,25,7000,Boston city,Suffolk County
4,5156,4322841,BOSTON,01/10/2017,7:48 AM,2017/01/10 07:47:59+00,07:00AM to 07:59AM,Closed,Non-fatal injury,Non-fatal injury - Incapacitating,...,2632,19973,1210,102376,2474,25,25,7000,Boston city,Suffolk County


## Merge BNA Data and Massachusetts Crash Data (including ACS Data)

In [81]:
# Build a lower-case join key named 'City' in both tables.
# The crash data keeps its municipality names in CITY_TOWN_NAME.
mass_crash['City'] = mass_crash['CITY_TOWN_NAME'].str.lower()
df_ma['City'] = df_ma['City'].str.lower()

# Align the BNA names that carry a "town" suffix with the crash-data names
df_ma['City'] = df_ma['City'].replace({
    'barnstable town': 'barnstable',
    'easthampton town': 'easthampton',
})

# A BNA name that occurs more than once would make the merge repeat every
# matching crash row, so surface such names before joining.
duplicate_cities = (
    df_ma.loc[df_ma['City'].duplicated(keep=False), 'City'].unique().tolist()
)
if duplicate_cities:
    print(f"BNA municipalities listed more than once: {duplicate_cities}")

# Keep only crashes in municipalities that have a BNA row
df_merged = mass_crash.merge(df_ma, how="inner", on="City")

# The inner join silently drops BNA municipalities whose name has no match
# in the crash data; list them so naming mismatches can be fixed above.
unmatched_cities = df_ma.loc[
    ~df_ma['City'].isin(df_merged['City']), 'City'
].tolist()
if unmatched_cities:
    print(f"BNA municipalities with no crash-data match: {unmatched_cities}")

# Show the municipalities present after the merge
df_merged['CITY_TOWN_NAME'].unique()

array(['BOSTON', 'BROOKLINE', 'REVERE', 'BRAINTREE', 'BARNSTABLE',
       'WALTHAM', 'MALDEN', 'NORWOOD', 'FALL RIVER', 'SPRINGFIELD',
       'CAMBRIDGE', 'GREENFIELD', 'WOBURN', 'WESTFIELD', 'SOMERVILLE',
       'AMHERST', 'ACTON', 'WORCESTER', 'LEXINGTON', 'NEEDHAM',
       'CHELMSFORD', 'FOXBOROUGH', 'NORTHAMPTON', 'MELROSE', 'MILLBURY',
       'LOWELL', 'SALEM', 'NEWBURYPORT', 'NATICK', 'PITTSFIELD',
       'BILLERICA', 'FRAMINGHAM', 'NANTUCKET', 'FALMOUTH', 'PROVINCETOWN',
       'WAKEFIELD', 'DEDHAM', 'LAKEVILLE', 'NEWTON', 'AYER', 'HOLBROOK',
       'WESTBOROUGH', 'BEDFORD', 'EASTHAMPTON', 'WAYLAND'], dtype=object)

In [82]:
# Destination of the merged table: a file inside the 'data' subdirectory,
# resolved relative to the current working directory.
folder_path = 'data/'
file_name = 'df_mass_bna.csv'
full_file_path = f"{folder_path}{file_name}"

# Write the merged dataframe as CSV; index=True also writes the row index
# as the first column of the file.
df_merged.to_csv(full_file_path, index=True)

## Compress the CSV file before uploading it to GitHub

In [83]:
import gzip
import shutil

# Plain CSV to read, and the gzip archive to create from it
csv_file_path = 'data/df_mass_bna.csv'
compressed_file_path = 'data/df_mass_bna.csv.gz'

# Stream the CSV bytes into the gzip writer; both handles are closed when
# the with-block exits, even if the copy fails part-way.
with open(csv_file_path, 'rb') as plain_csv, gzip.open(compressed_file_path, 'wb') as gz_out:
    shutil.copyfileobj(plain_csv, gz_out)

print(f'File compressed to: {compressed_file_path}')

File compressed to: data/df_mass_bna.csv.gz
