In [1]:
import pandas as pd
import googlemaps
import numpy as np
from IPython.display import clear_output, display

In [2]:
# Read the raw records from disk into a DataFrame.
raw_data_path = 'raw_data.csv'
data = pd.read_csv(raw_data_path)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,location,address11,city1,zipcode1,id
0,,,DETROIT,,2
1,,,DETROIT,,8
2,,,DETROIT,,13
3,,,DETROIT,,14
4,,,BROWNSTOWN,,19


In [4]:
# Count how many records carry each location label.
location_labels = data['location']
location_labels.value_counts()

Hospital              438
Home                   46
Street                 31
Dwelling               16
Auto/Street            10
Vacant Dwelling        10
Field                   8
Park                    7
Parking lot             6
Hospice Facility        5
Motel                   4
Hotel                   4
Alley                   4
Auto                    3
Freeway                 3
Place of Business       3
Highway                 2
Marina                  2
Adult Foster Care       1
Body of Water           1
Abandoned Building      1
Wooded Area             1
Rehab. Center           1
Restaurant              1
Assisted Living         1
Apartment Building      1
Yard                    1
Unknown                 1
Name: location, dtype: int64

### Get the rows that are labeled as location "Hospital"

In [70]:
# Keep only the records whose location is "Hospital".
# .copy() makes this an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on pandas' view/copy rules.
rama_hosp = data.loc[data['location'] == 'Hospital'].copy()

In [72]:
# Non-null tally per column: city1 has 436 of 438 values, so two hospital rows
# lack a city and must be dropped or looked up by hand.
rama_hosp.notna().sum()

location     438
address11    438
city1        436
zipcode1       0
id           438
dtype: int64

In [74]:
# Frequency of each raw hospital-name spelling (85 distinct values).
hospital_names = rama_hosp['address11']
hospital_names.value_counts()

Beaumont Hospital                28
HENRY FORD HOSPITAL              25
SINAI  GRACE  HOSPITAL           24
HENRY  FORD  HOSPITAL            21
Detroit Receiving Hospital       21
                                 ..
DETROIT  RECEIVING   HOSPITAL     1
Beaumont  Hospital                1
Detroit Recieving Hopsital        1
Promedica Monroe Hospital         1
BEAUMONT - HERITAGE               1
Name: address11, Length: 85, dtype: int64

### Scrape a list of all hospitals from the Michigan Health and Hospital Association website

https://www.mha.org/Our-Hospitals

In [75]:
from bs4 import BeautifulSoup
import requests, re

In [14]:
# Download the Michigan Health & Hospital Association hospital directory page.
url = 'https://www.mha.org/Our-Hospitals'
# A timeout keeps the cell from hanging forever if the site does not respond.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page into an empty hospital list.
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

In [76]:
# Pattern for the rendered address element: ">street<br/>city, state zip</p>".
# The state group is exactly three characters, so it can include a trailing
# space; every captured group is stripped after matching.
ADDRESS_PATTERN = re.compile(r'>([ \w].+)<br/>(\w.+), (\w..)(\d.+)</p>')


def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' when the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else ''


divs = soup.find_all(class_='hospital-list')

data_list = []
unparsed_hospitals = []  # names whose address element did not match ADDRESS_PATTERN
for div in divs:
    # Read the tag text directly rather than slicing the tag's HTML string, which
    # breaks as soon as the tag carries attributes or escaped characters.
    hosp_name = _tag_text(div.find("h3"))
    hosp_type = _tag_text(div.find("strong"))
    hosp_address = str(div.find(class_="hospital-address"))

    # Parse hospital address with regex
    regex_result = ADDRESS_PATTERN.search(hosp_address)
    if regex_result is None:
        # A listing without a parseable address cannot supply a city or zip code,
        # so record it and move on instead of failing with an AttributeError.
        unparsed_hospitals.append(hosp_name)
        continue

    street, city, state, zip_code = (part.strip() for part in regex_result.groups())

    # Drop a single trailing comma from the street line, if present.
    clean_address = street[:-1] if street.endswith(',') else street

    data_list.append([hosp_name, hosp_type, clean_address, city, state, zip_code])

if unparsed_hospitals:
    print('Skipped {} hospital(s) with unparseable addresses: {}'.format(
        len(unparsed_hospitals), unparsed_hospitals))

# Create the Pandas DataFrame with the list of hospitals in Michigan
hosp_list = pd.DataFrame(data_list, columns = ['hosp_name', 'hosp_type', 'address1', 'city', 'state', 'zip_code'])
    

In [17]:
# Save the data to CSV so we can skip the scraping step if we come back to this analysis later.
# index=False keeps the default 0..n-1 index out of the file; otherwise reading the file
# back with pd.read_csv adds a spurious "Unnamed: 0" column.
hosp_list.to_csv('mi_hosp_list.csv', index=False)

In [9]:
# To skip the scrape on a later run, uncomment the next line to reload the saved hospital list.
# hosp_list = pd.read_csv('mi_hosp_list.csv')

In [77]:
# Preview the first five scraped hospitals.
hosp_list.head(n=5)

Unnamed: 0,hosp_name,hosp_type,address1,city,state,zip_code
0,Ascension Borgess Allegan Hospital,Community Hospital|Critical Access Hospital,555 Linn St,Allegan,MI,49010-1524
1,Ascension Borgess Hospital,Teaching Hospital|Community Hospital,1521 Gull Rd,Kalamazoo,MI,49048-1640
2,Ascension Borgess-Lee Hospital,Community Hospital|Critical Access Hospital,420 W High St,Dowagiac,MI,49047-1943
3,Ascension Borgess-Pipp Hospital,Long Term Acute Care Hospital|Community Hospital,411 Naomi St,Plainwell,MI,49080-1222
4,Ascension Brighton Center for Recovery,,12851 Grand River Rd,Brighton,MI,48116-8506


In [78]:
# Build one combined "name city" key so matching works on a single column
# rather than on hospital name and city separately.
hosp_list['name_and_city'] = hosp_list['hosp_name'].str.cat(hosp_list['city'], sep=' ')

In [79]:
# Preview the first five hospitals, now including the name_and_city key.
hosp_list.head(n=5)

Unnamed: 0,hosp_name,hosp_type,address1,city,state,zip_code,name_and_city
0,Ascension Borgess Allegan Hospital,Community Hospital|Critical Access Hospital,555 Linn St,Allegan,MI,49010-1524,Ascension Borgess Allegan Hospital Allegan
1,Ascension Borgess Hospital,Teaching Hospital|Community Hospital,1521 Gull Rd,Kalamazoo,MI,49048-1640,Ascension Borgess Hospital Kalamazoo
2,Ascension Borgess-Lee Hospital,Community Hospital|Critical Access Hospital,420 W High St,Dowagiac,MI,49047-1943,Ascension Borgess-Lee Hospital Dowagiac
3,Ascension Borgess-Pipp Hospital,Long Term Acute Care Hospital|Community Hospital,411 Naomi St,Plainwell,MI,49080-1222,Ascension Borgess-Pipp Hospital Plainwell
4,Ascension Brighton Center for Recovery,,12851 Grand River Rd,Brighton,MI,48116-8506,Ascension Brighton Center for Recovery Brighton


### Fuzzy match original list of hospitals with list from the Michigan Health and Hospital Association website

In [80]:
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz

# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

In [None]:
# Combine hospital name and city in Rama's data so we don't have to worry about merge matching
# on multiple columns.
# assign() returns a new frame instead of writing into a slice of `data`, which
# avoids SettingWithCopyWarning.
rama_hosp = rama_hosp.assign(name_and_city=rama_hosp['address11'] + ' ' + rama_hosp['city1'])

In [82]:
# Show the hospital rows, including the name_and_city column.
rama_hosp

Unnamed: 0,location,address11,city1,zipcode1,id,name_and_city
4704,Hospital,ST. MARY MERCY HOSPITAL,LIVONIA,,12443,ST. MARY MERCY HOSPITAL LIVONIA
4705,Hospital,Beaumont Hospital,Dearborn,,12444,Beaumont Hospital Dearborn
4706,Hospital,HENRY FORD HOSPITAL,DETROIT,,12450,HENRY FORD HOSPITAL DETROIT
4707,Hospital,HENRY FORD HOSPITAL,DETROIT,,12454,HENRY FORD HOSPITAL DETROIT
4708,Hospital,Beaumont Hospital,DEARBORN,,12456,Beaumont Hospital DEARBORN
...,...,...,...,...,...,...
5304,Hospital,BEAUMONT HOSPITAL,DEARBORN,,15530,BEAUMONT HOSPITAL DEARBORN
5310,Hospital,Henry Ford Hospital,Detroit,,15562,Henry Ford Hospital Detroit
5311,Hospital,ST. JOHN HOSPITAL,detroit,,15569,ST. JOHN HOSPITAL detroit
5314,Hospital,St. Mary Mercy,LIVONIA,,15594,St. Mary Mercy LIVONIA


In [83]:
# Drop the rows that have no city: without it the name+city key is missing.
# .copy() gives an independent frame so columns can be added to it later
# without SettingWithCopyWarning.
rama_hosp_no_na = rama_hosp.dropna(subset=['city1']).copy()

In [85]:
# 2 rows with missing city dropped - 438 rows down to 436
# Show the remaining hospital rows to confirm the result.
rama_hosp_no_na

Unnamed: 0,location,address11,city1,zipcode1,id,name_and_city
4704,Hospital,ST. MARY MERCY HOSPITAL,LIVONIA,,12443,ST. MARY MERCY HOSPITAL LIVONIA
4705,Hospital,Beaumont Hospital,Dearborn,,12444,Beaumont Hospital Dearborn
4706,Hospital,HENRY FORD HOSPITAL,DETROIT,,12450,HENRY FORD HOSPITAL DETROIT
4707,Hospital,HENRY FORD HOSPITAL,DETROIT,,12454,HENRY FORD HOSPITAL DETROIT
4708,Hospital,Beaumont Hospital,DEARBORN,,12456,Beaumont Hospital DEARBORN
...,...,...,...,...,...,...
5304,Hospital,BEAUMONT HOSPITAL,DEARBORN,,15530,BEAUMONT HOSPITAL DEARBORN
5310,Hospital,Henry Ford Hospital,Detroit,,15562,Henry Ford Hospital Detroit
5311,Hospital,ST. JOHN HOSPITAL,detroit,,15569,ST. JOHN HOSPITAL detroit
5314,Hospital,St. Mary Mercy,LIVONIA,,15594,St. Mary Mercy LIVONIA


In [None]:
# Match each name+city key from Rama's data against the name+city keys in hosp_list.
# token_set_ratio is the scorer, and only the top-ranked candidate is kept.

def get_token_ratio_name(address):
    """Return the best hosp_list match for one name+city string.

    Parameters
    ----------
    address : str
        Hospital name and city joined by a space, as stored in ``name_and_city``.

    Returns
    -------
    tuple
        ``(matched_name_and_city, score, hosp_list_index_label)`` for the
        highest-scoring candidate.

    Notes
    -----
    No minimum score is enforced, so a poor best match is still returned;
    check the score before trusting the match.
    """
    # extractOne scores every candidate but returns only the best one, which is
    # all that was ever used from the ranked list built by process.extract.
    return process.extractOne(address, hosp_list['name_and_city'], scorer=fuzz.token_set_ratio)

# Many rows share the same name+city string, so score each distinct string once
# and look the result up per row.
best_match_by_key = {
    key: get_token_ratio_name(key) for key in rama_hosp_no_na['name_and_city'].unique()
}
# assign() returns a new frame, avoiding SettingWithCopyWarning on the dropna() result.
rama_hosp_no_na = rama_hosp_no_na.assign(
    token_ratio=rama_hosp_no_na['name_and_city'].map(best_match_by_key.get)
)

In [None]:
# Each token_ratio tuple ends with the index label of the matched hosp_list row;
# use that label to pull the matched hospital's zip code.

def get_zip_code(item):
    """Return the zip_code of the hosp_list row referenced by a match tuple.

    Parameters
    ----------
    item : tuple
        ``(matched_name_and_city, score, hosp_list_index_label)`` as stored in
        the ``token_ratio`` column.

    Returns
    -------
    The ``zip_code`` value of the matched hosp_list row.
    """
    # item[2] is an index *label* taken from hosp_list['name_and_city'], so look
    # it up by label; positional .iloc is only correct while the index is 0..n-1.
    return hosp_list.at[item[2], 'zip_code']

# assign() returns a new frame, avoiding SettingWithCopyWarning on the dropna() result.
rama_hosp_no_na = rama_hosp_no_na.assign(
    zip_match=rama_hosp_no_na['token_ratio'].apply(get_zip_code)
)

In [134]:
# Spot-check the first 20 matched rows (best match tuple and resulting zip code).
rama_hosp_no_na.head(n=20)

Unnamed: 0,location,address11,city1,zipcode1,id,name_and_city,token_ratio,zip_match
4704,Hospital,ST. MARY MERCY HOSPITAL,LIVONIA,,12443,ST. MARY MERCY HOSPITAL LIVONIA,"(St. Mary Mercy Hospital Livonia, 100, 165)",48154-1988
4705,Hospital,Beaumont Hospital,Dearborn,,12444,Beaumont Hospital Dearborn,"(Beaumont Hospital, Dearborn Dearborn, 100, 23)",48124-4089
4706,Hospital,HENRY FORD HOSPITAL,DETROIT,,12450,HENRY FORD HOSPITAL DETROIT,"(Henry Ford Hospital Detroit, 100, 61)",48202-2608
4707,Hospital,HENRY FORD HOSPITAL,DETROIT,,12454,HENRY FORD HOSPITAL DETROIT,"(Henry Ford Hospital Detroit, 100, 61)",48202-2608
4708,Hospital,Beaumont Hospital,DEARBORN,,12456,Beaumont Hospital DEARBORN,"(Beaumont Hospital, Dearborn Dearborn, 100, 23)",48124-4089
4709,Hospital,HENRY FORD HOSPITAL,DETROIT,,12462,HENRY FORD HOSPITAL DETROIT,"(Henry Ford Hospital Detroit, 100, 61)",48202-2608
4710,Hospital,SINAI-GRACE HOSPITAL,DETROIT,,12463,SINAI-GRACE HOSPITAL DETROIT,"(DMC Sinai-Grace Hospital Detroit, 100, 48)",48235-2679
4711,Hospital,HENRY FORD HOSPITAL,DETROIT,,12468,HENRY FORD HOSPITAL DETROIT,"(Henry Ford Hospital Detroit, 100, 61)",48202-2608
4713,Hospital,BEAUMONT HOSPITAL,DEARBORN,,12471,BEAUMONT HOSPITAL DEARBORN,"(Beaumont Hospital, Dearborn Dearborn, 100, 23)",48124-4089
4714,Hospital,ST. JOHN HOSPITAL,DETROIT,,12472,ST. JOHN HOSPITAL DETROIT,"(Ascension St. John Hospital Detroit, 100, 12)",48236-2172


In [135]:
# Persist the matched rows; the index is written too, so the original row labels are kept.
matched_output_path = 'matched_file.csv'
rama_hosp_no_na.to_csv(matched_output_path)

### Fuzzywuzzy testing

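The cells below test the various fuzzywuzzy ratio methods to see which works best for this data. `partial_ratio` and `token_set_ratio` appear to work best: in these examples they separate the correct hospital from the other candidates by the widest margin. We use `token_set_ratio` because it tries to account for differences between the strings, such as extra or reordered words. Many of the hospital names are written differently in the two sources; for example, the official name often includes the name of the owning health system.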

https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings

In [124]:
# Plain ratio: score the query string against three candidate hospital names.
query = "Detroit Receiving Hospital DETROIT"
candidates = (
    'Select Specialty Hospital - NW Detroit Detroit',
    'DMC Heart Hospital Detroit',
    'DMC Detroit Receiving Hospital and University Health Center Detroit',
)
for candidate in candidates:
    print(fuzz.ratio(query, candidate))

42
50
55


In [125]:
# partial_ratio: score the query string against three candidate hospital names.
query = "Detroit Receiving Hospital DETROIT"
candidates = (
    'Select Specialty Hospital - NW Detroit Detroit',
    'DMC Heart Hospital Detroit',
    'DMC Detroit Receiving Hospital and University Health Center Detroit',
)
for candidate in candidates:
    print(fuzz.partial_ratio(query, candidate))

50
50
79


In [126]:
# token_sort_ratio: score the query string against three candidate hospital names.
query = "Detroit Receiving Hospital DETROIT"
candidates = (
    'Select Specialty Hospital - NW Detroit Detroit',
    'DMC Heart Hospital Detroit',
    'DMC Detroit Receiving Hospital and University Health Center Detroit',
)
for candidate in candidates:
    print(fuzz.token_sort_ratio(query, candidate))

74
70
67


In [128]:
# partial_token_sort_ratio: score the query string against three candidate hospital names.
query = "Detroit Receiving Hospital DETROIT"
candidates = (
    'Select Specialty Hospital - NW Detroit Detroit',
    'DMC Heart Hospital Detroit',
    'DMC Detroit Receiving Hospital and University Health Center Detroit',
)
for candidate in candidates:
    print(fuzz.partial_token_sort_ratio(query, candidate))

79
81
79


In [129]:
# token_set_ratio: score the query string against three candidate hospital names.
query = "Detroit Receiving Hospital DETROIT"
candidates = (
    'Select Specialty Hospital - NW Detroit Detroit',
    'DMC Heart Hospital Detroit',
    'DMC Detroit Receiving Hospital and University Health Center Detroit',
)
for candidate in candidates:
    print(fuzz.token_set_ratio(query, candidate))

76
76
100
