## Rats in the Restaurant - Data Cleaning and Transformation
#### County of Los Angeles Open Data Source
* Dataset #1: https://data.lacounty.gov/Health/LOS-ANGELES-COUNTY-RESTAURANT-AND-MARKET-INSPECTIO/6ni6-h5kp
* Dataset #2: https://data.lacounty.gov/Health/LOS-ANGELES-COUNTY-RESTAURANT-AND-MARKET-VIOLATION/8jyd-4pv9
* Dataset #3: https://data.lacounty.gov/Health/Los-Angeles-County-City-and-Community-Health-Profi/capb-kusk

In [1]:
# Import dependencies
import pandas as pd
import re
import datetime as dt

In [2]:
# Get datasets
inspections_dataset = pd.read_csv("./Resources/LOS_ANGELES_COUNTY_RESTAURANT_AND_MARKET_INSPECTIONS.csv")
violations_dataset = pd.read_csv("./Resources/LOS_ANGELES_COUNTY_RESTAURANT_AND_MARKET_VIOLATIONS.csv")
communityhealth_dataset = pd.read_csv("./Resources/Los_Angeles_County_City_and_Community_Health_Profiles_2018.csv")

In [3]:
# Expand output display to view all
pd.set_option('display.max_columns', 999)

In [4]:
# Create Inspections DF
inspections_df = pd.DataFrame(inspections_dataset)
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location
0,09/10/2018,OW0105348,"GUCKENHEIMER SERVICES, LLC.",FA0242046,SERVERY- NICKELODEON,PR0190194,SERVERY- NICKELODEON,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,203 W OLIVE AVE # C,BURBANK,CA,91502,1,ROUTINE INSPECTION,96,A,DARRFUZBW,EE0000495,POINT (-118.314661 34.175253)
1,07/19/2018,OW0246461,ANASTACIOS POLITIS,FA0252769,TOMS JR BURGERS,PR0202127,TOMS JR BURGERS,ACTIVE,1632,RESTAURANT (0-30) SEATS HIGH RISK,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,1,ROUTINE INSPECTION,98,A,DA0XQVMTN,EE0001130,POINT (-118.292543 34.010859)
2,08/15/2018,OW0010130,DJ BIBINGKAHAN CORPORATION,FA0011237,DJ BIBINGKAHAN,PR0035416,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,1515 E AMAR RD,WEST COVINA,CA,91792,1,ROUTINE INSPECTION,98,A,DAMPOJNY8,EE0000500,POINT (-117.913926 34.030964)
3,07/16/2018,OW0020051,KULWINDER KAUR,FA0061073,DOROSE LIQUOR,PR0027907,DOROSE LIQUOR,ACTIVE,1610,"FOOD MKT RETAIL (1-1,999 SF) LOW RISK",13560 ROSCOE BLVD,PANORAMA CITY,CA,91402,1,ROUTINE INSPECTION,91,A,DAUTU3DPD,EE0000045,POINT (-118.428399 34.221664)
4,09/07/2018,OW0246329,JUAN C OROZCO,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,PR0201914,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,1641,RESTAURANT (151 + ) SEATS HIGH RISK,29002 AGOURA RD,AGOURA HILLS,CA,91301,1,ROUTINE INSPECTION,90,A,DAUEU4NGF,EE0000526,POINT (-118.756808 34.143452)


In [5]:
# Create Violations DF
violations_df = pd.DataFrame(violations_dataset)
violations_df.head()

Unnamed: 0,SERIAL NUMBER,VIOLATION STATUS,VIOLATION CODE,VIOLATION DESCRIPTION,POINTS
0,DA000211Z,OUT OF COMPLIANCE,F006,# 06. Adequate handwashing facilities supplied...,2.0
1,DA000211Z,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly bui...",1.0
2,DA000211Z,OUT OF COMPLIANCE,F014,# 14. Food contact surfaces: clean and sanitized,2.0
3,DA000211Z,OUT OF COMPLIANCE,F029,"# 29. Toxic substances properly identified, st...",1.0
4,DA000211Z,OUT OF COMPLIANCE,F035,# 35. Equipment/Utensils - approved; installed...,1.0


In [6]:
# Create Community Health DF
community_health_df = pd.DataFrame(communityhealth_dataset)
community_health_df.head()

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,LE,Prop_prsc,Prop_3rdg,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,No_libr,Prop_empl,Prop_emsu,Prop_vote,Rte_resp,Prop_PA,Prop_groc,Prop_60mi,No_farm,No_EBT,Prop_foin,No_CalF,Rte_CalF,Prop_FRPM,Prop_obse,Prop_DM,Rte_coin,Rte_brin,Prop_smok,Rte_luca,Rte_COPD,Rte_CVD,Prop_hbu1,Prop_hbu2,Prop_ownr,Prop_rentr,No_hless,Prop_asth,Rte_crim,Rte_hom,Rte_alco,Propt_envi,No_gasw,Rte_te17,Rte_te19,Prop_LBW,Rte_IMR,Prop_1stt,Prop_depr,Rte_suic,Rte_UOD,Rte_syin,Rte_goin,Rte_hiv,Prop_uinC,Prop_uinA,Prop_duinC,Prop_duinA,Propt_HPI,Rte_mein,Rte_cein,Rte_luin,Prop_fru,Prop_bev,Prop_hyp,Prop_marj,Prop_HI,Prop_grad,Prop_trua
0,Alhambra,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,84.55,0.665,0.5353,0.1934,0.228,0.2497,0.3289,53582,1.0,0.94,0.385,0.6235,0.7706,0.2731,0.5239,0.111,1,1,**,15799,0.3300,0.6598,0.1358,0.104,45.7963,125.9019,0.1272,25.4872,21.7535,170.914,0.447,0.232,0.404,0.596,64,0.0449,168,**,12,6,2,2.1042,9.1180,0.0815,3.1022,0.8614,0.0523,7,3,14,66,208,0.0598,0.207,0.0929,0.2291,43.3,**,**,34.6,0.2,0.2,0.2,0.1,0.0,1.0,0.1
1,Altadena,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.002,0.0988,0.2281,0.2021,0.0379,82.11,0.74,0.3942,0.105,0.1397,0.2918,0.4635,86050,2.0,0.92,0.7335,0.736,1.0975,0.3482,0.4524,0.12,1,1,**,-,-,0.559,0.2441,0.1358,38.367,189.2143,0.1129,25.2189,28.4887,211.035,0.401,0.193,0.719,0.281,58,0.0915,162,**,6,62,0,3.9610,7.6173,0.0703,**,0.8859,0.1099,10,8,19,119,330,0.0333,0.1745,0.066,0.0924,75.5,33.8,**,35.3,0.1,0.3,0.3,0.2,0.1,0.9,0.4
2,Arcadia,56992,0.1794,0.6389,0.1817,0.0115,0.125,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,85.42,0.675,0.7927,0.0789,0.1601,0.243,0.518,79934,1.0,0.941,0.5691,0.6271,1.9654,0.2662,0.511,0.131,0,0,**,6638,0.1800,0.2066,0.057,0.0819,32.9547,145.2497,0.1285,22.5416,22.058,173.371,0.402,0.214,0.596,0.404,12,**,146,**,19,27,2,**,**,0.059,**,0.8588,0.0493,9,**,8,46,56,0.0564,0.1245,0.1232,0.1876,73.4,22.0,**,29.5,0.1,0.2,0.2,0.0,0.0,1.0,0.1
3,Azusa,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,81.16,0.439,0.339,0.2232,0.2932,0.2873,0.1962,53135,1.0,0.927,0.6201,0.6569,1.0168,0.3759,0.5997,0.121,0,0,0.2006,8868,0.5800,0.806,0.2607,0.1198,31.3309,134.9485,0.1174,34.8652,33.1891,218.754,0.474,0.24,0.527,0.473,55,0.0660,354,6,14,48,0,3.5084,9.2802,0.0681,**,0.881,0.0895,8,**,27,116,196,0.0708,0.2404,0.1117,0.1918,34.2,**,**,29.7,0.1,0.4,0.2,0.1,0.0,0.9,0.0
4,Baldwin Park,74438,0.2623,0.633,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,82.99,0.47,0.3227,0.374,0.317,0.1962,0.1128,51742,1.0,0.898,0.5712,0.6318,0.3406,0.3164,0.5444,0.125,1,1,0.2531,14206,0.7400,0.8729,0.2652,0.1307,43.8584,110.0378,0.1045,28.2446,31.2708,195.522,0.498,0.227,0.572,0.428,111,0.0904,391,5,10,9,0,7.3997,20.1220,0.0856,4.1881,0.8827,0.0544,**,5,21,98,171,0.0582,0.3259,0.0704,0.1874,22.3,**,**,26.5,0.1,0.4,0.3,0.1,0.0,1.0,0.4


### INSPECTIONS_DF DATA CLEANUP:
#### Get data from  'PE DESCRIPTION' column
* Type
* No. of seats
* Risk

In [7]:
#Remove all 'PE DESCRIPTION' that are not restaurants
inspections_df = inspections_df.loc[inspections_df['PE DESCRIPTION'].str.contains('RESTAURANT')]
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location
0,09/10/2018,OW0105348,"GUCKENHEIMER SERVICES, LLC.",FA0242046,SERVERY- NICKELODEON,PR0190194,SERVERY- NICKELODEON,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,203 W OLIVE AVE # C,BURBANK,CA,91502,1,ROUTINE INSPECTION,96,A,DARRFUZBW,EE0000495,POINT (-118.314661 34.175253)
1,07/19/2018,OW0246461,ANASTACIOS POLITIS,FA0252769,TOMS JR BURGERS,PR0202127,TOMS JR BURGERS,ACTIVE,1632,RESTAURANT (0-30) SEATS HIGH RISK,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,1,ROUTINE INSPECTION,98,A,DA0XQVMTN,EE0001130,POINT (-118.292543 34.010859)
2,08/15/2018,OW0010130,DJ BIBINGKAHAN CORPORATION,FA0011237,DJ BIBINGKAHAN,PR0035416,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,1515 E AMAR RD,WEST COVINA,CA,91792,1,ROUTINE INSPECTION,98,A,DAMPOJNY8,EE0000500,POINT (-117.913926 34.030964)
4,09/07/2018,OW0246329,JUAN C OROZCO,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,PR0201914,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,1641,RESTAURANT (151 + ) SEATS HIGH RISK,29002 AGOURA RD,AGOURA HILLS,CA,91301,1,ROUTINE INSPECTION,90,A,DAUEU4NGF,EE0000526,POINT (-118.756808 34.143452)
5,09/18/2018,OW0123199,REDONDO ENTERPRISES LLC,FA0158101,MCDONALD'S #10681,PR0146191,MCDONALD'S #10681,ACTIVE,1637,RESTAURANT (61-150) SEATS MODERATE RISK,5725 FLORENCE AVE,BELL GARDENS,CA,90201,1,ROUTINE INSPECTION,91,A,DARQIUA45,EE0000437,POINT (-118.163665 33.967791)


In [8]:
# Get 'type', 'seats', and 'risk' from 'PE DESCRIPTION' and add them as columns to the Inspections DF
def find_est(str):
    est=re.compile('.+(?=\()')
    return est.search(str).group(0)

def find_size(str):
    size=re.compile('(?<=\().+(?=\))')
    return size.search(str).group(0)

def find_risk(str):
    return (' ').join(str.split(' ')[-2:])

inspections_df['TYPE'] = inspections_df['PE DESCRIPTION'].apply(find_est)
inspections_df['SEATS'] = inspections_df['PE DESCRIPTION'].apply(find_size)
inspections_df['RISK'] = inspections_df['PE DESCRIPTION'].apply(find_risk)

In [9]:
# Display DF
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location,TYPE,SEATS,RISK
0,09/10/2018,OW0105348,"GUCKENHEIMER SERVICES, LLC.",FA0242046,SERVERY- NICKELODEON,PR0190194,SERVERY- NICKELODEON,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,203 W OLIVE AVE # C,BURBANK,CA,91502,1,ROUTINE INSPECTION,96,A,DARRFUZBW,EE0000495,POINT (-118.314661 34.175253),RESTAURANT,31-60,HIGH RISK
1,07/19/2018,OW0246461,ANASTACIOS POLITIS,FA0252769,TOMS JR BURGERS,PR0202127,TOMS JR BURGERS,ACTIVE,1632,RESTAURANT (0-30) SEATS HIGH RISK,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,1,ROUTINE INSPECTION,98,A,DA0XQVMTN,EE0001130,POINT (-118.292543 34.010859),RESTAURANT,0-30,HIGH RISK
2,08/15/2018,OW0010130,DJ BIBINGKAHAN CORPORATION,FA0011237,DJ BIBINGKAHAN,PR0035416,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,1515 E AMAR RD,WEST COVINA,CA,91792,1,ROUTINE INSPECTION,98,A,DAMPOJNY8,EE0000500,POINT (-117.913926 34.030964),RESTAURANT,0-30,MODERATE RISK
4,09/07/2018,OW0246329,JUAN C OROZCO,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,PR0201914,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,1641,RESTAURANT (151 + ) SEATS HIGH RISK,29002 AGOURA RD,AGOURA HILLS,CA,91301,1,ROUTINE INSPECTION,90,A,DAUEU4NGF,EE0000526,POINT (-118.756808 34.143452),RESTAURANT,151 +,HIGH RISK
5,09/18/2018,OW0123199,REDONDO ENTERPRISES LLC,FA0158101,MCDONALD'S #10681,PR0146191,MCDONALD'S #10681,ACTIVE,1637,RESTAURANT (61-150) SEATS MODERATE RISK,5725 FLORENCE AVE,BELL GARDENS,CA,90201,1,ROUTINE INSPECTION,91,A,DARQIUA45,EE0000437,POINT (-118.163665 33.967791),RESTAURANT,61-150,MODERATE RISK


In [10]:
# Drop columns
inspections_df = inspections_df.drop(columns = ['OWNER ID', 
                                               'OWNER NAME',
                                               'RECORD ID', 
                                               'PROGRAM ELEMENT (PE)', 
                                               "PE DESCRIPTION", 
                                               'SERVICE CODE',
                                               'SERVICE DESCRIPTION',  
                                               'EMPLOYEE ID', 
                                               "TYPE", 
                                               "RISK"])
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,Location,SEATS
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,POINT (-118.314661 34.175253),31-60
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,98,A,DA0XQVMTN,POINT (-118.292543 34.010859),0-30
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,POINT (-117.913926 34.030964),0-30
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,POINT (-118.756808 34.143452),151 +
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,POINT (-118.163665 33.967791),61-150


#### Get data from 'Location' column
* LAT
* LNG

In [11]:
# Display "Location" column
inspections_df[["Location"]].head()

Unnamed: 0,Location
0,POINT (-118.314661 34.175253)
1,POINT (-118.292543 34.010859)
2,POINT (-117.913926 34.030964)
4,POINT (-118.756808 34.143452)
5,POINT (-118.163665 33.967791)


In [12]:
# Split column
location_data = inspections_df["Location"].str.split(n=2, expand=True)
location_data.head()

Unnamed: 0,0,1,2
0,POINT,(-118.314661,34.175253)
1,POINT,(-118.292543,34.010859)
2,POINT,(-117.913926,34.030964)
4,POINT,(-118.756808,34.143452)
5,POINT,(-118.163665,33.967791)


In [13]:
# Drop column '0' aka 'POINT'
lat_lng_values = location_data.drop(columns=[0])
lat_lng_values.head()

Unnamed: 0,1,2
0,(-118.314661,34.175253)
1,(-118.292543,34.010859)
2,(-117.913926,34.030964)
4,(-118.756808,34.143452)
5,(-118.163665,33.967791)


In [14]:
# Get 'LAT' column
lats_split = lat_lng_values[1].str.split(pat='(', expand=True)
lat = lats_split.drop(columns=[0])
lat.columns = ['LAT']
lat.head()

Unnamed: 0,LAT
0,-118.314661
1,-118.292543
2,-117.913926
4,-118.756808
5,-118.163665


In [15]:
# Get 'LNG' column
lng_split = lat_lng_values[2].str.split(pat=')', expand=True)
lng = lng_split.drop(columns=[1])
lng.columns = ['LNG']
lng.head()

Unnamed: 0,LNG
0,34.175253
1,34.010859
2,34.030964
4,34.143452
5,33.967791


In [16]:
# Merge 'LAT' into Inspections DF
inspections_df = inspections_df.merge(lat, left_index = True, right_index = True)
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,Location,SEATS,LAT
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,POINT (-118.314661 34.175253),31-60,-118.314661
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,98,A,DA0XQVMTN,POINT (-118.292543 34.010859),0-30,-118.292543
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,POINT (-117.913926 34.030964),0-30,-117.913926
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,POINT (-118.756808 34.143452),151 +,-118.756808
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,POINT (-118.163665 33.967791),61-150,-118.163665


In [17]:
# Merge 'LNG' into Inspections DF
inspections_df = inspections_df.merge(lng, left_index = True, right_index = True)
inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,Location,SEATS,LAT,LNG
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,POINT (-118.314661 34.175253),31-60,-118.314661,34.175253
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,98,A,DA0XQVMTN,POINT (-118.292543 34.010859),0-30,-118.292543,34.010859
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,POINT (-117.913926 34.030964),0-30,-117.913926,34.030964
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,POINT (-118.756808 34.143452),151 +,-118.756808,34.143452
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,POINT (-118.163665 33.967791),61-150,-118.163665,33.967791


In [18]:
# Drop 'Location' column
clean_inspections_df = inspections_df.drop(columns = ["Location"])
clean_inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,SEATS,LAT,LNG
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037-1867,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


#### Clean 'Facility Zip' data to show 5-digits only

In [19]:
def clean_zip(x):
    return x[:5]

clean_inspections_df['FACILITY ZIP'] = clean_inspections_df['FACILITY ZIP'].apply(clean_zip)

# Dislay DF
clean_inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,SEATS,LAT,LNG
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


In [20]:
# Check null values
for column in clean_inspections_df.columns:
    print(f"Column {column} has {clean_inspections_df[column].isnull().sum()} null values.")

Column ACTIVITY DATE has 0 null values.
Column FACILITY ID has 0 null values.
Column FACILITY NAME has 0 null values.
Column PROGRAM NAME has 0 null values.
Column PROGRAM STATUS has 0 null values.
Column FACILITY ADDRESS has 0 null values.
Column FACILITY CITY has 0 null values.
Column FACILITY STATE has 0 null values.
Column FACILITY ZIP has 0 null values.
Column SCORE has 0 null values.
Column GRADE has 68 null values.
Column SERIAL NUMBER has 0 null values.
Column SEATS has 0 null values.
Column LAT has 5889 null values.
Column LNG has 5889 null values.


In [21]:
# Drop null values
revised_clean_inspections_df = clean_inspections_df.dropna()
revised_clean_inspections_df.head()

Unnamed: 0,ACTIVITY DATE,FACILITY ID,FACILITY NAME,PROGRAM NAME,PROGRAM STATUS,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SCORE,GRADE,SERIAL NUMBER,SEATS,LAT,LNG
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


In [22]:
# Check #2 for null values
for column in revised_clean_inspections_df.columns:
    print(f"Column {column} has {revised_clean_inspections_df[column].isnull().sum()} null values.")

Column ACTIVITY DATE has 0 null values.
Column FACILITY ID has 0 null values.
Column FACILITY NAME has 0 null values.
Column PROGRAM NAME has 0 null values.
Column PROGRAM STATUS has 0 null values.
Column FACILITY ADDRESS has 0 null values.
Column FACILITY CITY has 0 null values.
Column FACILITY STATE has 0 null values.
Column FACILITY ZIP has 0 null values.
Column SCORE has 0 null values.
Column GRADE has 0 null values.
Column SERIAL NUMBER has 0 null values.
Column SEATS has 0 null values.
Column LAT has 0 null values.
Column LNG has 0 null values.


In [23]:
# Rename columns to replace 'space' with '_'
revised2_clean_inspect = revised_clean_inspections_df.rename(columns = {"ACTIVITY DATE" : "ACTIVITY_DATE",
                                                                        "FACILITY ID" : "FACILITY_ID",
                                                                        "FACILITY NAME" : "FACILITY_NAME",
                                                                        "PROGRAM NAME" : "PROGRAM_NAME",
                                                                        "PROGRAM STATUS" : "PROGRAM_STATUS",
                                                                        "FACILITY ADDRESS" : "FACILITY_ADDRESS",
                                                                        "FACILITY CITY" : "FACILITY_CITY",
                                                                        "FACILITY STATE" : "FACILITY_STATE",
                                                                        "FACILITY ZIP" : "FACILITY_ZIP",
                                                                        "SERIAL NUMBER" : "SERIAL_NUMBER"})

# Display DF
revised2_clean_inspect.head()

Unnamed: 0,ACTIVITY_DATE,FACILITY_ID,FACILITY_NAME,PROGRAM_NAME,PROGRAM_STATUS,FACILITY_ADDRESS,FACILITY_CITY,FACILITY_STATE,FACILITY_ZIP,SCORE,GRADE,SERIAL_NUMBER,SEATS,LAT,LNG
0,09/10/2018,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,07/19/2018,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,08/15/2018,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
4,09/07/2018,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
5,09/18/2018,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


In [24]:
# Check data types
for column in revised2_clean_inspect.columns:
    print(f"Column {column} has data type -- {revised2_clean_inspect[column].dtype}.")

Column ACTIVITY_DATE has data type -- object.
Column FACILITY_ID has data type -- object.
Column FACILITY_NAME has data type -- object.
Column PROGRAM_NAME has data type -- object.
Column PROGRAM_STATUS has data type -- object.
Column FACILITY_ADDRESS has data type -- object.
Column FACILITY_CITY has data type -- object.
Column FACILITY_STATE has data type -- object.
Column FACILITY_ZIP has data type -- object.
Column SCORE has data type -- int64.
Column GRADE has data type -- object.
Column SERIAL_NUMBER has data type -- object.
Column SEATS has data type -- object.
Column LAT has data type -- object.
Column LNG has data type -- object.


In [25]:
# Change select column(s) to string
revised2_clean_inspect[["FACILITY_ID", 
                       "FACILITY_NAME", 
                       "PROGRAM_NAME", 
                       "PROGRAM_STATUS", 
                       "FACILITY_ADDRESS", 
                       "FACILITY_CITY", 
                       "FACILITY_STATE", 
                       "FACILITY_ZIP", 
                       "GRADE", 
                       "SERIAL_NUMBER", 
                       "SEATS"]] = revised2_clean_inspect[["FACILITY_ID", 
                                                           "FACILITY_NAME", 
                                                           "PROGRAM_NAME", 
                                                           "PROGRAM_STATUS", 
                                                           "FACILITY_ADDRESS", 
                                                           "FACILITY_CITY", 
                                                           "FACILITY_STATE", 
                                                           "FACILITY_ZIP", 
                                                           "GRADE", 
                                                           "SERIAL_NUMBER", 
                                                           "SEATS"]].astype(str)

# Check data types
revised2_clean_inspect.dtypes

ACTIVITY_DATE       object
FACILITY_ID         object
FACILITY_NAME       object
PROGRAM_NAME        object
PROGRAM_STATUS      object
FACILITY_ADDRESS    object
FACILITY_CITY       object
FACILITY_STATE      object
FACILITY_ZIP        object
SCORE                int64
GRADE               object
SERIAL_NUMBER       object
SEATS               object
LAT                 object
LNG                 object
dtype: object

In [26]:
# Change select column(s) to interger
revised2_clean_inspect[["LAT", "LNG"]] = revised2_clean_inspect[["LAT", "LNG"]].astype(float)

# Check data types
revised2_clean_inspect.dtypes

ACTIVITY_DATE        object
FACILITY_ID          object
FACILITY_NAME        object
PROGRAM_NAME         object
PROGRAM_STATUS       object
FACILITY_ADDRESS     object
FACILITY_CITY        object
FACILITY_STATE       object
FACILITY_ZIP         object
SCORE                 int64
GRADE                object
SERIAL_NUMBER        object
SEATS                object
LAT                 float64
LNG                 float64
dtype: object

In [27]:
# Change select column(s) to datetime
revised2_clean_inspect["ACTIVITY_DATE"] = pd.to_datetime(revised2_clean_inspect["ACTIVITY_DATE"])

# Check data types
revised2_clean_inspect.dtypes

ACTIVITY_DATE       datetime64[ns]
FACILITY_ID                 object
FACILITY_NAME               object
PROGRAM_NAME                object
PROGRAM_STATUS              object
FACILITY_ADDRESS            object
FACILITY_CITY               object
FACILITY_STATE              object
FACILITY_ZIP                object
SCORE                        int64
GRADE                       object
SERIAL_NUMBER               object
SEATS                       object
LAT                        float64
LNG                        float64
dtype: object

In [28]:
# Display DF
revised2_clean_inspect.head()

Unnamed: 0,ACTIVITY_DATE,FACILITY_ID,FACILITY_NAME,PROGRAM_NAME,PROGRAM_STATUS,FACILITY_ADDRESS,FACILITY_CITY,FACILITY_STATE,FACILITY_ZIP,SCORE,GRADE,SERIAL_NUMBER,SEATS,LAT,LNG
0,2018-09-10,FA0242046,SERVERY- NICKELODEON,SERVERY- NICKELODEON,ACTIVE,203 W OLIVE AVE # C,BURBANK,CA,91502,96,A,DARRFUZBW,31-60,-118.314661,34.175253
1,2018-07-19,FA0252769,TOMS JR BURGERS,TOMS JR BURGERS,ACTIVE,1030 W MARTIN LUTHER KING JR BLVD STE 108,LOS ANGELES,CA,90037,98,A,DA0XQVMTN,0-30,-118.292543,34.010859
2,2018-08-15,FA0011237,DJ BIBINGKAHAN,DJ BIBINGKAHAN BAKESHOP,ACTIVE,1515 E AMAR RD,WEST COVINA,CA,91792,98,A,DAMPOJNY8,0-30,-117.913926,34.030964
4,2018-09-07,FA0252595,MEJICO GRILL AND TEQUILLA LOUNGE,MEJICO GRILL AND TEQUILLA LOUNGE,ACTIVE,29002 AGOURA RD,AGOURA HILLS,CA,91301,90,A,DAUEU4NGF,151 +,-118.756808,34.143452
5,2018-09-18,FA0158101,MCDONALD'S #10681,MCDONALD'S #10681,ACTIVE,5725 FLORENCE AVE,BELL GARDENS,CA,90201,91,A,DARQIUA45,61-150,-118.163665,33.967791


In [29]:
# Check shape
revised2_clean_inspect.shape

(165906, 15)

In [30]:
# Export to csv
revised2_clean_inspect.to_csv("./Resources/Clean/clean_inspections.csv")

### VIOLATIONS_DF DATA CLEANUP:

In [31]:
# Display DF
violations_df.head()

Unnamed: 0,SERIAL NUMBER,VIOLATION STATUS,VIOLATION CODE,VIOLATION DESCRIPTION,POINTS
0,DA000211Z,OUT OF COMPLIANCE,F006,# 06. Adequate handwashing facilities supplied...,2.0
1,DA000211Z,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly bui...",1.0
2,DA000211Z,OUT OF COMPLIANCE,F014,# 14. Food contact surfaces: clean and sanitized,2.0
3,DA000211Z,OUT OF COMPLIANCE,F029,"# 29. Toxic substances properly identified, st...",1.0
4,DA000211Z,OUT OF COMPLIANCE,F035,# 35. Equipment/Utensils - approved; installed...,1.0


In [32]:
# Check data types
violations_df.dtypes

SERIAL NUMBER             object
VIOLATION  STATUS         object
VIOLATION CODE            object
VIOLATION DESCRIPTION     object
POINTS                   float64
dtype: object

In [33]:
# Change all columns, but 'POINTS' to string
violations_df[["SERIAL NUMBER", 
               "VIOLATION  STATUS", 
               "VIOLATION CODE", 
               "VIOLATION DESCRIPTION"]] = violations_df[["SERIAL NUMBER", 
                                                          "VIOLATION  STATUS", 
                                                          "VIOLATION CODE", 
                                                          "VIOLATION DESCRIPTION"]].astype(str)

# Check data types
violations_df.dtypes

SERIAL NUMBER             object
VIOLATION  STATUS         object
VIOLATION CODE            object
VIOLATION DESCRIPTION     object
POINTS                   float64
dtype: object

In [34]:
# Rename columns to replace 'space' with '_'
clean_violations_df = violations_df.rename(columns = {"SERIAL NUMBER": "SERIAL_NUMER", 
                                                      "VIOLATION  STATUS": "VIOLATION_STATUS", 
                                                      "VIOLATION CODE": "VIOLATION_CODE", 
                                                      "VIOLATION DESCRIPTION": "VIOLATION_DESCRIPTION"})

# Display DF
clean_violations_df.head()

Unnamed: 0,SERIAL_NUMER,VIOLATION_STATUS,VIOLATION_CODE,VIOLATION_DESCRIPTION,POINTS
0,DA000211Z,OUT OF COMPLIANCE,F006,# 06. Adequate handwashing facilities supplied...,2.0
1,DA000211Z,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly bui...",1.0
2,DA000211Z,OUT OF COMPLIANCE,F014,# 14. Food contact surfaces: clean and sanitized,2.0
3,DA000211Z,OUT OF COMPLIANCE,F029,"# 29. Toxic substances properly identified, st...",1.0
4,DA000211Z,OUT OF COMPLIANCE,F035,# 35. Equipment/Utensils - approved; installed...,1.0


In [35]:
# Check for null values
for column in clean_violations_df.columns:
    print(f"Column {column} has {clean_violations_df[column].isnull().sum()} null values.")

Column SERIAL_NUMER has 0 null values.
Column VIOLATION_STATUS has 0 null values.
Column VIOLATION_CODE has 0 null values.
Column VIOLATION_DESCRIPTION has 0 null values.
Column POINTS has 0 null values.


In [36]:
# Check data shape
clean_violations_df.shape

(971331, 5)

In [37]:
# Export to csv
clean_violations_df.to_csv("./Resources/Clean/clean_violations.csv")

### COMMUNITY_HEALTH_DF DATA CLEANUP:

In [38]:
# Display DF
community_health_df.head()

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,LE,Prop_prsc,Prop_3rdg,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,No_libr,Prop_empl,Prop_emsu,Prop_vote,Rte_resp,Prop_PA,Prop_groc,Prop_60mi,No_farm,No_EBT,Prop_foin,No_CalF,Rte_CalF,Prop_FRPM,Prop_obse,Prop_DM,Rte_coin,Rte_brin,Prop_smok,Rte_luca,Rte_COPD,Rte_CVD,Prop_hbu1,Prop_hbu2,Prop_ownr,Prop_rentr,No_hless,Prop_asth,Rte_crim,Rte_hom,Rte_alco,Propt_envi,No_gasw,Rte_te17,Rte_te19,Prop_LBW,Rte_IMR,Prop_1stt,Prop_depr,Rte_suic,Rte_UOD,Rte_syin,Rte_goin,Rte_hiv,Prop_uinC,Prop_uinA,Prop_duinC,Prop_duinA,Propt_HPI,Rte_mein,Rte_cein,Rte_luin,Prop_fru,Prop_bev,Prop_hyp,Prop_marj,Prop_HI,Prop_grad,Prop_trua
0,Alhambra,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,84.55,0.665,0.5353,0.1934,0.228,0.2497,0.3289,53582,1.0,0.94,0.385,0.6235,0.7706,0.2731,0.5239,0.111,1,1,**,15799,0.3300,0.6598,0.1358,0.104,45.7963,125.9019,0.1272,25.4872,21.7535,170.914,0.447,0.232,0.404,0.596,64,0.0449,168,**,12,6,2,2.1042,9.1180,0.0815,3.1022,0.8614,0.0523,7,3,14,66,208,0.0598,0.207,0.0929,0.2291,43.3,**,**,34.6,0.2,0.2,0.2,0.1,0.0,1.0,0.1
1,Altadena,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.002,0.0988,0.2281,0.2021,0.0379,82.11,0.74,0.3942,0.105,0.1397,0.2918,0.4635,86050,2.0,0.92,0.7335,0.736,1.0975,0.3482,0.4524,0.12,1,1,**,-,-,0.559,0.2441,0.1358,38.367,189.2143,0.1129,25.2189,28.4887,211.035,0.401,0.193,0.719,0.281,58,0.0915,162,**,6,62,0,3.9610,7.6173,0.0703,**,0.8859,0.1099,10,8,19,119,330,0.0333,0.1745,0.066,0.0924,75.5,33.8,**,35.3,0.1,0.3,0.3,0.2,0.1,0.9,0.4
2,Arcadia,56992,0.1794,0.6389,0.1817,0.0115,0.125,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,85.42,0.675,0.7927,0.0789,0.1601,0.243,0.518,79934,1.0,0.941,0.5691,0.6271,1.9654,0.2662,0.511,0.131,0,0,**,6638,0.1800,0.2066,0.057,0.0819,32.9547,145.2497,0.1285,22.5416,22.058,173.371,0.402,0.214,0.596,0.404,12,**,146,**,19,27,2,**,**,0.059,**,0.8588,0.0493,9,**,8,46,56,0.0564,0.1245,0.1232,0.1876,73.4,22.0,**,29.5,0.1,0.2,0.2,0.0,0.0,1.0,0.1
3,Azusa,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,81.16,0.439,0.339,0.2232,0.2932,0.2873,0.1962,53135,1.0,0.927,0.6201,0.6569,1.0168,0.3759,0.5997,0.121,0,0,0.2006,8868,0.5800,0.806,0.2607,0.1198,31.3309,134.9485,0.1174,34.8652,33.1891,218.754,0.474,0.24,0.527,0.473,55,0.0660,354,6,14,48,0,3.5084,9.2802,0.0681,**,0.881,0.0895,8,**,27,116,196,0.0708,0.2404,0.1117,0.1918,34.2,**,**,29.7,0.1,0.4,0.2,0.1,0.0,0.9,0.0
4,Baldwin Park,74438,0.2623,0.633,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,82.99,0.47,0.3227,0.374,0.317,0.1962,0.1128,51742,1.0,0.898,0.5712,0.6318,0.3406,0.3164,0.5444,0.125,1,1,0.2531,14206,0.7400,0.8729,0.2652,0.1307,43.8584,110.0378,0.1045,28.2446,31.2708,195.522,0.498,0.227,0.572,0.428,111,0.0904,391,5,10,9,0,7.3997,20.1220,0.0856,4.1881,0.8827,0.0544,**,5,21,98,171,0.0582,0.3259,0.0704,0.1874,22.3,**,**,26.5,0.1,0.4,0.3,0.1,0.0,1.0,0.4


In [39]:
# Capitalize "GEONAME" values
community_health_df["GEONAME"] = community_health_df["GEONAME"].str.upper()
community_health_df.head()

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,LE,Prop_prsc,Prop_3rdg,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,No_libr,Prop_empl,Prop_emsu,Prop_vote,Rte_resp,Prop_PA,Prop_groc,Prop_60mi,No_farm,No_EBT,Prop_foin,No_CalF,Rte_CalF,Prop_FRPM,Prop_obse,Prop_DM,Rte_coin,Rte_brin,Prop_smok,Rte_luca,Rte_COPD,Rte_CVD,Prop_hbu1,Prop_hbu2,Prop_ownr,Prop_rentr,No_hless,Prop_asth,Rte_crim,Rte_hom,Rte_alco,Propt_envi,No_gasw,Rte_te17,Rte_te19,Prop_LBW,Rte_IMR,Prop_1stt,Prop_depr,Rte_suic,Rte_UOD,Rte_syin,Rte_goin,Rte_hiv,Prop_uinC,Prop_uinA,Prop_duinC,Prop_duinA,Propt_HPI,Rte_mein,Rte_cein,Rte_luin,Prop_fru,Prop_bev,Prop_hyp,Prop_marj,Prop_HI,Prop_grad,Prop_trua
0,ALHAMBRA,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,84.55,0.665,0.5353,0.1934,0.228,0.2497,0.3289,53582,1.0,0.94,0.385,0.6235,0.7706,0.2731,0.5239,0.111,1,1,**,15799,0.3300,0.6598,0.1358,0.104,45.7963,125.9019,0.1272,25.4872,21.7535,170.914,0.447,0.232,0.404,0.596,64,0.0449,168,**,12,6,2,2.1042,9.1180,0.0815,3.1022,0.8614,0.0523,7,3,14,66,208,0.0598,0.207,0.0929,0.2291,43.3,**,**,34.6,0.2,0.2,0.2,0.1,0.0,1.0,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.002,0.0988,0.2281,0.2021,0.0379,82.11,0.74,0.3942,0.105,0.1397,0.2918,0.4635,86050,2.0,0.92,0.7335,0.736,1.0975,0.3482,0.4524,0.12,1,1,**,-,-,0.559,0.2441,0.1358,38.367,189.2143,0.1129,25.2189,28.4887,211.035,0.401,0.193,0.719,0.281,58,0.0915,162,**,6,62,0,3.9610,7.6173,0.0703,**,0.8859,0.1099,10,8,19,119,330,0.0333,0.1745,0.066,0.0924,75.5,33.8,**,35.3,0.1,0.3,0.3,0.2,0.1,0.9,0.4
2,ARCADIA,56992,0.1794,0.6389,0.1817,0.0115,0.125,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,85.42,0.675,0.7927,0.0789,0.1601,0.243,0.518,79934,1.0,0.941,0.5691,0.6271,1.9654,0.2662,0.511,0.131,0,0,**,6638,0.1800,0.2066,0.057,0.0819,32.9547,145.2497,0.1285,22.5416,22.058,173.371,0.402,0.214,0.596,0.404,12,**,146,**,19,27,2,**,**,0.059,**,0.8588,0.0493,9,**,8,46,56,0.0564,0.1245,0.1232,0.1876,73.4,22.0,**,29.5,0.1,0.2,0.2,0.0,0.0,1.0,0.1
3,AZUSA,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,81.16,0.439,0.339,0.2232,0.2932,0.2873,0.1962,53135,1.0,0.927,0.6201,0.6569,1.0168,0.3759,0.5997,0.121,0,0,0.2006,8868,0.5800,0.806,0.2607,0.1198,31.3309,134.9485,0.1174,34.8652,33.1891,218.754,0.474,0.24,0.527,0.473,55,0.0660,354,6,14,48,0,3.5084,9.2802,0.0681,**,0.881,0.0895,8,**,27,116,196,0.0708,0.2404,0.1117,0.1918,34.2,**,**,29.7,0.1,0.4,0.2,0.1,0.0,0.9,0.0
4,BALDWIN PARK,74438,0.2623,0.633,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,82.99,0.47,0.3227,0.374,0.317,0.1962,0.1128,51742,1.0,0.898,0.5712,0.6318,0.3406,0.3164,0.5444,0.125,1,1,0.2531,14206,0.7400,0.8729,0.2652,0.1307,43.8584,110.0378,0.1045,28.2446,31.2708,195.522,0.498,0.227,0.572,0.428,111,0.0904,391,5,10,9,0,7.3997,20.1220,0.0856,4.1881,0.8827,0.0544,**,5,21,98,171,0.0582,0.3259,0.0704,0.1874,22.3,**,**,26.5,0.1,0.4,0.3,0.1,0.0,1.0,0.4


In [40]:
# Drop columns as discussed
new_community_health_df = community_health_df.drop(columns = ["LE", 
                                                            "Prop_prsc", 
                                                            "Prop_3rdg", 
                                                            "No_libr", 
                                                            "Prop_empl", 
                                                            "Prop_emsu", 
                                                            "Prop_vote", 
                                                            "Rte_resp", 
                                                            "Prop_PA", 
                                                            "No_EBT", 
                                                            "No_CalF", 
                                                            "Rte_CalF", 
                                                            "Prop_FRPM", 
                                                            "Rte_coin", 
                                                            "Rte_brin", 
                                                            "Prop_smok", 
                                                            "Rte_luca", 
                                                            "Rte_COPD", 
                                                            "Prop_hbu1", 
                                                            "Prop_hbu2", 
                                                            "Prop_asth", 
                                                            "Rte_hom", 
                                                            "No_gasw", 
                                                            "Rte_te17", 
                                                            "Rte_te19", 
                                                            "Prop_LBW", 
                                                            "Rte_IMR", 
                                                            "Prop_1stt", 
                                                            "Rte_suic", 
                                                            "Rte_UOD", 
                                                            "Rte_syin", 
                                                            "Rte_goin", 
                                                            "Rte_hiv", 
                                                            "Prop_uinC", 
                                                            "Prop_uinA", 
                                                            "Prop_duinC", 
                                                            "Prop_duinA", 
                                                            "Rte_mein", 
                                                            "Rte_cein", 
                                                            "Rte_luin", 
                                                            "Prop_HI", 
                                                            "Prop_grad", 
                                                            "Prop_trua",
                                                            "Prop_foin"])

# Display DF
new_community_health_df.head()

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,ALHAMBRA,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,0.1934,0.228,0.2497,0.3289,53582,0.5239,0.111,1,0.1358,0.104,170.914,0.404,0.596,64,168,12,6,0.0523,43.3,0.2,0.2,0.2,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.002,0.0988,0.2281,0.2021,0.0379,0.105,0.1397,0.2918,0.4635,86050,0.4524,0.12,1,0.2441,0.1358,211.035,0.719,0.281,58,162,6,62,0.1099,75.5,0.1,0.3,0.3,0.2
2,ARCADIA,56992,0.1794,0.6389,0.1817,0.0115,0.125,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,0.0789,0.1601,0.243,0.518,79934,0.511,0.131,0,0.057,0.0819,173.371,0.596,0.404,12,146,19,27,0.0493,73.4,0.1,0.2,0.2,0.0
3,AZUSA,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,0.2232,0.2932,0.2873,0.1962,53135,0.5997,0.121,0,0.2607,0.1198,218.754,0.527,0.473,55,354,14,48,0.0895,34.2,0.1,0.4,0.2,0.1
4,BALDWIN PARK,74438,0.2623,0.633,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,0.374,0.317,0.1962,0.1128,51742,0.5444,0.125,1,0.2652,0.1307,195.522,0.572,0.428,111,391,10,9,0.0544,22.3,0.1,0.4,0.3,0.1


In [61]:
# Replace '**' with '0' as discussed
new_community_health_df.replace('**', 0)

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,ALHAMBRA,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,0.1934,0.2280,0.2497,0.3289,53582,0.5239,0.111,1,0.1358,0.1040,170.9140,0.404,0.5960,64,168,12,6,0.0523,43.3,0.2,0.2,0.2,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.0020,0.0988,0.2281,0.2021,0.0379,0.1050,0.1397,0.2918,0.4635,86050,0.4524,0.120,1,0.2441,0.1358,211.0350,0.719,0.2810,58,162,6,62,0.1099,75.5,0.1,0.3,0.3,0.2
2,ARCADIA,56992,0.1794,0.6389,0.1817,0.0115,0.1250,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,0.0789,0.1601,0.2430,0.5180,79934,0.5110,0.131,0,0.0570,0.0819,173.3710,0.596,0.4040,12,146,19,27,0.0493,73.4,0.1,0.2,0.2,0.0
3,AZUSA,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,0.2232,0.2932,0.2873,0.1962,53135,0.5997,0.121,0,0.2607,0.1198,218.7540,0.527,0.4730,55,354,14,48,0.0895,34.2,0.1,0.4,0.2,0.1
4,BALDWIN PARK,74438,0.2623,0.6330,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,0.3740,0.3170,0.1962,0.1128,51742,0.5444,0.125,1,0.2652,0.1307,195.5220,0.572,0.4280,111,391,10,9,0.0544,22.3,0.1,0.4,0.3,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,WEST HOLLYWOOD,35797,0.0639,0.7801,0.1560,0.0310,0.1081,0.7984,0.0603,0.0014,0.0008,0.1403,0.2798,0.2587,0.1023,0.0456,0.0884,0.2404,0.6257,56317,0.9850,0.062,1,0.2207,0.0804,176.8870,0.201,0.7990,81,768,61,29,0.1503,70.6,0.4,0.2,0.2,0.2
83,WEST WHITTIER-LOS NIETOS,25441,0.2426,0.6253,0.1321,0.0080,0.8775,0.0954,0.0147,0.0032,0.0013,0.0912,0.2858,0.2691,0.0987,0.2858,0.3170,0.2711,0.1262,65363,0.4991,0.107,0,0.3584,0.1234,217.1750,0.721,0.2790,36,307,9,26,0.0791,41.2,0.1,0.4,0.2,0.1
84,WESTMONT,32835,0.2823,0.6208,0.0968,0.4945,0.4912,0.0096,0.0025,0.0012,0.0009,0.3546,0.6729,0.2324,0.1142,0.3041,0.2962,0.3272,0.0725,26808,0.1988,0.184,0,0.3195,0.1852,308.4830,0.306,0.6940,365,1514,7,69,0.0703,0.6,0.2,0.4,0.3,0.0
85,WHITTIER,88078,0.2304,0.6385,0.1311,0.0098,0.6750,0.2752,0.0361,0.0028,0.0011,0.1081,0.3274,0.1770,0.0542,0.1672,0.2376,0.3531,0.2421,66457,0.3930,0.125,1,0.2773,0.1291,199.0170,0.569,0.4310,258,293,16,63,0.0756,53.8,0.2,0.3,0.3,0.1


In [62]:
new_community_health_df.loc[new_community_health_df['Prop_Eng'] == '**']

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
54,MANHATTAN BEACH,35170,0.2216,0.6243,0.1541,0.0086,0.0746,0.8182,0.096,0.0014,0.0013,0.0304,0.0652,0.1152,**,0.0216,0.0637,0.1792,0.7355,143527,0.6644,0.106,1,0.0857,0.0405,133.239,0.681,0.319,4,151,33,92,0.1697,98.6,0.2,0.2,0.3,0.1


In [65]:
new_community_health_df.loc[new_community_health_df['Prop_obse'] == '**']

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
8,BEVERLY HILLS,34652,0.1574,0.6406,0.2021,0.0209,0.0591,0.8212,0.0976,0.0009,0.0003,0.0801,0.1604,0.3841,0.1077,0.0505,0.128,0.2106,0.6109,97327,0.7074,0.049,1,**,**,138.562,0.412,0.588,16,306,46,43,0.1924,88.1,0.2,0.1,0.4,0.2


In [63]:
new = new_community_health_df.replace(to_replace = "**", value = 0)
new.sample(50)

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
35,LAWNDALE,33381,0.2474,0.6579,0.0946,0.105,0.6036,0.1716,0.1053,0.0028,0.0117,0.165,0.3355,0.3763,0.151,0.2996,0.2482,0.2957,0.1565,47540,0.945,0.078,1,0.248,0.098,198.231,0.325,0.675,43,581,14,78,0.0691,31.4,0.2,0.3,0.2,0.2
10,CALABASAS,24182,0.2031,0.6378,0.1591,0.0152,0.0662,0.8261,0.0911,0.0013,0.0002,0.0562,0.1255,0.2618,0.036,0.0302,0.1202,0.2129,0.6367,106050,0.2029,0.206,1,0.1094,0.0,127.516,0.702,0.298,0,62,19,85,0.1412,91.6,0.1,0.2,0.2,0.1
51,LOS ANGELES COUNCIL DISTRICT 15,269467,0.2741,0.6188,0.1071,0.1272,0.6377,0.1634,0.0648,0.0022,0.0048,0.2472,0.4887,0.3177,0.1367,0.3079,0.2505,0.2733,0.1683,46423,0.517,0.101,3,0.2736,0.1266,215.8852,0.399,0.601,1773,697,12,66,0.0804,20.2,0.1,0.5,0.2,0.1
62,PASADENA,140883,0.2012,0.6477,0.1511,0.1034,0.3442,0.3919,0.1581,0.0015,0.0008,0.1306,0.2262,0.3037,0.0886,0.1257,0.1385,0.2354,0.5004,72402,0.5965,0.103,3,0.2233,0.0823,213.696,0.436,0.564,530,339,24,43,0.1247,66.4,0.2,0.3,0.3,0.1
60,PALMDALE,159690,0.2807,0.6328,0.0864,0.1545,0.5498,0.2498,0.0413,0.0031,0.0014,0.2018,0.4764,0.2624,0.1,0.2583,0.2592,0.3341,0.1483,52392,0.2773,0.318,0,0.3054,0.1217,236.912,0.632,0.368,212,436,9,95,0.0964,24.4,0.1,0.4,0.3,0.1
58,MONTEREY PARK,61121,0.1724,0.6155,0.2121,0.0033,0.2743,0.0458,0.6754,0.001,0.0003,0.1414,0.4133,0.536,0.2847,0.2127,0.2443,0.2385,0.3044,54097,0.5435,0.086,1,0.1058,0.1222,145.13,0.52,0.48,7,214,14,4,0.0442,32.1,0.1,0.2,0.3,0.0
27,HAWTHORNE,87772,0.2627,0.6418,0.0954,0.2813,0.5288,0.1069,0.0687,0.0021,0.0123,0.1863,0.4591,0.3477,0.1339,0.2619,0.2454,0.3023,0.1904,44504,0.8672,0.109,0,0.2351,0.1182,244.015,0.266,0.734,100,745,11,69,0.0793,29.3,0.2,0.4,0.3,0.2
23,GARDENA,60677,0.215,0.6208,0.1642,0.2395,0.3927,0.0911,0.2682,0.0016,0.0068,0.1401,0.2806,0.3372,0.1451,0.1818,0.2703,0.3161,0.2318,47674,0.7461,0.076,1,0.277,0.0991,208.034,0.512,0.488,186,557,26,76,0.0584,41.9,0.1,0.5,0.3,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.002,0.0988,0.2281,0.2021,0.0379,0.105,0.1397,0.2918,0.4635,86050,0.4524,0.12,1,0.2441,0.1358,211.035,0.719,0.281,58,162,6,62,0.1099,75.5,0.1,0.3,0.3,0.2
12,CERRITOS,49250,0.1645,0.6091,0.2264,0.0696,0.1253,0.1562,0.6453,0.0011,0.0025,0.0398,0.0891,0.4471,0.1142,0.0831,0.1464,0.2886,0.4819,90321,0.6596,0.128,1,0.2342,0.0948,166.02,0.791,0.209,27,225,16,83,0.041,79.7,0.1,0.3,0.2,0.0


In [64]:
new.loc[new['Prop_Eng'] == '**']

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj


In [58]:
new.loc[new['Prop_obse'] == '**']

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj


In [60]:
new.loc[new['Prop_DM'] == "**"]

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj


In [43]:
# Replace '-' with '0' as discussed
new_community_health_df.replace('-', 0)

Unnamed: 0,GEONAME,Pop_Tot,Prop_18y,Prop_64y,Prop_65y+,Prop_Blk,Prop_Lat,Prop_Whi,Prop_Asi,Prop_Ami,Prop_NHO,Prop_FPL1,Prop_FPL2,Prop_forb,Prop_Eng,Prop_edLH,Prop_edHG,Prop_edSC,Prop_edCG,MHI,Prop_groc,Prop_60mi,No_farm,Prop_obse,Prop_DM,Rte_CVD,Prop_ownr,Prop_rentr,No_hless,Rte_crim,Rte_alco,Propt_envi,Prop_depr,Propt_HPI,Prop_fru,Prop_bev,Prop_hyp,Prop_marj
0,ALHAMBRA,86705,0.1831,0.6504,0.1665,0.0133,0.3435,0.0913,0.5498,0.0014,0.0007,0.1303,0.3356,0.5021,0.2112,0.1934,0.2280,0.2497,0.3289,53582,0.5239,0.111,1,0.1358,0.1040,170.9140,0.404,0.5960,64,168,12,6,0.0523,43.3,0.2,0.2,0.2,0.1
1,ALTADENA,42525,0.2072,0.6221,0.1707,0.2374,0.2905,0.4129,0.0553,0.0019,0.0020,0.0988,0.2281,0.2021,0.0379,0.1050,0.1397,0.2918,0.4635,86050,0.4524,0.120,1,0.2441,0.1358,211.0350,0.719,0.2810,58,162,6,62,0.1099,75.5,0.1,0.3,0.3,0.2
2,ARCADIA,56992,0.1794,0.6389,0.1817,0.0115,0.1250,0.2304,0.6315,0.0014,0.0003,0.0774,0.2042,0.4877,0.1888,0.0789,0.1601,0.2430,0.5180,79934,0.5110,0.131,0,0.0570,0.0819,173.3710,0.596,0.4040,12,146,19,27,0.0493,73.4,0.1,0.2,0.2,0.0
3,AZUSA,49479,0.2507,0.6538,0.0956,0.0292,0.6838,0.1938,0.0895,0.0024,0.0013,0.1636,0.3935,0.3082,0.1156,0.2232,0.2932,0.2873,0.1962,53135,0.5997,0.121,0,0.2607,0.1198,218.7540,0.527,0.4730,55,354,14,48,0.0895,34.2,0.1,0.4,0.2,0.1
4,BALDWIN PARK,74438,0.2623,0.6330,0.1047,0.0099,0.7934,0.0432,0.1514,0.0012,0.0008,0.1591,0.4116,0.4453,0.1908,0.3740,0.3170,0.1962,0.1128,51742,0.5444,0.125,1,0.2652,0.1307,195.5220,0.572,0.4280,111,391,10,9,0.0544,22.3,0.1,0.4,0.3,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,WEST HOLLYWOOD,35797,0.0639,0.7801,0.1560,0.0310,0.1081,0.7984,0.0603,0.0014,0.0008,0.1403,0.2798,0.2587,0.1023,0.0456,0.0884,0.2404,0.6257,56317,0.9850,0.062,1,0.2207,0.0804,176.8870,0.201,0.7990,81,768,61,29,0.1503,70.6,0.4,0.2,0.2,0.2
83,WEST WHITTIER-LOS NIETOS,25441,0.2426,0.6253,0.1321,0.0080,0.8775,0.0954,0.0147,0.0032,0.0013,0.0912,0.2858,0.2691,0.0987,0.2858,0.3170,0.2711,0.1262,65363,0.4991,0.107,0,0.3584,0.1234,217.1750,0.721,0.2790,36,307,9,26,0.0791,41.2,0.1,0.4,0.2,0.1
84,WESTMONT,32835,0.2823,0.6208,0.0968,0.4945,0.4912,0.0096,0.0025,0.0012,0.0009,0.3546,0.6729,0.2324,0.1142,0.3041,0.2962,0.3272,0.0725,26808,0.1988,0.184,0,0.3195,0.1852,308.4830,0.306,0.6940,365,1514,7,69,0.0703,0.6,0.2,0.4,0.3,0.0
85,WHITTIER,88078,0.2304,0.6385,0.1311,0.0098,0.6750,0.2752,0.0361,0.0028,0.0011,0.1081,0.3274,0.1770,0.0542,0.1672,0.2376,0.3531,0.2421,66457,0.3930,0.125,1,0.2773,0.1291,199.0170,0.569,0.4310,258,293,16,63,0.0756,53.8,0.2,0.3,0.3,0.1


In [44]:
# Check data types
for column in new_community_health_df.columns:
    print(f"Column {column} has data type -- {community_health_df[column].dtype}.")

Column GEONAME has data type -- object.
Column Pop_Tot has data type -- int64.
Column Prop_18y has data type -- float64.
Column Prop_64y has data type -- float64.
Column Prop_65y+ has data type -- float64.
Column Prop_Blk has data type -- float64.
Column Prop_Lat has data type -- float64.
Column Prop_Whi has data type -- float64.
Column Prop_Asi has data type -- float64.
Column Prop_Ami has data type -- float64.
Column Prop_NHO has data type -- float64.
Column Prop_FPL1 has data type -- float64.
Column Prop_FPL2 has data type -- float64.
Column Prop_forb has data type -- float64.
Column Prop_Eng has data type -- object.
Column Prop_edLH has data type -- float64.
Column Prop_edHG has data type -- float64.
Column Prop_edSC has data type -- float64.
Column Prop_edCG has data type -- float64.
Column MHI has data type -- int64.
Column Prop_groc has data type -- float64.
Column Prop_60mi has data type -- float64.
Column No_farm has data type -- object.
Column Prop_obse has data type -- objec

In [45]:
# Change select columns to string
new_community_health_df[["GEONAME", 
                         "Prop_Eng", 
                         "No_farm", 
                         "Prop_obse", 
                         "Propt_envi", 
                         "Propt_HPI"]] = new_community_health_df[["GEONAME", 
                                                                  "Prop_Eng", 
                                                                  "No_farm", 
                                                                  "Prop_obse", 
                                                                  "Propt_envi", 
                                                                  "Propt_HPI"]].astype(str)

In [46]:
# Check for null values
for column in new_community_health_df.columns:
    print(f"Column {column} has {new_community_health_df[column].isnull().sum()} null values.")

Column GEONAME has 0 null values.
Column Pop_Tot has 0 null values.
Column Prop_18y has 0 null values.
Column Prop_64y has 0 null values.
Column Prop_65y+ has 0 null values.
Column Prop_Blk has 0 null values.
Column Prop_Lat has 0 null values.
Column Prop_Whi has 0 null values.
Column Prop_Asi has 0 null values.
Column Prop_Ami has 0 null values.
Column Prop_NHO has 0 null values.
Column Prop_FPL1 has 0 null values.
Column Prop_FPL2 has 0 null values.
Column Prop_forb has 0 null values.
Column Prop_Eng has 0 null values.
Column Prop_edLH has 0 null values.
Column Prop_edHG has 0 null values.
Column Prop_edSC has 0 null values.
Column Prop_edCG has 0 null values.
Column MHI has 0 null values.
Column Prop_groc has 0 null values.
Column Prop_60mi has 0 null values.
Column No_farm has 0 null values.
Column Prop_obse has 0 null values.
Column Prop_DM has 0 null values.
Column Rte_CVD has 0 null values.
Column Prop_ownr has 0 null values.
Column Prop_rentr has 0 null values.
Column No_hless 

In [47]:
# Check shape
new_community_health_df.shape

(87, 38)

In [48]:
# Export to csv
new_community_health_df.to_csv("./Resources/Clean/clean_community_health.csv")

**NEXT STEP**: Review and update 'GEONAME' in SQL.