# New York Dataset

In [10]:
# Import dependencies
import os
import csv
import pandas as pd
from pathlib import Path
import json


## Demographic Metric

**Measuring**: ACS Median Age, Racial Demographics, Education Level

Data was pulled from the 2020 New York Census Database and the American Community Survey (ACS) by the U.S. Census Bureau using various coded tables to ensure unified data for each metropolitan city. Two tables used the same code and were separated to ensure proper measurement.
* ACS Median Age: Table DP05
* ACS Race: Table DP05
* Education Level: S1501

##### ACS Median Age (Table DP05)

In [2]:
# Path to the raw ACS age extract (Table DP05) for New York County
nyc_age_data = '../ny_data/nyc_raw_data/nyc_age.csv'

# Read the CSV file into a DataFrame and preview the first rows
nyc_age_df = pd.read_csv(nyc_age_data)
nyc_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,SEX AND AGE!!Total population!!25 to 34 years,SEX AND AGE!!Total population!!35 to 44 years,SEX AND AGE!!Total population!!45 to 54 years,SEX AND AGE!!Total population!!55 to 59 years,SEX AND AGE!!Total population!!60 to 64 years,SEX AND AGE!!Total population!!65 to 74 years,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years)
0,"New York County, New York",,,,,,,,,,,,,,,,
1,Estimate,,1629153.0,77025,61508,61066,68059,111192,362435,233553,199243,95799,88523,147216,81724,41810,37.7
2,Percent,,1629153.0,4.7%,3.8%,3.7%,4.2%,6.8%,22.2%,14.3%,12.2%,5.9%,5.4%,9.0%,5.0%,2.6%,(X)


In [3]:
# Remove the bare 'SEX AND AGE' section-header column (empty in the preview above)
nyc_age_df = nyc_age_df.drop('SEX AND AGE', axis=1)
nyc_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,SEX AND AGE!!Total population!!25 to 34 years,SEX AND AGE!!Total population!!35 to 44 years,SEX AND AGE!!Total population!!45 to 54 years,SEX AND AGE!!Total population!!55 to 59 years,SEX AND AGE!!Total population!!60 to 64 years,SEX AND AGE!!Total population!!65 to 74 years,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years)
0,"New York County, New York",,,,,,,,,,,,,,,
1,Estimate,1629153.0,77025,61508,61066,68059,111192,362435,233553,199243,95799,88523,147216,81724,41810,37.7
2,Percent,1629153.0,4.7%,3.8%,3.7%,4.2%,6.8%,22.2%,14.3%,12.2%,5.9%,5.4%,9.0%,5.0%,2.6%,(X)


In [4]:
# Swap the verbose ACS headers for short labels; the order must match the
# existing columns one-for-one. The first column (Estimate / Percent row label)
# is left with an empty name.
# NOTE(review): an empty header is typically read back as 'Unnamed: 0' after a
# CSV round-trip — confirm that is acceptable for downstream consumers.
nyc_age_df.columns = [
    '', 'Total Population',
    'Under 5', '5 to 9', '10 to 14', '15 to 19', '20 to 24',
    '25 to 34', '35 to 44', '45 to 54', '55 to 59', '60 to 64',
    '65 to 74', '75 to 84', '85 and Over',
    'Median Age',
]

nyc_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
0,"New York County, New York",,,,,,,,,,,,,,,
1,Estimate,1629153.0,77025,61508,61066,68059,111192,362435,233553,199243,95799,88523,147216,81724,41810,37.7
2,Percent,1629153.0,4.7%,3.8%,3.7%,4.2%,6.8%,22.2%,14.3%,12.2%,5.9%,5.4%,9.0%,5.0%,2.6%,(X)


In [5]:
# Keep only rows with no missing values (removes the all-NaN county header row)
nyc_age_df = nyc_age_df.dropna(axis=0, how='any')
nyc_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
1,Estimate,1629153,77025,61508,61066,68059,111192,362435,233553,199243,95799,88523,147216,81724,41810,37.7
2,Percent,1629153,4.7%,3.8%,3.7%,4.2%,6.8%,22.2%,14.3%,12.2%,5.9%,5.4%,9.0%,5.0%,2.6%,(X)


In [6]:
# Write the cleaned age table to disk, omitting the pandas index
nyc_age_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_age.csv', index=False)

#### Racial Demographics (Table DP05)

In [13]:
# Path to the raw ACS race extract (Table DP05) for New York County
nyc_race_data = "../ny_data/nyc_raw_data/nyc_race.csv"

# Read the CSV file into a DataFrame and preview the first rows
nyc_race_df = pd.read_csv(nyc_race_data)
nyc_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races,Race alone or in combination with one or more other races!!Total population,Race alone or in combination with one or more other races!!Total population!!White,Race alone or in combination with one or more other races!!Total population!!Black or African American,Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native,Race alone or in combination with one or more other races!!Total population!!Asian,Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Race alone or in combination with one or more other races!!Total population!!Some other race
0,"New York County, New York",,,,,,,,
1,Estimate,,1629153.0,976633,276040,23168,225191,2816,235303
2,Percent,,1629153.0,59.9%,16.9%,1.4%,13.8%,0.2%,14.4%


In [14]:
# Strip the shared ACS prefix from the per-race column names so that only the
# race label remains (e.g. '...!!Total population!!White' -> 'White').
# The prefix is a plain literal, so regex=False is passed explicitly: the default
# for `regex` differs between pandas versions, and pinning it keeps the result
# the same everywhere.
nyc_race_df.columns = nyc_race_df.columns.str.replace(
    'Race alone or in combination with one or more other races!!Total population!!',
    '',
    regex=False,
)
nyc_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races,Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race
0,"New York County, New York",,,,,,,,
1,Estimate,,1629153.0,976633,276040,23168,225191,2816,235303
2,Percent,,1629153.0,59.9%,16.9%,1.4%,13.8%,0.2%,14.4%


In [15]:
# Remove the bare section-header column (empty in the preview above)
nyc_race_df = nyc_race_df.drop(
    'Race alone or in combination with one or more other races',
    axis=1,
)
nyc_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race
0,"New York County, New York",,,,,,,
1,Estimate,1629153.0,976633,276040,23168,225191,2816,235303
2,Percent,1629153.0,59.9%,16.9%,1.4%,13.8%,0.2%,14.4%


In [17]:
# Assign short labels; the order must match the existing columns one-for-one.
# The first column (Estimate / Percent row label) is left with an empty name.
nyc_race_df.columns = [
    '',
    'Total Population',
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Other',
]

# Preview the renamed table
nyc_race_df.head()


Unnamed: 0,Unnamed: 1,Total Population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Other
0,"New York County, New York",,,,,,,
1,Estimate,1629153.0,976633,276040,23168,225191,2816,235303
2,Percent,1629153.0,59.9%,16.9%,1.4%,13.8%,0.2%,14.4%


In [18]:
# Keep only rows with no missing values (removes the all-NaN county header row)
nyc_race_df = nyc_race_df.dropna(axis=0, how='any')

In [None]:
# Save the cleaned race table as a new CSV file, omitting the pandas index.
# The directory is '../ny_data/' like every other cleaned output in this
# notebook; the previous '../nyc_data/' did not match any other path used here.
nyc_race_df.to_csv('../ny_data/cleaned_nyc_race.csv', index=False)

#### Education Level (Table S1501)

In [20]:
# Path to the raw educational-attainment extract (Table S1501) for New York County
nyc_edu_data = '../ny_data/nyc_raw_data/nyc_edu.csv'

# Read the CSV file into a DataFrame and display it
nyc_edu_df = pd.read_csv(nyc_edu_data)
nyc_edu_df

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Male!!Estimate","New York County, New York!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,145611,65104,80507
2,Less than high school graduate,11172,6392,4780
3,High school graduate (includes equival...,27293,13162,14131
4,Some college or associate's degree,57718,24366,33352
...,...,...,...,...
63,Less than high school graduate,21808,25909,17451
64,High school graduate (includes equival...,31091,34817,26016
65,Some college or associate's degree,39446,44338,34674
66,Bachelor's degree,82717,97946,72127


In [21]:
# Keep only the first 28 rows of the table; the rest is not needed here
nyc_edu_df = nyc_edu_df.head(28)
nyc_edu_df.head()

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Male!!Estimate","New York County, New York!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,145611.0,65104.0,80507.0
2,Less than high school graduate,11172.0,6392.0,4780.0
3,High school graduate (includes equival...,27293.0,13162.0,14131.0
4,Some college or associate's degree,57718.0,24366.0,33352.0


In [22]:
# Assign short labels; the order must match the existing columns one-for-one
nyc_edu_df.columns = [
    'Age by Educational Attainment',
    'Total',
    'Male',
    'Female',
]
nyc_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Total,Male,Female
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,145611.0,65104.0,80507.0
2,Less than high school graduate,11172.0,6392.0,4780.0
3,High school graduate (includes equival...,27293.0,13162.0,14131.0
4,Some college or associate's degree,57718.0,24366.0,33352.0


In [23]:
# Keep only rows with no missing values (removes section-header rows such as
# 'AGE BY EDUCATIONAL ATTAINMENT', which carry no figures)
nyc_edu_df = nyc_edu_df.dropna(axis=0, how='any')

In [25]:
# Write the cleaned education table to disk, omitting the pandas index
nyc_edu_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_edu_level.csv', index=False)

## Economic Metric

**Measuring**: Median Household Income, Unemployment Rate, Poverty Rate.

Data for this section was pulled from the New York 2020 Census Database using various coded tables:
* Median Household Income: Table S1901
* Unemployment Rate: Table S2301
* Poverty Rate: S1501 - WE ARE CUTTING

##### Median Household Income (Table S1901)

In [26]:
# Path to the raw household-income extract (Table S1901) for New York County
nyc_income_data = '../ny_data/nyc_raw_data/nyc_income.csv'

# Read the CSV file into a DataFrame and preview the first rows
nyc_income_df = pd.read_csv(nyc_income_data)
nyc_income_df.head()

Unnamed: 0,Label (Grouping),"New York County, New York!!Households!!Estimate","New York County, New York!!Families!!Estimate","New York County, New York!!Married-couple families!!Estimate","New York County, New York!!Nonfamily households!!Estimate"
0,Total,758720,325483,218374,433237
1,"Less than $10,000",8.3%,5.3%,1.7%,10.9%
2,"$10,000 to $14,999",5.1%,3.3%,2.1%,6.4%
3,"$15,000 to $24,999",6.8%,6.4%,3.9%,7.3%
4,"$25,000 to $34,999",5.6%,5.4%,3.4%,5.9%


In [27]:
# Keep only the first 13 rows; everything after them is dropped
nyc_income_df = nyc_income_df.head(13)


In [28]:
# Assign short labels; the order must match the existing columns one-for-one.
# The first column (income bracket label) is left with an empty name.
nyc_income_df.columns = [
    '',
    'Household',
    'Families',
    'Married-couple Households',
    'Nonfamily Households',
]
nyc_income_df.head()

Unnamed: 0,Unnamed: 1,Household,Families,Married-couple Households,Nonfamily Households
0,Total,758720,325483,218374,433237
1,"Less than $10,000",8.3%,5.3%,1.7%,10.9%
2,"$10,000 to $14,999",5.1%,3.3%,2.1%,6.4%
3,"$15,000 to $24,999",6.8%,6.4%,3.9%,7.3%
4,"$25,000 to $34,999",5.6%,5.4%,3.4%,5.9%


In [29]:
# Write the cleaned income table to disk, omitting the pandas index
nyc_income_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_income.csv', index=False)

##### Unemployment Rate (Table S2301)

In [30]:
# Path to the raw employment-status extract (Table S2301) for New York County
nyc_unemployment_data = '../ny_data/nyc_raw_data/nyc_employment_status.csv'

# Read the CSV file into a DataFrame and preview the first rows
nyc_unemployment_df = pd.read_csv(nyc_unemployment_data)
nyc_unemployment_df.head()

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Labor Force Participation Rate!!Estimate","New York County, New York!!Employment/Population Ratio!!Estimate","New York County, New York!!Unemployment rate!!Estimate"
0,Population 16 years and over,1418173.0,66.9%,63.1%,5.7%
1,AGE,,,,
2,16 to 19 years,56678.0,21.7%,14.3%,34.0%
3,20 to 24 years,111192.0,64.4%,57.8%,10.2%
4,25 to 29 years,188117.0,86.4%,82.5%,4.5%


In [31]:
# Filter the DataFrame to only include rows 0-11: the overall
# 'Population 16 years and over' row, the 'AGE' section header and the rows
# beneath it (presumably the full age-group breakdown — confirm against the raw file).
nyc_unemployment_age_df = nyc_unemployment_df.iloc[0:12].copy()

# Carry the filtered rows forward. The following cells rename, clean and save
# `nyc_unemployment_df`; without this rebinding the filter above was never applied
# and every row of the raw table ended up under the 'Age Group' header.
nyc_unemployment_df = nyc_unemployment_age_df

# Display the filtered DataFrame
nyc_unemployment_age_df.head()

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Labor Force Participation Rate!!Estimate","New York County, New York!!Employment/Population Ratio!!Estimate","New York County, New York!!Unemployment rate!!Estimate"
0,Population 16 years and over,1418173.0,66.9%,63.1%,5.7%
1,AGE,,,,
2,16 to 19 years,56678.0,21.7%,14.3%,34.0%
3,20 to 24 years,111192.0,64.4%,57.8%,10.2%
4,25 to 29 years,188117.0,86.4%,82.5%,4.5%


In [32]:
# Assign short labels; the order must match the existing columns one-for-one
nyc_unemployment_df.columns = [
    'Age Group',
    'Total',
    'Labor Force Participation',
    'Employment Population Ratio',
    'Unemployment Rate',
]
nyc_unemployment_df.head()

Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,1418173.0,66.9%,63.1%,5.7%
1,AGE,,,,
2,16 to 19 years,56678.0,21.7%,14.3%,34.0%
3,20 to 24 years,111192.0,64.4%,57.8%,10.2%
4,25 to 29 years,188117.0,86.4%,82.5%,4.5%


In [33]:
# Keep only rows with no missing values (removes section-header rows such as 'AGE')
nyc_unemployment_df = nyc_unemployment_df.dropna(axis=0, how='any')

In [34]:
# Write the cleaned unemployment table to disk, omitting the pandas index
nyc_unemployment_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_unemployment.csv', index=False)

## Housing Metric

**Measuring**: Median Home Price, Median Rent, Homeownership Rate

Data for this section was pulled from the New York 2020 Census Database using the same table code from the American Community Survey (ACS) by the U.S. Census Bureau.
* Median Rent: Table DP04

#### Median Rent (DP04)

In [6]:
# Path to the raw gross-rent extract (Table DP04)
nyc_rent_data = '../ny_data/nyc_raw_data/nyc_gross_rent.csv'

# Read the CSV file into a DataFrame and preview the first rows
nyc_rent_df = pd.read_csv(nyc_rent_data)
nyc_rent_df.head()

Unnamed: 0,Label (Grouping),GROSS RENT,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Less than $500,GROSS RENT!!Occupied units paying rent!!$500 to $999,"GROSS RENT!!Occupied units paying rent!!$1,000 to $1,499","GROSS RENT!!Occupied units paying rent!!$1,500 to $1,999","GROSS RENT!!Occupied units paying rent!!$2,000 to $2,499","GROSS RENT!!Occupied units paying rent!!$2,500 to $2,999","GROSS RENT!!Occupied units paying rent!!$3,000 or more",GROSS RENT!!Occupied units paying rent!!Median (dollars),GROSS RENT!!Occupied units paying rent!!No rent paid
0,Estimate,,564817,58842,82255,97901,75759,72109,52899,125052,1787,11396
1,Percent,,564817,10.4%,14.6%,17.3%,13.4%,12.8%,9.4%,22.1%,(X),(X)


In [7]:
# Remove the bare "GROSS RENT" section-header column (empty in the preview above)
nyc_rent_df = nyc_rent_df.drop('GROSS RENT', axis=1)
nyc_rent_df.head()

Unnamed: 0,Label (Grouping),GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Less than $500,GROSS RENT!!Occupied units paying rent!!$500 to $999,"GROSS RENT!!Occupied units paying rent!!$1,000 to $1,499","GROSS RENT!!Occupied units paying rent!!$1,500 to $1,999","GROSS RENT!!Occupied units paying rent!!$2,000 to $2,499","GROSS RENT!!Occupied units paying rent!!$2,500 to $2,999","GROSS RENT!!Occupied units paying rent!!$3,000 or more",GROSS RENT!!Occupied units paying rent!!Median (dollars),GROSS RENT!!Occupied units paying rent!!No rent paid
0,Estimate,564817,58842,82255,97901,75759,72109,52899,125052,1787,11396
1,Percent,564817,10.4%,14.6%,17.3%,13.4%,12.8%,9.4%,22.1%,(X),(X)


In [9]:
# Assign short labels; the order must match the existing columns one-for-one.
# The first column (Estimate / Percent row label) is left with an empty name.
nyc_rent_df.columns = [
    '', 'Total Housing Units',
    'Less than $500', '$500 to $999',
    '$1,000 to $1,499', '$1,500 to $1,999',
    '$2,000 to $2,499', '$2,500 to $2,999',
    '$3,000 or more',
    'Median Rent Price', 'No rent paid',
]
nyc_rent_df.head()

Unnamed: 0,Unnamed: 1,Total Housing Units,Less than $500,$500 to $999,"$1,000 to $1,499","$1,500 to $1,999","$2,000 to $2,499","$2,500 to $2,999","$3,000 or more",Median Rent Price,No rent paid
0,Estimate,564817,58842,82255,97901,75759,72109,52899,125052,1787,11396
1,Percent,564817,10.4%,14.6%,17.3%,13.4%,12.8%,9.4%,22.1%,(X),(X)


In [10]:
# Write the cleaned rent table to disk, omitting the pandas index
nyc_rent_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_rent.csv', index=False)

## Quality of Life Metric

**Measuring**: Commute Time 

Data was pulled from the Census Database using the following 2020 survey table from the ACS.
* Commute Time: Table S0801 

In [39]:
# Path to the raw commuting extract (Table S0801) for New York County
nyc_commute_data = '../ny_data/nyc_raw_data/nyc_commute.csv'

# Read the CSV file into a DataFrame and display it
nyc_commute_df = pd.read_csv(nyc_commute_data)
nyc_commute_df

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Male!!Estimate","New York County, New York!!Female!!Estimate"
0,Workers 16 years and over,874997,439639,435358
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",7.7%,9.0%,6.4%
3,Drove alone,5.8%,7.1%,4.5%
4,Carpooled,1.9%,1.9%,1.9%
...,...,...,...,...
58,Private vehicle occupancy,16.1%,(X),(X)
59,Place of work,16.0%,(X),(X)
60,Time of departure to go to work,24.4%,(X),(X)
61,Travel time to work,19.4%,(X),(X)


In [40]:
# Keep only the first 14 rows of the table; the rest is not needed here
nyc_commute_df = nyc_commute_df.head(14)
nyc_commute_df.head()

Unnamed: 0,Label (Grouping),"New York County, New York!!Total!!Estimate","New York County, New York!!Male!!Estimate","New York County, New York!!Female!!Estimate"
0,Workers 16 years and over,874997,439639,435358
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",7.7%,9.0%,6.4%
3,Drove alone,5.8%,7.1%,4.5%
4,Carpooled,1.9%,1.9%,1.9%


In [42]:
# Assign short labels; the order must match the existing columns one-for-one.
# The first column (row label) is left with an empty name.
nyc_commute_df.columns = ['', 'Total Estimate', 'Male', 'Female']
nyc_commute_df.head()

Unnamed: 0,Unnamed: 1,Total Estimate,Male,Female
0,Workers 16 years and over,874997,439639,435358
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",7.7%,9.0%,6.4%
3,Drove alone,5.8%,7.1%,4.5%
4,Carpooled,1.9%,1.9%,1.9%


In [43]:
# Keep only rows with no missing values (removes section-header rows such as
# 'MEANS OF TRANSPORTATION TO WORK')
nyc_commute_df = nyc_commute_df.dropna(axis=0, how='any')

In [44]:
# Write the cleaned commute table to disk, omitting the pandas index
nyc_commute_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_commute.csv', index=False)

### Health Behaviors & Outcomes

In [12]:
# Path to the raw health-behaviors extract for New York City
nyc_health_data = '../ny_data/nyc_raw_data/nyc_health_behaviors.csv'

# Read the CSV file into a DataFrame and preview the first rows
nyc_health_df = pd.read_csv(nyc_health_data)
nyc_health_df.head()

Unnamed: 0,Entity DCID,Entity properties isoCode,Entity properties name,Variable DCID,Variable observation date,Variable observation metadata importName,Variable observation metadata provenanceUrl,Variable observation metadata scalingFactor,Variable observation metadata unit,Variable observation metadata unitDisplayName,Variable observation value,Variable properties name
0,geoId/3651000,,New York City,Percent_Person_SleepLessThan7Hours,2020,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,35.4,Percentage of Adult Population That Sleeps Les...
1,geoId/3651000,,New York City,Percent_Person_SleepLessThan7Hours,2022,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,39.5,Percentage of Adult Population That Sleeps Les...
2,geoId/3651000,,New York City,Percent_Person_Obesity,2020,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,25.2,Percentage of Adult Population That Is Obese
3,geoId/3651000,,New York City,Percent_Person_Obesity,2021,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,27.4,Percentage of Adult Population That Is Obese
4,geoId/3651000,,New York City,Percent_Person_Obesity,2022,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,27.0,Percentage of Adult Population That Is Obese


In [13]:
# Drop identifier and metadata columns, keeping only the place name, the
# observation date, the observed value and the variable description.
# Each label appears once ('Variable observation metadata importName' was
# previously listed twice).
nyc_health_df = nyc_health_df.drop(columns=[
    'Entity DCID',
    'Entity properties isoCode',
    'Variable DCID',
    'Variable observation metadata importName',
    'Variable observation metadata provenanceUrl',
    'Variable observation metadata scalingFactor',
    'Variable observation metadata unit',
    'Variable observation metadata unitDisplayName',
])
nyc_health_df.head()

Unnamed: 0,Entity properties name,Variable observation date,Variable observation value,Variable properties name
0,New York City,2020,35.4,Percentage of Adult Population That Sleeps Les...
1,New York City,2022,39.5,Percentage of Adult Population That Sleeps Les...
2,New York City,2020,25.2,Percentage of Adult Population That Is Obese
3,New York City,2021,27.4,Percentage of Adult Population That Is Obese
4,New York City,2022,27.0,Percentage of Adult Population That Is Obese


In [14]:
# Keep only the observations whose 'Variable observation date' equals 2020
nyc_health_df = nyc_health_df.loc[nyc_health_df['Variable observation date'].eq(2020)]

# Preview the filtered rows
nyc_health_df.head()

Unnamed: 0,Entity properties name,Variable observation date,Variable observation value,Variable properties name
0,New York City,2020,35.4,Percentage of Adult Population That Sleeps Les...
2,New York City,2020,25.2,Percentage of Adult Population That Is Obese
5,New York City,2020,15.6,Percentage of Adult Population Who Binge Drink
8,New York City,2020,28.7,Percentage of Adult Population Who Are Physica...
11,New York City,2020,14.3,Percentage of Adult Population That Smokes


In [15]:
# Write the cleaned health table to disk, omitting the pandas index
nyc_health_df.to_csv(path_or_buf='../ny_data/cleaned_nyc_health.csv', index=False)


In [25]:
# Export the health table as a list of row records, pretty-printed with a
# 4-space indent, to a file in the current working directory
nyc_health_df.to_json(
    path_or_buf='nyc_health_data.json',
    orient='records',
    indent=4,
)

### Creating JSON File

In [None]:
# Load the cleaned datasets for New York.
#
# This cell expects the cleaned CSVs under '../ny_data/nyc_cleaned_data/', but the
# save cells in this notebook write them directly into '../ny_data/'. To keep a
# fresh "Restart & Run All" working, each file is looked up in the expected
# folder first and, only if it is missing there, in the folder the notebook
# actually writes to.
# NOTE(review): pick one location and use it for both saving and loading.
cleaned_dir = Path('../ny_data/nyc_cleaned_data')
fallback_dir = Path('../ny_data')


def _cleaned_csv(filename):
    """Return the path to a cleaned CSV.

    Prefers ``cleaned_dir / filename``. Falls back to ``fallback_dir / filename``
    only when the preferred file is missing and the fallback exists, so that a
    file missing from both places is still reported at the original location.
    """
    preferred = cleaned_dir / filename
    fallback = fallback_dir / filename
    if preferred.exists() or not fallback.exists():
        return preferred
    return fallback


nyc_commute_csv = _cleaned_csv('cleaned_nyc_commute.csv')
nyc_unemployment_csv = _cleaned_csv('cleaned_nyc_unemployment.csv')
nyc_rent_csv = _cleaned_csv('cleaned_nyc_rent.csv')
nyc_income_csv = _cleaned_csv('cleaned_nyc_income.csv')
nyc_edu_csv = _cleaned_csv('cleaned_nyc_edu_level.csv')
nyc_age_csv = _cleaned_csv('cleaned_nyc_age.csv')
nyc_race_csv = _cleaned_csv('cleaned_nyc_race.csv')
nyc_health_csv = _cleaned_csv('cleaned_nyc_health.csv')

# Read the csv files (this replaces the in-memory frames with the on-disk versions)
nyc_commute_df = pd.read_csv(nyc_commute_csv)
nyc_unemployment_df = pd.read_csv(nyc_unemployment_csv)
nyc_rent_df = pd.read_csv(nyc_rent_csv)
nyc_income_df = pd.read_csv(nyc_income_csv)
nyc_edu_df = pd.read_csv(nyc_edu_csv)
nyc_age_df = pd.read_csv(nyc_age_csv)
nyc_race_df = pd.read_csv(nyc_race_csv)
nyc_health_df = pd.read_csv(nyc_health_csv)

In [None]:
# Bundle every cleaned table into one dictionary keyed by metric name;
# each table is stored as a list of row records (one dict per row).
nyc_data = dict(
    commute=nyc_commute_df.to_dict(orient='records'),
    unemployment=nyc_unemployment_df.to_dict(orient='records'),
    rent=nyc_rent_df.to_dict(orient='records'),
    income=nyc_income_df.to_dict(orient='records'),
    education=nyc_edu_df.to_dict(orient='records'),
    age=nyc_age_df.to_dict(orient='records'),
    race=nyc_race_df.to_dict(orient='records'),
    health=nyc_health_df.to_dict(orient='records'),
)

In [None]:
# Save the combined dictionary as a JSON file.
# The dictionary built in this notebook is `nyc_data`; the previous reference to
# `ny_data` was an undefined name and raised NameError.
with open('../ny_data/nyc_data.json', 'w') as json_file:
    json.dump(nyc_data, json_file)