# Los Angeles Dataset

In [75]:
# Import dependencies
import os
import csv
import pandas as pd
import json
from pathlib import Path

## Demographic Metric

**Measuring**: ACS Median Age, Racial Demographics, Education Level

Data was pulled from the 2020 Los Angeles Census Database and the American Community Survey (ACS) by the U.S. Census Bureau using various coded tables to ensure unified data for each metropolitan city. Two tables used the same code and were seperated to ensure proper measurement. 
* ACS Median Age: Table DP05
* ACS Race: Table DP05
* Education Level: S1501

##### ACS Median Age (Table DP05)

In [76]:
# Import csv file for ACS Population Density
la_age_data = "..\los_angeles_data\la_raw_csv\la_age.csv"

# Read the CSV file into a DataFrame
la_age_df = pd.read_csv(la_age_data)
la_age_df.head()

  la_age_data = "..\los_angeles_data\la_raw_csv\la_age.csv"


Unnamed: 0,Label (Grouping),SEX AND AGE,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,SEX AND AGE!!Total population!!25 to 34 years,SEX AND AGE!!Total population!!35 to 44 years,SEX AND AGE!!Total population!!45 to 54 years,SEX AND AGE!!Total population!!55 to 59 years,SEX AND AGE!!Total population!!60 to 64 years,SEX AND AGE!!Total population!!65 to 74 years,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years)
0,"Los Angeles city, California",,,,,,,,,,,,,,,,
1,Estimate,,3973278.0,226227,224155,225304,237653,292361,723858,571032,516798,237072,208031,290863,143939,75985,35.9
2,Percent,,3973278.0,5.7%,5.6%,5.7%,6.0%,7.4%,18.2%,14.4%,13.0%,6.0%,5.2%,7.3%,3.6%,1.9%,(X)


In [77]:
la_age_df = la_age_df.drop(columns=['SEX AND AGE'])
la_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,SEX AND AGE!!Total population!!25 to 34 years,SEX AND AGE!!Total population!!35 to 44 years,SEX AND AGE!!Total population!!45 to 54 years,SEX AND AGE!!Total population!!55 to 59 years,SEX AND AGE!!Total population!!60 to 64 years,SEX AND AGE!!Total population!!65 to 74 years,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years)
0,"Los Angeles city, California",,,,,,,,,,,,,,,
1,Estimate,3973278.0,226227,224155,225304,237653,292361,723858,571032,516798,237072,208031,290863,143939,75985,35.9
2,Percent,3973278.0,5.7%,5.6%,5.7%,6.0%,7.4%,18.2%,14.4%,13.0%,6.0%,5.2%,7.3%,3.6%,1.9%,(X)


In [78]:
# Rename columns for clarity
la_age_df.columns = ['',
  'Total Population',
  'Under 5',
  '5 to 9',
  '10 to 14',
  '15 to 19',
  '20 to 24',
  '25 to 34',
  '35 to 44',
  '45 to 54',
  '55 to 59',
  '60 to 64',
  '65 to 74',
  '75 to 84',
  '85 and Over',
  'Median Age'
]

la_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
0,"Los Angeles city, California",,,,,,,,,,,,,,,
1,Estimate,3973278.0,226227,224155,225304,237653,292361,723858,571032,516798,237072,208031,290863,143939,75985,35.9
2,Percent,3973278.0,5.7%,5.6%,5.7%,6.0%,7.4%,18.2%,14.4%,13.0%,6.0%,5.2%,7.3%,3.6%,1.9%,(X)


In [79]:
# Drop NaN values
la_age_df = la_age_df.dropna()
la_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
1,Estimate,3973278,226227,224155,225304,237653,292361,723858,571032,516798,237072,208031,290863,143939,75985,35.9
2,Percent,3973278,5.7%,5.6%,5.7%,6.0%,7.4%,18.2%,14.4%,13.0%,6.0%,5.2%,7.3%,3.6%,1.9%,(X)


In [80]:
# Save the cleaned DataFrame to a new CSV file
la_age_df.to_csv('../los_angeles_data/la_cleaned_csv/cleaned_la_age.csv', index=False)

#### Racial Demographics (Table DP05)

In [81]:
# Import csv file for race
la_race_data = "..\los_angeles_data\la_raw_csv\la_race.csv"

# Read the CSV file into a DataFrame
la_race_df = pd.read_csv(la_race_data)
la_race_df.head()

  la_race_data = "..\los_angeles_data\la_raw_csv\la_race.csv"


Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races,Race alone or in combination with one or more other races!!Total population,Race alone or in combination with one or more other races!!Total population!!White,Race alone or in combination with one or more other races!!Total population!!Black or African American,Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native,Race alone or in combination with one or more other races!!Total population!!Asian,Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Race alone or in combination with one or more other races!!Total population!!Some other race
0,"Los Angeles city, California",,,,,,,,
1,Estimate,,3973278.0,2187180,401039,62817,541935,18525,1058349
2,Percent,,3973278.0,55.0%,10.1%,1.6%,13.6%,0.5%,26.6%


In [82]:
# Rename columns to remove 'Race alone or in combination with one or more other races!!Total population!!' from the column names
la_race_df.columns = la_race_df.columns.str.replace('Race alone or in combination with one or more other races!!Total population!!', '')
la_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races,Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race
0,"Los Angeles city, California",,,,,,,,
1,Estimate,,3973278.0,2187180,401039,62817,541935,18525,1058349
2,Percent,,3973278.0,55.0%,10.1%,1.6%,13.6%,0.5%,26.6%


In [83]:
la_race_df = la_race_df.drop(columns=['Race alone or in combination with one or more other races'])
la_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race
0,"Los Angeles city, California",,,,,,,
1,Estimate,3973278.0,2187180,401039,62817,541935,18525,1058349
2,Percent,3973278.0,55.0%,10.1%,1.6%,13.6%,0.5%,26.6%


In [84]:
# Rename columns for clarity 
la_race_df.columns = ['', 
                       'Total Population',
                           'White', 
                           'Black or African American', 
                           'American Indian and Alaska Native', 
                           'Asian', 
                           'Native Hawaiian and Other Pacific Islander', 
                           'Other']

# Display
la_race_df.head()


Unnamed: 0,Unnamed: 1,Total Population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Other
0,"Los Angeles city, California",,,,,,,
1,Estimate,3973278.0,2187180,401039,62817,541935,18525,1058349
2,Percent,3973278.0,55.0%,10.1%,1.6%,13.6%,0.5%,26.6%


In [85]:
# Drop NaN values
la_race_df = la_race_df.dropna()

In [86]:
# Save as new CSV file
la_race_df.to_csv('../los_angeles_data/la_cleaned_csv/cleaned_la_race.csv', index=False)

#### Education Level (Table S1501)

In [87]:
# Import csv file for Texas edu level 
la_edu_data = "..\los_angeles_data\la_raw_csv\la_edu_lvl.csv"

# Read the CSV file into a DataFrame
la_edu_df = pd.read_csv(la_edu_data)
la_edu_df

  la_edu_data = "..\los_angeles_data\la_raw_csv\la_edu_lvl.csv"


Unnamed: 0,Label (Grouping),California!!Total!!Estimate,California!!Total!!Margin of Error,California!!Percent!!Estimate,California!!Percent!!Margin of Error,California!!Male!!Estimate,California!!Male!!Margin of Error,California!!Percent Male!!Estimate,California!!Percent Male!!Margin of Error,California!!Female!!Estimate,...,"Los Angeles city, California!!Percent!!Estimate","Los Angeles city, California!!Percent!!Margin of Error","Los Angeles city, California!!Male!!Estimate","Los Angeles city, California!!Male!!Margin of Error","Los Angeles city, California!!Percent Male!!Estimate","Los Angeles city, California!!Percent Male!!Margin of Error","Los Angeles city, California!!Female!!Estimate","Los Angeles city, California!!Female!!Margin of Error","Los Angeles city, California!!Percent Female!!Estimate","Los Angeles city, California!!Percent Female!!Margin of Error"
0,AGE BY EDUCATIONAL ATTAINMENT,,,,,,,,,,...,,,,,,,,,,
1,Population 18 to 24 years,3724239,"±1,093",(X),(X),1914100,±755,(X),(X),1810139,...,(X),(X),197761,"±2,716",(X),(X),198490,"±2,968",(X),(X)
2,Less than high school graduate,383097,"±4,474",10.3%,±0.1,229054,"±3,239",12.0%,±0.2,154043,...,11.8%,±0.4,27942,"±1,308",14.1%,±0.6,18789,"±1,094",9.5%,±0.5
3,High school graduate (includes equival...,1164734,"±7,650",31.3%,±0.2,657609,"±5,782",34.4%,±0.3,507125,...,25.6%,±0.6,56242,"±1,667",28.4%,±0.7,45313,"±1,764",22.8%,±0.9
4,Some college or associate's degree,1749440,"±9,230",47.0%,±0.2,845388,"±6,628",44.2%,±0.3,904052,...,48.4%,±0.8,90716,"±2,124",45.9%,±1.0,101038,"±2,521",50.9%,±1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Less than high school graduate,25253,±92,(X),(X),29734,±226,(X),(X),19064,...,(X),(X),25900,±276,(X),(X),18044,±337,(X),(X)
64,High school graduate (includes equival...,32560,±171,(X),(X),38059,±283,(X),(X),26830,...,(X),(X),32252,±340,(X),(X),25540,±457,(X),(X)
65,Some college or associate's degree,40683,±99,(X),(X),48230,±274,(X),(X),34162,...,(X),(X),40512,±597,(X),(X),32433,±456,(X),(X)
66,Bachelor's degree,64554,±307,(X),(X),76714,±447,(X),(X),54297,...,(X),(X),63120,±952,(X),(X),51016,±618,(X),(X)


In [88]:
# Source data (filter only what is needed out of dataset)
la_edu_df = la_edu_df.iloc[:28]

#drop all columns with "margin of error" in the column name
la_edu_df = la_edu_df.loc[:, ~la_edu_df.columns.str.contains('Margin of Error')]

#drop all columns with percent in the column name
la_edu_df = la_edu_df.loc[:, ~la_edu_df.columns.str.contains('Percent')]

#drop columns that begin with california
la_edu_df = la_edu_df.loc[:, ~la_edu_df.columns.str.startswith('California')]

la_edu_df.head()

Unnamed: 0,Label (Grouping),"Los Angeles city, California!!Total!!Estimate","Los Angeles city, California!!Male!!Estimate","Los Angeles city, California!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,396251.0,197761.0,198490.0
2,Less than high school graduate,46731.0,27942.0,18789.0
3,High school graduate (includes equival...,101555.0,56242.0,45313.0
4,Some college or associate's degree,191754.0,90716.0,101038.0


In [89]:
# Rename columns for clarity 
la_edu_df.columns = ['Age by Educational Attainment',
                          'Total', 
                            'Male',
                            'Female']
la_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Total,Male,Female
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,396251.0,197761.0,198490.0
2,Less than high school graduate,46731.0,27942.0,18789.0
3,High school graduate (includes equival...,101555.0,56242.0,45313.0
4,Some college or associate's degree,191754.0,90716.0,101038.0


In [90]:
# Drop NaN values
la_edu_df = la_edu_df.dropna()

In [91]:
# save as new csv file with cleaned data
la_edu_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_edu_lvl.csv', index=False)

  la_edu_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_edu_lvl.csv', index=False)


## Economic Metric

**Measuring**: Median lasehold Income, Unemployment Rate, Poverty Rate.

Data for this section was pulled from the la 2020 Census Database using various coded tables:
* Median lasehold Income: Table S1901
* Unemployment Rate: Table S2301
* Poverty Rate: S1501 - WE ARE CUTTING

##### Median lasehold Income (Table S1901)

In [92]:
# Import csv file from Median lasehold Income
la_income_data = '..\los_angeles_data\la_raw_csv\LA_med_household_income.csv'

# Read the CSV file into a DataFrame
la_income_df = pd.read_csv(la_income_data)
la_income_df.head()

  la_income_data = '..\los_angeles_data\la_raw_csv\LA_med_household_income.csv'


Unnamed: 0,Label (Grouping),California!!Households!!Estimate,California!!Households!!Margin of Error,California!!Families!!Estimate,California!!Families!!Margin of Error,California!!Married-couple families!!Estimate,California!!Married-couple families!!Margin of Error,California!!Nonfamily households!!Estimate,California!!Nonfamily households!!Margin of Error,"Los Angeles city, California!!Households!!Estimate","Los Angeles city, California!!Households!!Margin of Error","Los Angeles city, California!!Families!!Estimate","Los Angeles city, California!!Families!!Margin of Error","Los Angeles city, California!!Married-couple families!!Estimate","Los Angeles city, California!!Married-couple families!!Margin of Error","Los Angeles city, California!!Nonfamily households!!Estimate","Los Angeles city, California!!Nonfamily households!!Margin of Error"
0,Total,13103114,"±18,542",8986666,"±23,960",6510580,"±28,316",4116448,"±12,837",1402522,"±4,385",830131,"±4,327",542563,"±4,178",572391,"±5,787"
1,"Less than $10,000",4.7%,±0.1,3.1%,±0.1,1.4%,±0.1,9.5%,±0.1,6.6%,±0.2,4.0%,±0.1,1.7%,±0.1,11.4%,±0.3
2,"$10,000 to $14,999",3.9%,±0.1,2.1%,±0.1,1.1%,±0.1,8.4%,±0.1,5.5%,±0.2,2.9%,±0.1,1.7%,±0.1,9.6%,±0.3
3,"$15,000 to $24,999",6.9%,±0.1,5.6%,±0.1,3.8%,±0.1,10.9%,±0.1,8.6%,±0.2,8.2%,±0.2,6.2%,±0.3,10.1%,±0.3
4,"$25,000 to $34,999",7.1%,±0.1,6.4%,±0.1,4.8%,±0.1,9.3%,±0.1,8.1%,±0.2,8.1%,±0.2,6.3%,±0.2,8.6%,±0.3


In [93]:
# Droping unnecessary rows and columns
la_income_df = la_income_df.iloc[:13]

#drop all columns with "margin of error" in the column name
la_income_df = la_income_df.loc[:, ~la_income_df.columns.str.contains('Margin of Error')]

#drop all columns with percent in the column name
la_income_df = la_income_df.loc[:, ~la_income_df.columns.str.contains('Percent')]

#drop columns that begin with california
la_income_df = la_income_df.loc[:, ~la_income_df.columns.str.startswith('California')]

la_income_df.head()


Unnamed: 0,Label (Grouping),"Los Angeles city, California!!Households!!Estimate","Los Angeles city, California!!Families!!Estimate","Los Angeles city, California!!Married-couple families!!Estimate","Los Angeles city, California!!Nonfamily households!!Estimate"
0,Total,1402522,830131,542563,572391
1,"Less than $10,000",6.6%,4.0%,1.7%,11.4%
2,"$10,000 to $14,999",5.5%,2.9%,1.7%,9.6%
3,"$15,000 to $24,999",8.6%,8.2%,6.2%,10.1%
4,"$25,000 to $34,999",8.1%,8.1%,6.3%,8.6%


In [94]:
# Rename columns for clarity
la_income_df.columns = ['',
                             'lasehold', 
                              'Families', 
                              'Married-couple laseholds', 
                              'Nonfamily laseholds']
la_income_df.head()

Unnamed: 0,Unnamed: 1,lasehold,Families,Married-couple laseholds,Nonfamily laseholds
0,Total,1402522,830131,542563,572391
1,"Less than $10,000",6.6%,4.0%,1.7%,11.4%
2,"$10,000 to $14,999",5.5%,2.9%,1.7%,9.6%
3,"$15,000 to $24,999",8.6%,8.2%,6.2%,10.1%
4,"$25,000 to $34,999",8.1%,8.1%,6.3%,8.6%


In [95]:
# Save as new csv file with cleaned data
la_income_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_med_household_income.csv', index=False)

  la_income_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_med_household_income.csv', index=False)


##### Unemployment Rate (Table S2301)

In [96]:
# Import csv file from unemployment rate
la_unemployment_data = '..\los_angeles_data\la_cleaned_csv\cleaned_la_unemployment_rate.csv'

# Read the CSV file into a DataFrame
la_unemployment_df = pd.read_csv(la_unemployment_data)
la_unemployment_df.head()

  la_unemployment_data = '..\los_angeles_data\la_cleaned_csv\cleaned_la_unemployment_rate.csv'


Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,3254012,66.5%,61.9%,6.9%
1,16 to 19 years,194073,25.7%,19.7%,23.2%
2,20 to 24 years,292361,69.9%,61.9%,11.3%
3,25 to 29 years,372648,84.2%,78.1%,7.2%
4,30 to 34 years,351210,84.4%,79.2%,6.0%


In [97]:
# Filter the DataFrame to only include rows 0-11
la_unemployment_df = la_unemployment_df.iloc[0:12]

#drop columns that begin with california
la_unemployment_df = la_unemployment_df.loc[:, ~la_unemployment_df.columns.str.startswith('California')]

# Display the filtered DataFrame
la_unemployment_df.head()

Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,3254012,66.5%,61.9%,6.9%
1,16 to 19 years,194073,25.7%,19.7%,23.2%
2,20 to 24 years,292361,69.9%,61.9%,11.3%
3,25 to 29 years,372648,84.2%,78.1%,7.2%
4,30 to 34 years,351210,84.4%,79.2%,6.0%


In [98]:
# Rename columns for clarity
la_unemployment_df.columns = ['Age Group', 
                                   'Total', 
                                   'Labor Force Participation', 
                                   'Employment Population Ratio', 
                                   'Unemployment Rate']
la_unemployment_df.head()

Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,3254012,66.5%,61.9%,6.9%
1,16 to 19 years,194073,25.7%,19.7%,23.2%
2,20 to 24 years,292361,69.9%,61.9%,11.3%
3,25 to 29 years,372648,84.2%,78.1%,7.2%
4,30 to 34 years,351210,84.4%,79.2%,6.0%


In [99]:
# Drop any NaN values
la_unemployment_df = la_unemployment_df.dropna()

In [100]:
# Save the cleaned DataFrame to a new CSV file
la_unemployment_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_unemployment_rate.csv', index=False)

  la_unemployment_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_unemployment_rate.csv', index=False)


## lasing Metric

**Measuring**: Median Home Price, Median Rent, Homeownership Rate

Data for this section was pulled from the la 2020 Census Database using the same table code from the American Community Survey by the U.S. Census Bureau (ACS). 
* Median Rent: Table DP04

#### Median Rent (DP04)

In [101]:
# Import csv file 
la_rent_data = '..\los_angeles_data\la_raw_csv\la_rent.csv'

# Read the CSV file into a DataFrame
la_rent_df = pd.read_csv(la_rent_data)
la_rent_df.head()

  la_rent_data = '..\los_angeles_data\la_raw_csv\la_rent.csv'


Unnamed: 0,Label (Grouping),GROSS RENT,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Less than $500,GROSS RENT!!Occupied units paying rent!!$500 to $999,"GROSS RENT!!Occupied units paying rent!!$1,000 to $1,499","GROSS RENT!!Occupied units paying rent!!$1,500 to $1,999","GROSS RENT!!Occupied units paying rent!!$2,000 to $2,499","GROSS RENT!!Occupied units paying rent!!$2,500 to $2,999","GROSS RENT!!Occupied units paying rent!!$3,000 or more",GROSS RENT!!Occupied units paying rent!!Median (dollars),GROSS RENT!!Occupied units paying rent!!No rent paid
0,"Los Angeles city, California",,,,,,,,,,,
1,Estimate,,865877.0,44630,119637,259348,203998,116140,60439,61685,1523,18299
2,Percent,,865877.0,5.2%,13.8%,30.0%,23.6%,13.4%,7.0%,7.1%,(X),(X)


In [102]:
# Rename columns for clarity
la_rent_df.columns = ['',
                             'Total lasing Units',
                             'Occupied lasing Units Paying Rent',
                             'Less than $500',
                             '$500 to $999',
                             '$1,000 to $1,499',
                             '$1,500 to $1,999',
                             '$2,000 to $2,499',
                             '$2,500 to $2,999',
                             '$3,000 or more',
                             'Median Rent Price',
                             'No rent paid'
]
la_rent_df.head()

Unnamed: 0,Unnamed: 1,Total lasing Units,Occupied lasing Units Paying Rent,Less than $500,$500 to $999,"$1,000 to $1,499","$1,500 to $1,999","$2,000 to $2,499","$2,500 to $2,999","$3,000 or more",Median Rent Price,No rent paid
0,"Los Angeles city, California",,,,,,,,,,,
1,Estimate,,865877.0,44630,119637,259348,203998,116140,60439,61685,1523,18299
2,Percent,,865877.0,5.2%,13.8%,30.0%,23.6%,13.4%,7.0%,7.1%,(X),(X)


In [103]:
# Drop NaN values
la_rent_df = la_rent_df.dropna()

In [104]:
# Save the cleaned DataFrame to a new CSV file
la_rent_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_rent.csv', index=False)

  la_rent_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_rent.csv', index=False)


## Quality of Life Metric

**Measuring**: Commute Time 

Data was pulled from the Census Database using the following survey by ASC in 2020. 
* Commute Time: Table S0801 

In [105]:
# Import csv file from commute time
la_commute_data = '..\los_angeles_data\la_raw_csv\LA_commute_time.csv'

# Read the CSV file into a DataFrame
la_commute_df = pd.read_csv(la_commute_data)
la_commute_df

  la_commute_data = '..\los_angeles_data\la_raw_csv\LA_commute_time.csv'


Unnamed: 0,Label (Grouping),California!!Total!!Estimate,California!!Total!!Margin of Error,California!!Male!!Estimate,California!!Male!!Margin of Error,California!!Female!!Estimate,California!!Female!!Margin of Error,"Los Angeles city, California!!Total!!Estimate","Los Angeles city, California!!Total!!Margin of Error","Los Angeles city, California!!Male!!Estimate","Los Angeles city, California!!Male!!Margin of Error","Los Angeles city, California!!Female!!Estimate","Los Angeles city, California!!Female!!Margin of Error"
0,Workers 16 years and over,18239892,"±23,587",9925774,"±14,377",8314118,"±19,722",1956957,"±6,935",1058847,"±5,319",898110,"±4,858"
1,MEANS OF TRANSPORTATION TO WORK,,,,,,,,,,,,
2,"Car, truck, or van",82.0%,±0.1,82.7%,±0.1,81.2%,±0.1,76.6%,±0.3,77.8%,±0.4,75.2%,±0.4
3,Drove alone,72.1%,±0.1,73.1%,±0.1,70.9%,±0.1,67.7%,±0.3,69.2%,±0.4,65.9%,±0.4
4,Carpooled,10.0%,±0.1,9.6%,±0.1,10.3%,±0.1,8.9%,±0.2,8.5%,±0.2,9.4%,±0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Private vehicle occupancy,15.3%,(X),(X),(X),(X),(X),16.0%,(X),(X),(X),(X),(X)
59,Place of work,17.8%,(X),(X),(X),(X),(X),18.1%,(X),(X),(X),(X),(X)
60,Time of departure to go to work,25.0%,(X),(X),(X),(X),(X),25.8%,(X),(X),(X),(X),(X)
61,Travel time to work,19.1%,(X),(X),(X),(X),(X),21.8%,(X),(X),(X),(X),(X)


In [106]:
la_commute_df = la_commute_df.iloc[:14]
#drop all columns with "margin of error" in the column name
la_commute_df = la_commute_df.loc[:, ~la_commute_df.columns.str.contains('Margin of Error')]

#drop all columns with percent in the column name
la_commute_df = la_commute_df.loc[:, ~la_commute_df.columns.str.contains('Percent')]

#drop columns that begin with california
la_commute_df = la_commute_df.loc[:, ~la_commute_df.columns.str.startswith('California')]

la_commute_df.head()

Unnamed: 0,Label (Grouping),"Los Angeles city, California!!Total!!Estimate","Los Angeles city, California!!Male!!Estimate","Los Angeles city, California!!Female!!Estimate"
0,Workers 16 years and over,1956957,1058847,898110
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",76.6%,77.8%,75.2%
3,Drove alone,67.7%,69.2%,65.9%
4,Carpooled,8.9%,8.5%,9.4%


In [107]:
# Rewrite for clarity 
la_commute_df.columns = ['',
                              'Total Estimate',
                              'Male',
                              'Female'
                              ]
la_commute_df.head()

Unnamed: 0,Unnamed: 1,Total Estimate,Male,Female
0,Workers 16 years and over,1956957,1058847,898110
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",76.6%,77.8%,75.2%
3,Drove alone,67.7%,69.2%,65.9%
4,Carpooled,8.9%,8.5%,9.4%


In [108]:
# Drop NaN values
la_commute_df = la_commute_df.dropna()

In [109]:
# Save as new csv file with cleaned data
la_commute_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_commute_time.csv', index=False)

  la_commute_df.to_csv('..\los_angeles_data\la_cleaned_csv\cleaned_la_commute_time.csv', index=False)


### Creating JSON File

In [110]:
# Load the datasets from la 
la_commute_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_commute_time.csv')
la_unemployment_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_unemployment_rate.csv')
la_rent_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_rent.csv')
la_income_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_med_household_income.csv')
la_edu_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_edu_lvl.csv')
la_age_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_age.csv')
la_race_csv = Path('../los_angeles_data/la_cleaned_csv/cleaned_la_race.csv')

# Read the csv files
la_commute_df = pd.read_csv(la_commute_csv)
la_unemployment_df = pd.read_csv(la_unemployment_csv)
la_rent_df = pd.read_csv(la_rent_csv)
la_income_df = pd.read_csv(la_income_csv)
la_edu_df = pd.read_csv(la_edu_csv)
la_age_df = pd.read_csv(la_age_csv)
la_race_df = pd.read_csv(la_race_csv)

In [111]:
# Combine all data into a dictionary
los_angeles_data = {
    "commute": la_commute_df.to_dict(orient='records'),
    "unemployment": la_unemployment_df.to_dict(orient='records'),
    "rent": la_rent_df.to_dict(orient='records'),
    "income": la_income_df.to_dict(orient='records'),
    "education": la_edu_df.to_dict(orient='records'),
    "age": la_age_df.to_dict(orient='records'),
    "race": la_race_df.to_dict(orient='records')
}

In [112]:
# Save the dictionary as a JSON file
with open('../los_angeles_data/la_data.json', 'w') as json_file:
    json.dump(los_angeles_data, json_file)