# Coordinate to Census Tract

See `CATS_Tracts_Mapping.ipynb` for an example of converting a large batch of coordinates to census tracts.

# California Census Tracts EJ Metrics

The code should build a pipeline that takes a list of census tracts and returns:
- a dataframe that has all the data relevant to the provided census tracts
    - this may include multiple sources (e.g. EJScreen, LEAD, CalEnviroScreen, etc.)
- a dataframe with definitions of each column

In [4]:
import pandas as pd
import numpy as np


In [None]:
def extract_data(df, Metadata_File):
    """Select the columns of ``df`` that a metadata CSV marks as kept.

    The metadata CSV is expected to contain a ``Column`` column (names of
    columns in ``df``) and a ``Keep`` column (``1`` means "keep this column").
    If either is missing, a warning is printed and every column is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data to select columns from.
    Metadata_File : str or path-like
        Path to the metadata CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``new_data``: an independent copy of ``df`` restricted to the kept
        columns, in metadata order.
        ``df_MD``: the metadata frame (with any synthesized ``Column`` /
        ``Keep`` columns).

    Raises
    ------
    FileNotFoundError
        If ``Metadata_File`` does not exist.
    ValueError
        If a column marked as kept is not present in ``df``.
    """
    import os
    import pandas as pd

    # Check if the metadata file exists
    if not os.path.exists(Metadata_File):
        raise FileNotFoundError(f"Error: The file {Metadata_File} does not exist.")

    # Read the metadata file
    df_MD = pd.read_csv(Metadata_File)

    # Fall back to "keep everything" when the selection columns are absent
    if 'Keep' not in df_MD.columns:
        print(f"Warning: Missing `Keep` column in the metadata file ({Metadata_File}). All columns were kept")
        df_MD['Keep'] = 1
    if 'Column' not in df_MD.columns:
        print(f"Warning: Missing `Column` column in the metadata file ({Metadata_File}). All columns were kept")
        all_columns = list(df.columns)
        if len(df_MD) == len(all_columns):
            # One metadata row per data column: keep the other metadata fields.
            df_MD['Column'] = all_columns
        else:
            # Row counts differ, so a positional assignment would raise;
            # rebuild a minimal metadata table from the data itself.
            df_MD = pd.DataFrame({'Column': all_columns})
        df_MD['Keep'] = 1

    # Names of the columns flagged with Keep == 1, in metadata order
    kept_columns = df_MD.loc[df_MD['Keep'] == 1, 'Column'].tolist()

    # Every requested column must exist in the input DataFrame
    missing_columns = [col for col in kept_columns if col not in df.columns]
    if missing_columns:
        # str() guards against non-string entries (e.g. NaN from blank cells)
        raise ValueError(
            "Error: The following columns are missing from the data frame: "
            f"{', '.join(map(str, missing_columns))}"
        )

    # Explicit copy so later assignments on the result neither touch ``df``
    # nor trigger SettingWithCopyWarning.
    new_data = df[kept_columns].copy()

    return new_data, df_MD

In [104]:
def combine_dfs(data_df, new_df,new_df_col_name='CTs',data_df_col_name='Census Tract'):
    """Inner-join ``new_df`` onto ``data_df`` by their tract-key columns.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Accumulated data; joined on ``data_df_col_name``.
    new_df : pandas.DataFrame
        Data to add; joined on ``new_df_col_name``.
    new_df_col_name : str
        Key column in ``new_df``.
    data_df_col_name : str
        Key column in ``data_df``.

    Returns
    -------
    pandas.DataFrame
        Rows whose keys appear in both frames, with ``new_df``'s redundant key
        column removed.
    """
    combined_df = pd.merge(data_df, new_df, left_on=data_df_col_name, right_on=new_df_col_name, how='inner')
    # Drop the key actually used for new_df (not a hard-coded 'CTs'). When both
    # keys share a name the merge yields a single key column, which must stay.
    if new_df_col_name != data_df_col_name:
        combined_df = combined_df.drop(new_df_col_name, axis=1)
    return combined_df

In [106]:

def combine_dfs(data_df, new_df, new_df_col_name='CTs', data_df_col_name='Census Tract'):
    """Inner-join ``new_df`` onto ``data_df`` by their tract-key columns.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Accumulated data; joined on ``data_df_col_name``.
    new_df : pandas.DataFrame
        Data to add; joined on ``new_df_col_name``.
    new_df_col_name : str
        Key column in ``new_df``.
    data_df_col_name : str
        Key column in ``data_df``.

    Returns
    -------
    pandas.DataFrame
        Rows whose keys appear in both frames, with ``new_df``'s redundant key
        column removed.

    Raises
    ------
    ValueError
        If either key column is missing from its DataFrame.
    RuntimeError
        If the merge (or the key-column drop) fails; the original exception is
        attached as ``__cause__``.
    """
    import pandas as pd

    # Check if the specified columns exist in both DataFrames
    if new_df_col_name not in new_df.columns:
        raise ValueError(f"Column '{new_df_col_name}' does not exist in the new DataFrame.")

    if data_df_col_name not in data_df.columns:
        raise ValueError(f"Column '{data_df_col_name}' does not exist in the data DataFrame.")

    try:
        # Perform the merge operation
        combined_df = pd.merge(data_df, new_df, left_on=data_df_col_name, right_on=new_df_col_name, how='inner')

        # Drop new_df's key column, which duplicates data_df's key. When both
        # keys share a name the merge yields a single key column; keep it.
        if new_df_col_name != data_df_col_name:
            combined_df = combined_df.drop(new_df_col_name, axis=1)
    except Exception as e:
        # Surface any failure as RuntimeError while preserving the cause
        raise RuntimeError(f"An error occurred during the merge: {e}") from e

    return combined_df

## Census Tracts ([US Census Bureau](https://www.census.gov))

In [93]:
# Metadata CSV for the DP1 table; passed to extract_data() in a later cell.
USCB_DECENNIALDP2020_meta_file = '/Users/melek/Documents/GitHub/EnergyJustice/Data/Metadata/DECENNIALDP2020.DP1-Column-Metadata.csv'

# Raw DP1 data export, loaded from a local absolute path.
USCB_DECENNIALDP2020 = pd.read_csv(
    '/Users/melek/Documents/Data/Census/Cenus.gov/California/DECENNIALDP2020.DP1_2024-12-05T145935/DECENNIALDP2020.DP1-Data.csv',
    low_memory=False,
)

### Data Dataframe:

In [103]:
# Seed the master table with one integer tract id per census row.
# Row 0 is skipped (presumably the descriptive label row of the export -- confirm).
# The id is the last 11 characters of GEO_ID cast to int, so any leading zero is lost.
Data = pd.DataFrame(
    {'Census Tract': USCB_DECENNIALDP2020['GEO_ID'].iloc[1:].str[-11:].astype(int)}
)

### Extract data

In [95]:
# Keep only the census columns flagged in the metadata file.
USCB_df, USCB_MD = extract_data(
    df=USCB_DECENNIALDP2020,
    Metadata_File=USCB_DECENNIALDP2020_meta_file,
)

### Data cleanup

In [96]:
# Remove the row labelled 0 and renumber the remaining rows from 0.
USCB_df = USCB_df.drop(index=0).reset_index(drop=True)
# Join key: the last 11 characters of GEO_ID as an integer.
USCB_df['CTs'] = USCB_df['GEO_ID'].str.slice(start=-11).astype(int)

### Update data

In [109]:
# Attach the census columns to the master table, matching on tract id.
Data = combine_dfs(
    data_df=Data,
    new_df=USCB_df,
    new_df_col_name='CTs',
    data_df_col_name='Census Tract',
)

## EJScreen ([EPA](https://www.epa.gov/ejscreen/download-ejscreen-data))

In [116]:
# Metadata CSV for the EJScreen table; passed to extract_data() in a later cell.
EPA_EJScreen_meta_file = '/Users/melek/Documents/GitHub/EnergyJustice/Data/Metadata/EJScreen_2024_Tract_State_Percentiles_Columns.csv'

# Load the tract-level EJScreen file from a local absolute path.
EPA_EJScreen = pd.read_csv(
    '/Users/melek/Documents/Data/Energy Justice/EJScreen (Environmental Justice Screening and Mapping Tool)/Census Tracts/State Level/EJScreen_2024_Tract_StatePct_with_AS_CNMI_GU_VI.csv',
    low_memory=False,
)
# Keep only the rows whose STATE_NAME is exactly 'CALIFORNIA'.
EPA_EJScreen = EPA_EJScreen.loc[EPA_EJScreen['STATE_NAME'] == 'CALIFORNIA']

### Extract data

In [119]:
# Keep only the EJScreen columns flagged in the metadata file.
EJScreen, EJScreen_MD = extract_data(
    df=EPA_EJScreen,
    Metadata_File=EPA_EJScreen_meta_file,
)

### Data cleanup

In [124]:
# Cast the tract id to int so it matches Data['Census Tract'] for the merge.
# assign() builds a new frame, so the dtype really changes: on pandas >= 2.0,
# `.loc[:, 'ID'] = ...` writes into the existing column and keeps its old dtype.
# It also avoids SettingWithCopyWarning, since EJScreen is a slice of a filtered frame.
# drop=True discards the old row labels instead of adding a spurious 'index'
# column that would then be merged into Data.
EJScreen = EJScreen.assign(ID=EJScreen['ID'].astype(int)).reset_index(drop=True)

### Update data

In [126]:
# Attach the EJScreen columns to the master table, matching on tract id.
Data = combine_dfs(
    data_df=Data,
    new_df=EJScreen,
    new_df_col_name='ID',
    data_df_col_name='Census Tract',
)

## CalEnviroScreen 4.0

### Extract data

### Data cleanup

### Update data