In [1]:
import pandas as pd
import numpy as np
import os
DATA_PATH = "../calrecycle-data/"

## CalRecycle Waste Data

### Combine all individual county data into a single df

In [2]:
def filter_waste_origin_df(df, county):
    """Keep only the yearly rows of a CalRecycle sheet and tag them with a county.

    Rows whose "Report Year" is missing are dropped (these are the quarterly
    values in the sheet). The frame passed in is not modified; a new frame is
    returned.

    Input:
        - df: calrecycle df containing "Report Year" and "Diposal Ton" columns
        - county (string): value written to the "County" column of every kept row

    Output:
        - new df with the columns "Report Year", "Diposal Ton", "County"
          (row index labels of the kept rows are preserved)
    """
    # "Diposal Ton" is the literal column label being selected; the spelling
    # is intentional and must match the data.
    has_year = df["Report Year"].notnull()
    yearly_rows = df.loc[has_year, ["Report Year", "Diposal Ton"]]
    # assign() returns a new frame, so the caller's df never gains a "County" column.
    return yearly_rows.assign(County=county)

In [8]:
# Build one filtered frame per file found in DATA_PATH, then stack them.
county_waste_production = []
for f in os.listdir(DATA_PATH):
    # Derive the county label from the file name: drop the extension, keep the
    # part before "_waste_production", and turn underscores into spaces.
    filename, ext = os.path.splitext(f)
    county = filename.split("_waste_production")[0]
    county = county.replace("_", " ")
    county_file = os.path.join(DATA_PATH, f)
    print("[INFO] filtering county file: ", county_file)
    try:
        # skiprows / skipfooter drop the first 3 and last 2 rows of each sheet.
        df = pd.read_excel(county_file, skiprows=3, skipfooter=2)
        filtered_df = filter_waste_origin_df(df, county)
        county_waste_production.append(filtered_df)
    except Exception as e:
        # Loading stays best-effort (one bad file must not abort the run),
        # but report why the file was dropped instead of hiding the error.
        print("[DEBUG] Skipping: ", county, "-", repr(e))

county_waste_production_df = pd.concat(county_waste_production)

[INFO] filtering county file:  ../calrecycle-data/San_Bernardino_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Tehama_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Modoc_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/San_Benito_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Santa_Barbara_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Yolo_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Sonoma_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Tuolumne_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/San_Mateo_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Monterey_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Alpine_waste_production.xlsx
[INFO] filtering county file:  ../calrecycle-data/Solano_waste_production.xlsx
[INFO] filtering county file:

In [9]:
# Preview the first rows of the combined frame.
county_waste_production_df.head()

Unnamed: 0,Report Year,Diposal Ton,County
0,1995.0,1623266.99,San Bernardino
105,1996.0,1652495.58,San Bernardino
210,1997.0,1603700.6,San Bernardino
315,1998.0,1682447.54,San Bernardino
420,1999.0,1679816.31,San Bernardino


### Tests

- Total number of counties (excluding Sutter) * total number of years = (58 - 1) * (2019 - 1995 + 1) = total number of observations in the final df
- 2012 LA County total waste produced = 8,141,712
- 2019 Alameda County total waste produced = 1,465,264	

In [10]:
def waste_produced(county, year):
    """Look up the "Diposal Ton" entries for one county and report year.

    Reads the module-level county_waste_production_df and returns the
    "Diposal Ton" values (as a Series) of every row whose "County" equals
    `county` and whose "Report Year" equals `year`.
    """
    frame = county_waste_production_df
    matches = (frame["County"] == county) & (frame["Report Year"] == year)
    return frame.loc[matches, "Diposal Ton"]

In [11]:
# Row-count check: (58 - 1) counties times the number of years from 1995 through 2019 inclusive.
assert county_waste_production_df.shape[0] == (58 - 1) * (2019 - 1995 + 1)
# Spot checks: compare single county/year lookups against the expected totals.
assert np.isclose(1465264, waste_produced("Alameda", 2019))
assert np.isclose(8141712, waste_produced("Los Angeles", 2012))

### Save cleaned waste production csv

In [12]:
# Write the combined frame to CSV; index=False keeps the row index out of the file.
county_waste_production_df.to_csv("../data/calrecycle_waste_production_per_county.csv", index=False)

## Census Population Data