In [2]:
import numpy as np
import pandas as pd
import json
from urllib.parse import urlsplit
from collections import OrderedDict
from getpass import getpass

import plotly.express as px
import plotly.graph_objects as go
import chart_studio
import chart_studio.plotly as py
import folium

# username = "matthew-y-dong"
# api_key = getpass()
# chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

BASE_PATH = "/Users/mdong/dataScience/projects-ml/ca-waste/"
DATA_PATH = BASE_PATH + "data/"
GEOSPATIAL_DATA_PATH = DATA_PATH + "geospatial-data/"

## CA Geospatial visualization

In [8]:
complete_feature_df = pd.read_csv(DATA_PATH + "complete_feature_df.csv")
complete_feature_df.head()

Unnamed: 0,Year,Waste Produced (Tons),County,Population,Electricity Usage (GWh)
0,2000.0,1676429.25,Alameda,1443939.0,2926.106226
1,2000.0,745.0,Alpine,1208.0,6.247035
2,2000.0,41059.9,Amador,35100.0,127.238094
3,2000.0,203896.87,Butte,203171.0,705.766172
4,2000.0,34110.44,Calaveras,40554.0,173.578409


In [9]:
complete_feature_df_2019 = complete_feature_df[complete_feature_df.Year == 2019]
complete_feature_df_2019["Waste Produced"] = np.log10(complete_feature_df_2019["Waste Produced (Tons)"])# / 1e6
complete_feature_df_2019.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Year,Waste Produced (Tons),County,Population,Electricity Usage (GWh),Waste Produced
1073,2019.0,1465263.51,Alameda,1664783.0,3064.781376,6.165916
1074,2019.0,566.18,Alpine,1149.0,10.131788,2.752955
1075,2019.0,35768.39,Amador,37820.0,140.689119,4.553499
1076,2019.0,1818063.23,Butte,221521.0,669.328499,6.259609
1077,2019.0,35660.9,Calaveras,45085.0,205.365353,4.552192


In [10]:
waste_heat_map = folium.Map(location=(37.5, -119.5), zoom_start=6, min_zoom=5, tiles='cartodbpositron')

ca_counties_geojson_path = GEOSPATIAL_DATA_PATH + "ca-counties.geojson"

folium.Choropleth(
    geo_data=ca_counties_geojson_path,
    data=complete_feature_df_2019,
    columns=['County', 'Waste Produced'],
    key_on='feature.id',
    fill_color='BuPu',
    legend_name='Waste Produced (Log 10 Tons)'
).add_to(waste_heat_map)

# waste_heat_map.save(BASE_PATH + 'docs/ca-waste.html')

waste_heat_map

## Creating prediction graphs

In [6]:
average_waste_produced_county = complete_feature_df.groupby("County").agg(np.mean)[["Waste Produced (Tons)"]]
average_waste_produced_county.head()

Unnamed: 0_level_0,Waste Produced (Tons)
County,Unnamed: 1_level_1
Alameda,1388987.0
Alpine,1337.51
Amador,36809.86
Butte,281785.1
Calaveras,45511.52


### TODO: replace this with actual predictions later

In [93]:
average_waste_produced = 1.388987e+06
years_to_predict = np.arange(2020, 2025)
12 ** (years_to_predict % 2019 ) + average_waste_produced

array([1388999., 1389131., 1390715., 1409723., 1637819.])

In [132]:
complete_feature_df = pd.read_csv(DATA_PATH + "complete_feature_df.csv")

In [133]:
years_to_predict = np.arange(2020, 2025)
for county in average_waste_produced_county.index:
    average_waste_produced = average_waste_produced_county.loc[county, "Waste Produced (Tons)"]
    county_name = np.repeat(county, len(years_to_predict))
    dummy_predictions = 12 ** (years_to_predict % 2019) + average_waste_produced
#     average = np.repeat(average_waste_produced, len(years_to_predict))
    df = pd.DataFrame({"Year": years_to_predict, 
                       "County": county_name,
                       "Waste Produced (Tons)": dummy_predictions
                      })
    complete_feature_df = complete_feature_df.append(df, sort=False)

In [134]:
assert complete_feature_df.Year.max() == 2024

In [135]:
timeforecast_predictions = complete_feature_df[["Year", "County", "Waste Produced (Tons)"]]
timeforecast_predictions.sort_values(["Year", "County"], inplace=True)
timeforecast_predictions.head(3)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Year,County,Waste Produced (Tons)
0,2000.0,Alameda,1676429.25
1,2000.0,Alpine,745.0
2,2000.0,Amador,41059.9


In [136]:
timeforecast_predictions.to_csv(DATA_PATH + "timeforecast_predictions.csv", index=False)

## Create a visualization for every county

In [137]:
timeforecast_predictions = pd.read_csv(DATA_PATH + "timeforecast_predictions.csv")
timeforecast_predictions.head()

Unnamed: 0,Year,County,Waste Produced (Tons)
0,2000.0,Alameda,1676429.25
1,2000.0,Alpine,745.0
2,2000.0,Amador,41059.9
3,2000.0,Butte,203896.87
4,2000.0,Calaveras,34110.44


In [4]:
predictions_observations = pd.read_csv(DATA_PATH + "predictions_observations.csv")
predictions_observations.head()

Unnamed: 0,Year,County,Observed Waste Produced,Predicted Waste Produced
0,2000.0,Alameda,1676429.25,1402038.0
1,2000.0,Alpine,745.0,2326.535
2,2000.0,Amador,41059.9,31398.48
3,2000.0,Butte,203896.87,197706.2
4,2000.0,Calaveras,34110.44,32372.63


In [5]:
def generate_prediction_plot(df, county, timeforecast=False):
    """Generates plotly line chart for a given county depicting observed waste trends
    and predicted values
    """
    if timeforecast:
        county_preds_obs = df[df.County == county]
        observations = county_preds_obs[county_preds_obs.Year <= 2019]
        predictions = county_preds_obs[county_preds_obs.Year > 2019]
        fig = go.FigureWidget(data=[
        go.Scatter(x=observations.Year, y=observations["Waste Produced (Tons)"], 
                   mode='lines', line={'dash': 'solid'}, name="Observed"),
        go.Scatter(x=predictions.Year, y=predictions["Waste Produced (Tons)"], 
                   mode='lines', line={'dash': 'dash'}, name="Predicted")
        ])
    else:
        county_preds_obs = county_preds_obs = df[df.County == county]
        observations = county_preds_obs["Observed Waste Produced"]
        predictions = county_preds_obs["Predicted Waste Produced"]
        fig = go.FigureWidget(data=[
        go.Scatter(x=county_preds_obs.Year, y=observations, 
                   mode='lines', line={'dash': 'solid'}, name="Observed"),
        go.Scatter(x=county_preds_obs.Year, y=predictions, 
                   mode='lines', line={'dash': 'dash'}, name="Predicted")
        ])
    
    fig.update_xaxes(title_text='Year')
    fig.update_yaxes(title_text='Waste Produced (Tons)')
    fig.update_layout(title=county + " Observed and Predicted Values")
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.25,
        xanchor="left",
        x=0.8
    ))
    return fig
#     plot_url = py.plot(fig, filename = county + '_obs_preds', auto_open=False)
#     print("[INFO] {} url: {}".format(county, plot_url))
#     path = urlsplit(plot_url).path
#     url_identifer = path.split("/")[2]
#     return url_identifer

### Note: Delete all existing plotly visualizations before regenerating new ones

In [6]:
generate_prediction_plot(predictions_observations, "Alameda")

FigureWidget({
    'data': [{'line': {'dash': 'solid'},
              'mode': 'lines',
              'name': '…

In [12]:
counties_list = predictions_observations.County.unique()
county_plotly_url_map = OrderedDict()

for county in counties_list:
    county_plotly_url_map[county] = generate_prediction_plot(county)

[INFO] Alameda url: https://plotly.com/~matthew-y-dong/120/
[INFO] Alpine url: https://plotly.com/~matthew-y-dong/122/
[INFO] Amador url: https://plotly.com/~matthew-y-dong/124/
[INFO] Butte url: https://plotly.com/~matthew-y-dong/126/
[INFO] Calaveras url: https://plotly.com/~matthew-y-dong/128/
[INFO] Colusa url: https://plotly.com/~matthew-y-dong/130/
[INFO] Contra Costa url: https://plotly.com/~matthew-y-dong/132/
[INFO] Del Norte url: https://plotly.com/~matthew-y-dong/134/
[INFO] El Dorado url: https://plotly.com/~matthew-y-dong/136/
[INFO] Fresno url: https://plotly.com/~matthew-y-dong/138/
[INFO] Glenn url: https://plotly.com/~matthew-y-dong/140/
[INFO] Humboldt url: https://plotly.com/~matthew-y-dong/142/
[INFO] Imperial url: https://plotly.com/~matthew-y-dong/144/
[INFO] Inyo url: https://plotly.com/~matthew-y-dong/147/
[INFO] Kern url: https://plotly.com/~matthew-y-dong/149/
[INFO] Kings url: https://plotly.com/~matthew-y-dong/151/
[INFO] Lake url: https://plotly.com/~matthe

PlotlyRequestError: Hi there, you've reached the threshold of 100 combined image exports and chart saves per 24h period. If you need to raise your daily limit, please consider upgrading to a paid plan.

In [13]:
json.dumps(county_plotly_url_map)

'{"Alameda": "120", "Alpine": "122", "Amador": "124", "Butte": "126", "Calaveras": "128", "Colusa": "130", "Contra Costa": "132", "Del Norte": "134", "El Dorado": "136", "Fresno": "138", "Glenn": "140", "Humboldt": "142", "Imperial": "144", "Inyo": "147", "Kern": "149", "Kings": "151", "Lake": "153", "Lassen": "155", "Los Angeles": "157", "Madera": "159", "Marin": "161", "Mariposa": "163", "Mendocino": "165", "Merced": "167", "Modoc": "169", "Mono": "171", "Monterey": "173", "Napa": "175", "Nevada": "177", "Orange": "179", "Placer": "181", "Plumas": "183", "Riverside": "185", "Sacramento": "187", "San Benito": "189", "San Bernardino": "191", "San Diego": "193", "San Francisco": "195", "San Joaquin": "197", "San Luis Obispo": "199"}'

---

## Waste characterization

In [4]:
waste_breakdown = pd.read_csv(DATA_PATH + "calrecycle_waste_characterization_per_county.csv")
waste_breakdown.head()

Unnamed: 0,Material Category,Material Type,County,Single Family Tons,Regional Single Family Composition,Multi Family Tons,Statewide Multi Family Composition,Total Residential Tons,Total Residential Composition
0,Paper,Uncoated Corrugated Cardboard,CONTRA COSTA,1536,0.006688,1931,0.035679,3467,0.012218
1,Paper,Paper Bags,CONTRA COSTA,719,0.003132,286,0.005275,1005,0.003541
2,Paper,Newspaper,CONTRA COSTA,1665,0.00725,2513,0.046431,4178,0.014724
3,Paper,White Ledger Paper,CONTRA COSTA,426,0.001854,286,0.005287,712,0.002509
4,Paper,Other Office Paper,CONTRA COSTA,1097,0.004775,319,0.005888,1415,0.004987


In [5]:
waste_breakdown["Material Category"].unique()

array(['Paper', 'Glass', 'Metal', 'Electronics', 'Plastic',
       'Other Organic', 'Inerts and Other',
       'Household Hazardous Waste (HHW)', 'Special Waste',
       'Mixed Residue'], dtype=object)

In [6]:
e_waste = waste_breakdown[waste_breakdown["Material Category"] == "Electronics"]
e_waste.head()

Unnamed: 0,Material Category,Material Type,County,Single Family Tons,Regional Single Family Composition,Multi Family Tons,Statewide Multi Family Composition,Total Residential Tons,Total Residential Composition
24,Electronics,Brown Goods,CONTRA COSTA,0,0.0,245,0.004534,245,0.000865
25,Electronics,Computer-related Electronics,CONTRA COSTA,131,0.00057,109,0.002017,240,0.000846
26,Electronics,Other Small Consumer Electronics,CONTRA COSTA,384,0.001671,142,0.002617,525,0.001851
27,Electronics,Video Display Devices,CONTRA COSTA,1651,0.007191,354,0.006533,2005,0.007065
92,Electronics,Brown Goods,SIERRA,0,0.0,0,0.004534,0,0.000428


In [7]:
e_waste["Material Type"].unique()

array(['Brown Goods', 'Computer-related Electronics',
       'Other Small Consumer Electronics', 'Video Display Devices'],
      dtype=object)

In [8]:
alameda_ewaste = e_waste[e_waste["County"] == "ALAMEDA"]
alameda_ewaste

Unnamed: 0,Material Category,Material Type,County,Single Family Tons,Regional Single Family Composition,Multi Family Tons,Statewide Multi Family Composition,Total Residential Tons,Total Residential Composition
976,Electronics,Brown Goods,ALAMEDA,0,0.0,538,0.004534,538,0.001327
977,Electronics,Computer-related Electronics,ALAMEDA,163,0.00057,239,0.002017,403,0.000994
978,Electronics,Other Small Consumer Electronics,ALAMEDA,479,0.001671,310,0.002617,789,0.001948
979,Electronics,Video Display Devices,ALAMEDA,2061,0.007191,775,0.006533,2835,0.006998


In [11]:
fig = px.bar(alameda_ewaste, x='Material Type', y='Total Residential Tons')
fig.update_layout(title_text='Alameda Electronic Waste Breakdown')
py.plot(fig, filename = "alameda_ewaste", auto_open=False)
fig.show()

In [22]:
per_county_ewaste = e_waste.groupby("County").agg(sum)[["Total Residential Tons"]].reset_index()
per_county_ewaste.sort_values("Total Residential Tons", inplace=True)
per_county_ewaste.head()

Unnamed: 0,County,Total Residential Tons
1,ALPINE,5
45,SIERRA,11
24,MODOC,35
52,TRINITY,49
25,MONO,50


In [43]:
fig = px.bar(per_county_ewaste.tail(25), x='County', y='Total Residential Tons')
fig.update_layout(xaxis_tickangle=-45)
fig.update_layout(title_text='Electronic Waste Generated per CA County (Top 25)')
py.plot(fig, filename = "per_county_ewaste_fig", auto_open=False)
fig.show()

---

## plotly sandbox

In [None]:
county = "Alameda"
county_preds_obs = timeforecast_predictions[timeforecast_predictions.County == county]
observations = county_preds_obs[county_preds_obs.Year <= 2019]
predictions = county_preds_obs[county_preds_obs.Year > 2019]
fig = go.FigureWidget(data=[
    go.Scatter(x=observations.Year, y=observations["Waste Produced (Tons)"], 
               mode='lines', line={'dash': 'solid'}, name="Observed"),
    go.Scatter(x=predictions.Year, y=predictions["Waste Produced (Tons)"], 
               mode='lines', line={'dash': 'dash'}, name="Predicted")
    ])
fig.update_xaxes(title_text='Year')
fig.update_yaxes(title_text='Waste Produced (Tons)')
fig.update_layout(title=county + " Observed and Predicted Values")
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.25,
    xanchor="left",
    x=0.8
))

In [None]:
# url_identifer = generate_prediction_plot("Alpine")
# url_identifer