In [11]:
import pandas as pd
import numpy as np
from getpass import getpass

import plotly.express as px
import plotly.graph_objects as go
import chart_studio
import chart_studio.plotly as py

# Chart Studio account name; it is the `username` argument of the (currently
# disabled) credential setup below.
username = "matthew-y-dong"
# To store Chart Studio credentials, uncomment the next two lines. The API key
# is read interactively via getpass() so it is never written into the notebook.
# api_key = getpass()
# chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [2]:
# --- Configuration: where this notebook reads and writes its data files ---
GOOGLE_DRIVE_BASE_PATH = "/content/drive/MyDrive/"
user = "Matthew"

# Data directory for each collaborator. The "..." segments look like unfilled
# placeholders for the other collaborators' Drive folders -- TODO confirm.
# Matthew's entry points at a local checkout; the Google Drive alternative is
# GOOGLE_DRIVE_BASE_PATH + "Sustainability/ER131-Project/" + "data/".
DATA_PATHS_BY_USER = {
    "Julia": GOOGLE_DRIVE_BASE_PATH + "..." + "data/",
    "Matthew": "/Users/mdong/dataScience/projects-ml/ca-waste/" + "data/",
    "Samadi": GOOGLE_DRIVE_BASE_PATH + "..." + "data/",
    "Shaye": GOOGLE_DRIVE_BASE_PATH + "..." + "data/",
}

# Fail loudly on an unknown user instead of leaving DATA_PATH undefined (or
# silently reusing a value left over from an earlier run of this cell).
if user not in DATA_PATHS_BY_USER:
    raise ValueError(
        "Unknown user {!r}; expected one of {}".format(user, sorted(DATA_PATHS_BY_USER))
    )
DATA_PATH = DATA_PATHS_BY_USER[user]

print("User: {}\nPath to data: {}".format(user, DATA_PATH))

User: Matthew
Path to data: /Users/mdong/dataScience/projects-ml/ca-waste/data/


## Waste generation

In [29]:
# Load the feature table, order it by year then county, and keep only the
# three columns used in this section.
complete_feature_df = (
    pd.read_csv(DATA_PATH + "complete_feature_df.csv")
    .sort_values(["Year", "County"])
    .loc[:, ["Year", "County", "Waste Produced (Tons)"]]
)
complete_feature_df.head()

Unnamed: 0,Year,County,Waste Produced (Tons)
911,2000.0,Alameda,1676429.25
192,2000.0,Alpine,745.0
392,2000.0,Amador,41059.9
1010,2000.0,Butte,203896.87
332,2000.0,Calaveras,34110.44


In [8]:
# `complete_feature_df` keeps only Year / County / Waste Produced (Tons), so
# plotting "Population" from it fails on a fresh Restart & Run All. Read the
# needed columns from the source CSV into a separate frame instead.
# NOTE(review): assumes the CSV has a "Population" column -- confirm.
population_df = pd.read_csv(
    DATA_PATH + "complete_feature_df.csv",
    usecols=["Year", "County", "Population"],
).sort_values(["Year", "County"])  # keep each county's line in chronological order

fig = px.line(population_df, x="Year", y="Population", color="County")
fig.show()

In [9]:
# Waste tonnage over time, one line per county.
fig = px.line(
    data_frame=complete_feature_df,
    x="Year",
    y="Waste Produced (Tons)",
    color="County",
)
fig.show()

In [33]:
# Mean waste tonnage per county over all rows currently in `complete_feature_df`.
# Select the column before aggregating and call `.mean()` directly: passing the
# `np.mean` callable to `.agg` is deprecated in recent pandas, and it also
# averaged the unused "Year" column only to throw the result away.
average_waste_produced_county = (
    complete_feature_df
    .groupby("County")[["Waste Produced (Tons)"]]
    .mean()
)
average_waste_produced_county.head()

Unnamed: 0_level_0,Waste Produced (Tons)
County,Unnamed: 1_level_1
Alameda,1388987.0
Alpine,1337.51
Amador,36809.86
Butte,281785.1
Calaveras,45511.52


In [38]:
years_to_predict = np.arange(2020, 2030)

# Baseline forecast: each county's mean tonnage, repeated unchanged for every
# year in `years_to_predict`. The scalar county name and mean are broadcast
# across the year array by the DataFrame constructor.
forecast_frames = [
    pd.DataFrame({
        "Year": years_to_predict,
        "County": county,
        "Waste Produced (Tons)": mean_tons,
    })
    for county, mean_tons in average_waste_produced_county["Waste Produced (Tons)"].items()
]

# Concatenate once: `DataFrame.append` was removed in pandas 2.0, and appending
# inside the loop recopied the whole frame on every iteration. Index labels are
# kept as-is (each forecast frame is 0..9), matching the old append behaviour.
# NOTE: this rebinds `complete_feature_df`, so re-running the cell without
# re-running the load cell adds the forecast rows a second time.
complete_feature_df = pd.concat([complete_feature_df, *forecast_frames])

In [42]:
# Sanity check: the latest year in the frame is the last forecast year.
assert complete_feature_df["Year"].max() == 2029

In [58]:
# Write the combined frame to the data directory, without the index column.
complete_feature_df.to_csv(
    f"{DATA_PATH}predictions_observations_df.csv",
    index=False,
)

In [46]:
# Pull out Alameda's rows and split them at 2019: rows up to and including
# 2019 go to `observations`, later rows to `predictions`.
alameda = complete_feature_df.loc[complete_feature_df["County"] == "Alameda"]
observations = alameda.loc[alameda["Year"] <= 2019]
predictions = alameda.loc[alameda["Year"] > 2019]
predictions.head()

Unnamed: 0,Year,County,Waste Produced (Tons)
0,2020.0,Alameda,1388987.0
1,2021.0,Alameda,1388987.0
2,2022.0,Alameda,1388987.0
3,2023.0,Alameda,1388987.0
4,2024.0,Alameda,1388987.0


In [53]:
# One line trace per segment: solid for the observed rows, dashed for the
# predicted rows.
fig = go.FigureWidget(data=[
    go.Scatter(
        x=segment["Year"],
        y=segment["Waste Produced (Tons)"],
        mode="lines",
        line=dict(dash=dash_style),
        name=trace_name,
    )
    for segment, dash_style, trace_name in (
        (observations, "solid", "Observations"),
        (predictions, "dash", "Predictions"),
    )
])
fig

FigureWidget({
    'data': [{'line': {'dash': 'solid'},
              'mode': 'lines',
              'name': '…

In [57]:
# Send the figure to Chart Studio without opening a browser tab, then display
# only the last three characters of the returned URL.
url = py.plot(
    fig,
    filename="Alameda_preds",
    auto_open=False,
)
url[-3:]

'63/'

## Waste characterization

In [2]:
# Load the per-county waste characterization table and preview the first rows.
calrecycle_waste_characterization_per_county = pd.read_csv(
    filepath_or_buffer="../data/calrecycle_waste_characterization_per_county.csv"
)
calrecycle_waste_characterization_per_county.head(5)

Unnamed: 0,County,Total Residential Tons,Electricity Usage_2014,Population_2014
0,ALAMEDA,405147,2891.232325,1607792.0
1,ALPINE,291,11.974975,1080.0
2,AMADOR,8799,134.744387,36726.0
3,BUTTE,55883,718.947064,223516.0
4,CALAVERAS,11400,192.543967,44671.0


In [3]:
# Per-county mean of every column (preview only; the result is not stored).
(
    calrecycle_waste_characterization_per_county
    .groupby("County")
    .mean()
    .head()
)

Unnamed: 0_level_0,Total Residential Tons,Electricity Usage_2014,Population_2014
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALAMEDA,405147,2891.232325,1607792.0
ALPINE,291,11.974975,1080.0
AMADOR,8799,134.744387,36726.0
BUTTE,55883,718.947064,223516.0
CALAVERAS,11400,192.543967,44671.0


In [21]:
# Bubble chart: electricity usage on x, residential waste on y, marker size by
# population, one colour per county. Log axes are left off
# (log_x=True, log_y=True would enable them).
fig = px.scatter(
    data_frame=calrecycle_waste_characterization_per_county,
    x="Electricity Usage_2014",
    y="Total Residential Tons",
    size="Population_2014",
    color="County",
    hover_data=["County"],
    title="CA County Waste vs Electricity Usage & Population (2014)",
    labels=dict([
        ("Total Residential Tons", "Total Residential Waste (Tons)"),
        ("Electricity Usage_2014", "Electricity Usage (gWh)"),
        ("Population_2014", "Population"),
    ]),
)
fig.show()

In [22]:
# Send the scatter plot to Chart Studio and open it in the browser; the
# returned URL is the cell's output.
py.plot(
    fig,
    filename="Waste-vs-Energy-Population",
    auto_open=True,
)

'https://plotly.com/~mdong127/59/'