In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
GOOGLE_DRIVE_BASE_PATH = "/content/drive/MyDrive/"
user = "Matthew" # change before running the notebook

if user == "Julia":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
elif user == "Matthew":
    DATA_PATH = "/Users/mdong/dataScience/projects-ml/ca-waste/" + "data/"
#     DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "Sustainability/ER131-Project/" + "data/"
elif user == "Samadi":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
elif user == "Shaye":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
    
print("User: {}\nPath to data: {}".format(user, DATA_PATH))

User: Matthew
Path to data: /Users/mdong/dataScience/projects-ml/ca-waste/data/


## Load in data

In [3]:
complete_feature_df = pd.read_csv(DATA_PATH + "complete_feature_df.csv")
complete_feature_df.head()

Unnamed: 0,Year,Waste Produced (Tons),County,Population,Electricity Usage (GWh)
0,2000.0,1676429.25,Alameda,1443939.0,2926.106226
1,2000.0,745.0,Alpine,1208.0,6.247035
2,2000.0,41059.9,Amador,35100.0,127.238094
3,2000.0,203896.87,Butte,203171.0,705.766172
4,2000.0,34110.44,Calaveras,40554.0,173.578409


In [4]:
complete_feature_df.set_index("Year", inplace=True)
complete_feature_df.head(3)

Unnamed: 0_level_0,Waste Produced (Tons),County,Population,Electricity Usage (GWh)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000.0,1676429.25,Alameda,1443939.0,2926.106226
2000.0,745.0,Alpine,1208.0,6.247035
2000.0,41059.9,Amador,35100.0,127.238094


In [5]:
# x = pd.to_datetime(complete_feature_df['Year'], format='%Y')
# type(x[0].year)

## Feature, target split

In [6]:
target = complete_feature_df[["Waste Produced (Tons)"]]
feature_df = complete_feature_df.drop(columns="Waste Produced (Tons)")
feature_df.head(2)

Unnamed: 0_level_0,County,Population,Electricity Usage (GWh)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000.0,Alameda,1443939.0,2926.106226
2000.0,Alpine,1208.0,6.247035


In [7]:
target.head(2)

Unnamed: 0_level_0,Waste Produced (Tons)
Year,Unnamed: 1_level_1
2000.0,1676429.25
2000.0,745.0


## Preprocessing

### One hot encoding


In [8]:
feature_df = pd.get_dummies(feature_df, columns=["County"])
feature_df.head()

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,35100.0,127.238094,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,203171.0,705.766172,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,40554.0,173.578409,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# from sklearn.preprocessing import OneHotEncoder

# one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
# one_hot_encoder.fit(X_train)

# X_train = one_hot_encoder.transform(X_train)
# X_test = one_hot_encoder.transform(X_test)

# one_hot_encoder.categories_
# X_train
# <1073x2190 sparse matrix of type '<class 'numpy.float64'>'
# with 3219 stored elements in Compressed Sparse Row format>

### TODO: Scaling

## Train test split

In [10]:
feature_df.head()

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,35100.0,127.238094,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,203171.0,705.766172,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,40554.0,173.578409,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X_train, y_train = feature_df.loc[feature_df.index!=2019], target[target.index!=2019]
X_test, y_test = feature_df.loc[feature_df.index==2019], target[target.index==2019]

In [12]:
X_train.head(2)

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
y_train.head(2)

Unnamed: 0_level_0,Waste Produced (Tons)
Year,Unnamed: 1_level_1
2000.0,1676429.25
2000.0,745.0


In [22]:
X_test

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019.0,1664783.0,3064.781376,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,1149.0,10.131788,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,37820.0,140.689119,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,221521.0,669.328499,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,45085.0,205.365353,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,21990.0,67.633093,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,1150621.0,2934.141948,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,27127.0,117.160978,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2019.0,190018.0,767.40108,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019.0,1015195.0,2796.305742,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
assert 2019 not in X_train.index.unique(), "2019 should not be in the training data"
assert 2019 not in y_train.index.unique(), "2019 should not be in the training data"

assert 2019 == X_test.index.unique(), "2019 should be the test data"
assert 2019 == y_test.index.unique(), "2019 should be the test data"

# Prediction question 1: Model selection training on the entire dataset

## Linear regression methods

## Decision tree ensemble method

- Remark: Decision trees are supposed to be able to handle categorical features but this is [not supported in sklearn](https://stackoverflow.com/questions/38108832/passing-categorical-data-to-sklearn-decision-tree) so we need to one hot encode

In [15]:
def root_mean_squared_error(actual, predicted):
    """Creating RMSE function since it's not inbuilt in sklearn
    """
    return np.sqrt(mean_squared_error(actual, predicted))

rmse = make_scorer(root_mean_squared_error, greater_is_better = False)

In [16]:
random_forest_model = RandomForestRegressor()

possible_hyperparams = { 
    'n_estimators': [10, 20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,10)]
}

grid_search = GridSearchCV(estimator=random_forest_model, param_grid=possible_hyperparams, cv=5, scoring=rmse)

In [17]:
grid_search.fit(X_train, y_train.values.ravel()) # need to call this ravel function because of https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
best_score = -grid_search.best_score_ # needs to be negated because of https://stackoverflow.com/questions/21443865/scikit-learn-cross-validation-negative-values-with-mean-squared-error
best_model = grid_search.best_estimator_




In [18]:
best_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [19]:
y_pred = best_model.predict(X_test)

In [20]:
root_mean_squared_error(y_test, y_pred)

235751.53298577736

In [21]:
r2_score(y_test, y_pred)

0.9780381642336637

# Prediction question 2: Model selection training on each individual county