In [93]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

In [2]:
GOOGLE_DRIVE_BASE_PATH = "/content/drive/MyDrive/"
user = "Matthew" # change before running the notebook

if user == "Julia":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
elif user == "Matthew":
    DATA_PATH = "/Users/mdong/dataScience/projects-ml/ca-waste/" + "data/"
#     DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "Sustainability/ER131-Project/" + "data/"
elif user == "Samadi":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
elif user == "Shaye":
    DATA_PATH = GOOGLE_DRIVE_BASE_PATH + "..." + "data/"
    
print("User: {}\nPath to data: {}".format(user, DATA_PATH))

User: Matthew
Path to data: /Users/mdong/dataScience/projects-ml/ca-waste/data/


In [113]:
complete_feature_df = pd.read_csv(DATA_PATH + "complete_feature_df.csv")
complete_feature_df.head()

Unnamed: 0,Year,Waste Produced (Tons),County,Population,Electricity Usage (GWh)
0,2000.0,1676429.25,Alameda,1443939.0,2926.106226
1,2000.0,745.0,Alpine,1208.0,6.247035
2,2000.0,41059.9,Amador,35100.0,127.238094
3,2000.0,203896.87,Butte,203171.0,705.766172
4,2000.0,34110.44,Calaveras,40554.0,173.578409


In [114]:
complete_feature_df.set_index("Year", inplace=True)
complete_feature_df.head(3)

Unnamed: 0_level_0,Waste Produced (Tons),County,Population,Electricity Usage (GWh)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000.0,1676429.25,Alameda,1443939.0,2926.106226
2000.0,745.0,Alpine,1208.0,6.247035
2000.0,41059.9,Amador,35100.0,127.238094


In [115]:
# x = pd.to_datetime(complete_feature_df['Year'], format='%Y')
# type(x[0].year)

## Feature, target split

In [116]:
target = complete_feature_df[["Waste Produced (Tons)"]]
feature_df = complete_feature_df.drop(columns="Waste Produced (Tons)")
feature_df.head(2)

Unnamed: 0_level_0,County,Population,Electricity Usage (GWh)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000.0,Alameda,1443939.0,2926.106226
2000.0,Alpine,1208.0,6.247035


In [117]:
target.head(2)

Unnamed: 0_level_0,Waste Produced (Tons)
Year,Unnamed: 1_level_1
2000.0,1676429.25
2000.0,745.0


## Preprocessing / Scaling

In [122]:
feature_df = pd.get_dummies(feature_df, columns=["County"])
feature_df.head()

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,35100.0,127.238094,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,203171.0,705.766172,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,40554.0,173.578409,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
# from sklearn.preprocessing import OneHotEncoder

# one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
# one_hot_encoder.fit(X_train)

# X_train = one_hot_encoder.transform(X_train)
# X_test = one_hot_encoder.transform(X_test)

# one_hot_encoder.categories_
# X_train
# <1073x2190 sparse matrix of type '<class 'numpy.float64'>'
# with 3219 stored elements in Compressed Sparse Row format>

## Train test split

In [124]:
feature_df.head()

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,35100.0,127.238094,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,203171.0,705.766172,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,40554.0,173.578409,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
X_train, y_train = feature_df.loc[feature_df.index!=2019], target[target.index!=2019]
X_test, y_test = feature_df.loc[feature_df.index==2019], target[target.index==2019]

In [126]:
X_train.head(2)

Unnamed: 0_level_0,Population,Electricity Usage (GWh),County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_Del Norte,...,County_Solano,County_Sonoma,County_Stanislaus,County_Tehama,County_Trinity,County_Tulare,County_Tuolumne,County_Ventura,County_Yolo,County_Yuba
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,1443939.0,2926.106226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2000.0,1208.0,6.247035,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
y_train.head(2)

Unnamed: 0_level_0,Waste Produced (Tons)
Year,Unnamed: 1_level_1
2000.0,1676429.25
2000.0,745.0


In [128]:
assert 2019 not in X_train.index.unique(), "2019 should not be in the training data"
assert 2019 not in y_train.index.unique(), "2019 should not be in the training data"

assert 2019 == X_test.index.unique(), "2019 should be the test data"
assert 2019 == y_test.index.unique(), "2019 should be the test data"

## Linear regression methods

## Decision tree ensemble methods

- Remark: Decision trees are supposed to be able to handle categorical features but this is [not supported in sklearn](https://stackoverflow.com/questions/38108832/passing-categorical-data-to-sklearn-decision-tree)

In [170]:
def root_mean_squared_error(actual, predicted):
    """
    """
    return np.sqrt(mean_squared_error(actual, predicted))

rmse = make_scorer(root_mean_squared_error, greater_is_better = False)
# mse = make_scorer(mean_squared_error, greater_is_better = False)

In [171]:
random_forest_model = RandomForestRegressor()

possible_hyperparams = { 
    'n_estimators': [10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,10)]
}

gsearch = GridSearchCV(estimator=random_forest_model, param_grid=possible_hyperparams, cv=5, scoring=rmse)

In [172]:
gsearch.fit(X_train, y_train.values.ravel())
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
best_score # -103186874308.62581



-278666.6633299263

In [175]:
np.sqrt(103186874308)

321227.13818729576

In [158]:
import sklearn

In [159]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [152]:
np.sqrt(-best_score)

315123.7079594949

In [138]:
gsearch.fit(X_train, y_train.values.ravel())
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_



In [139]:
best_score

-276514.7616603697

In [140]:
best_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [None]:
y_pred = best_model.predict(X_test)