In [73]:
import os
from pathlib import Path
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

In [74]:
with open("data/df_agriculture_cleaned.pkl", "rb") as file:
    df_agriculture_cleaned = pickle.load(file)

In [75]:
train_set, test_set = train_test_split(df_agriculture_cleaned, test_size=0.2, random_state=42)

In [76]:
train_set = train_set[~train_set.isin([np.nan, np.inf, -np.inf]).any(1)]

In [77]:
test_set = test_set[~test_set.isin([np.nan, np.inf, -np.inf]).any(1)]

In [78]:
price_train_features = train_set.loc[:,['food_supply_percentage',
                       'feed_supply_percentage',
                       'export_supply_percentage',
                       'other_use_supply_percentage']]

In [79]:
price_test_features = test_set.loc[:,['food_supply_percentage',
                       'feed_supply_percentage',
                       'export_supply_percentage',
                       'other_use_supply_percentage']]

In [80]:
price_train_target = train_set.loc[:,'Price Level']

In [81]:
price_lr = LinearRegression()

In [82]:
price_lr.fit(price_train_features, price_train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [83]:
price_lr_predictions = price_lr.predict(price_test_features)

In [84]:
price_lr_mse = mean_squared_error(test_set.loc[:,'Price Level'], price_lr_predictions)

In [85]:
price_lr_rmse = np.sqrt(price_lr_mse)

In [86]:
price_lr_rmse

0.9879945390801739

In [87]:
price_dtr = DecisionTreeRegressor()

In [88]:
price_dtr.fit(price_train_features, price_train_target)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [89]:
price_dtr_predictions = price_dtr.predict(price_test_features)

In [51]:
price_dtr_mse = mean_squared_error(test_set.loc[:,'Price USD'], price_dtr_predictions)

In [52]:
price_dtr_rmse = np.sqrt(price_dtr_mse)

In [53]:
price_dtr_rmse

3488868794.8392634

In [54]:
price_rfr = RandomForestRegressor()

In [55]:
price_rfr.fit(price_train_features, price_train_target)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [57]:
price_rfr_predictions = price_rfr.predict(price_test_features)

In [58]:
price_rfr_mse = mean_squared_error(test_set.loc[:,'Price USD'], price_rfr_predictions)

In [59]:
price_rfr_rmse = np.sqrt(price_rfr_mse)

In [60]:
price_rfr_rmse

2860725703.210372

In [61]:
price_lr_scores = cross_val_score(price_lr, price_train_features, price_train_target,
                           scoring="neg_mean_squared_error", cv=10)

In [62]:
price_lr_rmse_scores = np.sqrt(-price_lr_scores)

In [63]:
print("Price LR Mean: {}".format(price_lr_rmse_scores.mean()))
print("Price LR STD: {}".format(price_lr_rmse_scores.std()))

Price LR Mean: 1617758930.0360808
Price LR STD: 1224647002.825478


In [64]:
price_dtr_scores = cross_val_score(price_dtr, price_train_features, price_train_target,
                                  scoring="neg_mean_squared_error", cv=10)

In [65]:
price_dtr_rmse_scores = np.sqrt(-price_dtr_scores)

In [66]:
print("Price DTR Mean: {}".format(price_dtr_rmse_scores.mean()))
print("Price DTR STD: {}".format(price_dtr_rmse_scores.std()))

Price DTR Mean: 1309627518.6816442
Price DTR STD: 1189350124.9377856


In [67]:
price_rfr_scores = cross_val_score(price_rfr, price_train_features, price_train_target,
                                   scoring="neg_mean_squared_error", cv=10)

In [68]:
price_rfr_rmse_scores = np.sqrt(-price_rfr_scores)

In [69]:
print("Price RFR Mean: {}".format(price_rfr_rmse_scores.mean()))
print("Price RFR STD: {}".format(price_rfr_rmse_scores.std()))

Price RFR Mean: 1222338086.5549366
Price RFR STD: 605570692.5669394


In [70]:
hparams_grid= [
    {'n_estimators': [30, 40, 50], 'max_features': [3, 5]},
    {'bootstrap': [False], 'n_estimators': [30, 40], 'max_features': [4, 5]}
]

In [71]:
rfr = RandomForestRegressor()

gs = GridSearchCV(rfr, hparams_grid, cv=5,
                 scoring="neg_mean_squared_error",
                 return_train_score=True)

In [72]:
gs.fit(price_train_features, price_train_target)



ValueError: max_features must be in (0, n_features]

In [58]:
gs.best_params_

{'max_features': 3, 'n_estimators': 50}

In [59]:
gs.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [60]:
cvres = gs.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

12003.010966105954 {'max_features': 3, 'n_estimators': 30}
12229.00985707594 {'max_features': 3, 'n_estimators': 40}
11887.682029844149 {'max_features': 3, 'n_estimators': 50}
12304.888558318871 {'max_features': 5, 'n_estimators': 30}
12075.149750791032 {'max_features': 5, 'n_estimators': 40}
12287.806364811775 {'max_features': 5, 'n_estimators': 50}
12326.176970186278 {'bootstrap': False, 'max_features': 4, 'n_estimators': 30}
12209.969769624036 {'bootstrap': False, 'max_features': 4, 'n_estimators': 40}
15510.743407822813 {'bootstrap': False, 'max_features': 5, 'n_estimators': 30}
15532.292041217579 {'bootstrap': False, 'max_features': 5, 'n_estimators': 40}


In [61]:
yield_feature_importances = gs.best_estimator_.feature_importances_

In [62]:
yield_feature_importances
sorted(zip(yield_feature_importances, ['food_supply_percentage', 'feed_supply_percentage', 'export_supply_percentage', 'other_use_supply_percentage', 'Fertilizer']
), reverse=True)

[(0.29333817237918736, 'food_supply_percentage'),
 (0.2125607633671596, 'other_use_supply_percentage'),
 (0.19194457910965995, 'feed_supply_percentage'),
 (0.15589389434570505, 'export_supply_percentage'),
 (0.14626259079828816, 'Fertilizer')]

In [63]:
final_yield_model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [64]:
final_yield_model.fit(yield_train_features, yield_train_target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [65]:
final_yield_predictions = final_yield_model.predict(yield_test_features)

In [66]:
final_yield_mse = mean_squared_error(test_set.loc[:,'Yield'], final_yield_predictions)

In [67]:
final_yield_rmse = np.sqrt(final_yield_mse)

In [68]:
final_yield_rmse

9837.024900608043

In [69]:
final_yield_scores = cross_val_score(final_yield_model, yield_test_features, test_set.loc[:,'Yield'],
                           scoring="neg_mean_squared_error", cv=10)

In [70]:
final_yield_rmse_scores = np.sqrt(-final_yield_scores)

In [71]:
print("Final Model Scores Mean: {}".format(final_yield_rmse_scores.mean()))
print("Final Model Scores STD: {}".format(final_yield_rmse_scores.std()))

Final Model Scores Mean: 14512.276858178182
Final Model Scores STD: 3317.1088222248795
