In [1]:
import os
from pathlib import Path
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

In [2]:
with open("data/df_agriculture_cleaned.pkl", "rb") as file:
    df_agriculture_cleaned = pickle.load(file)

In [3]:
train_set, test_set = train_test_split(df_agriculture_cleaned, test_size=0.2, random_state=42)

In [4]:
train_set = train_set[~train_set.isin([np.nan, np.inf, -np.inf]).any(1)]
test_set = test_set[~test_set.isin([np.nan, np.inf, -np.inf]).any(1)]

In [5]:
food_supply_train_features = train_set.loc[:,['other_use_supply_percentage', 'feed_supply_percentage', 'export_supply_percentage', 'import_supply_percentage']]
food_supply_train_target = train_set.loc[:,'food_supply_percentage']

food_supply_test_features = test_set.loc[:,['other_use_supply_percentage', 'feed_supply_percentage', 'export_supply_percentage', 'import_supply_percentage']]
food_supply_test_target = test_set.loc[:,'food_supply_percentage']

In [6]:
def useLinearRegression(train_features, train_target, test_features, test_target):
    lr = LinearRegression()
    lr.fit(train_features, train_target)
    lr_predictions = lr.predict(test_features)
    lr_mse = mean_squared_error(test_target, lr_predictions)
    lr_rmse = np.sqrt(lr_mse)
    lr_cvs_scores = cross_val_score(lr, train_features, train_target, scoring="neg_mean_squared_error", cv=10)
    lr_rmse_scores = np.sqrt(-lr_cvs_scores)
    lr_rmse_mean_score = lr_rmse_scores.mean()
    lr_rmse_std_score = lr_rmse_scores.std()
    return [lr_rmse, lr_rmse_mean_score, lr_rmse_std_score]

In [7]:
lr_food_supply_scores = useLinearRegression(food_supply_train_features, food_supply_train_target, food_supply_test_features, food_supply_test_target)

In [8]:
print("Food Supply - Linear Regression - Root Mean Squared Error: {}".format(lr_food_supply_scores[0]))
print("Food Supply - Linear Regression - Mean: {}".format(lr_food_supply_scores[1]))
print("Food Supply - Linear Regression - STD: {}".format(lr_food_supply_scores[2]))

Food Supply - Linear Regression - Root Mean Squared Error: 6.108048483033716
Food Supply - Linear Regression - Mean: 5.9768286615664845
Food Supply - Linear Regression - STD: 0.6758107508172784


In [9]:
def useDecisionTreeRegressor(train_features, train_target, test_features, test_target):
    dtr = DecisionTreeRegressor()
    dtr.fit(train_features, train_target)
    dtr_predictions = dtr.predict(test_features)
    dtr_mse = mean_squared_error(test_target, dtr_predictions)
    dtr_rmse = np.sqrt(dtr_mse)
    dtr_scores = cross_val_score(dtr, train_features, train_target, scoring="neg_mean_squared_error", cv=10)
    dtr_rmse_scores = np.sqrt(-dtr_scores)
    dtr_rmse_mean_score = dtr_rmse_scores.mean()
    dtr_rmse_std_score = dtr_rmse_scores.std()
    return [dtr_rmse, dtr_rmse_mean_score, dtr_rmse_std_score]

In [10]:
dtr_food_supply_scores = useDecisionTreeRegressor(food_supply_train_features, food_supply_train_target, food_supply_test_features, food_supply_test_target)

In [11]:
print("Food Supply - Decision Tree Regression - Root Mean Squared Error: {}".format(dtr_food_supply_scores[0]))
print("Food Supply - Decision Tree Regression - Mean: {}".format(dtr_food_supply_scores[1]))
print("Food Supply - Decision Tree Regression - STD: {}".format(dtr_food_supply_scores[2]))

Food Supply - Decision Tree Regression - Root Mean Squared Error: 3.9676878035278405
Food Supply - Decision Tree Regression - Mean: 5.075420276579883
Food Supply - Decision Tree Regression - STD: 2.0469622844887394


In [12]:
def useRandomForestRegressor(train_features, train_target, test_features, test_target):
    rfr = RandomForestRegressor()
    rfr.fit(train_features, train_target)
    rfr_predictions = rfr.predict(test_features)
    rfr_mse = mean_squared_error(test_target, rfr_predictions)
    rfr_rmse = np.sqrt(rfr_mse)
    rfr_cvs_scores = cross_val_score(rfr, train_features, train_target, scoring="neg_mean_squared_error", cv=10)
    rfr_rmse_scores = np.sqrt(-rfr_cvs_scores)
    rfr_rmse_mean_score = rfr_rmse_scores.mean()
    rfr_rmse_std_score = rfr_rmse_scores.std()
    return [rfr_rmse, rfr_rmse_mean_score, rfr_rmse_std_score]

In [13]:
rfr_food_supply_scores = useRandomForestRegressor(food_supply_train_features, food_supply_train_target, food_supply_test_features, food_supply_test_target)



In [14]:
print("Food Supply - Random Forest Regression - Root Mean Squared Error: {}".format(rfr_food_supply_scores[0]))
print("Food Supply - Random Forest Regression - Mean: {}".format(rfr_food_supply_scores[1]))
print("Food Supply - Random Forest Regression - STD: {}".format(rfr_food_supply_scores[2]))

Food Supply - Random Forest Regression - Root Mean Squared Error: 3.9081853412462886
Food Supply - Random Forest Regression - Mean: 4.240573854104936
Food Supply - Random Forest Regression - STD: 1.5160876849538794


In [15]:
hparams_grid= [
    {'n_estimators': [20, 30, 40, 50], 'max_features': [2, 4]},
    {'bootstrap': [False], 'n_estimators': [30, 40, 50, 60], 'max_features': [3, 4]}
]

In [16]:
rfr_model = RandomForestRegressor(random_state=0)

gs = GridSearchCV(rfr_model, hparams_grid, cv=5,
                 scoring="neg_mean_squared_error",
                 return_train_score=True)

In [17]:
gs.fit(food_supply_train_features, food_supply_train_target)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4],
         

In [18]:
gs.best_params_

{'bootstrap': False, 'max_features': 3, 'n_estimators': 40}

In [19]:
gs.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [20]:
cvres = gs.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

4.776660562992186 {'max_features': 2, 'n_estimators': 20}
4.535641919674319 {'max_features': 2, 'n_estimators': 30}
4.550536123847723 {'max_features': 2, 'n_estimators': 40}
4.456221809469817 {'max_features': 2, 'n_estimators': 50}
4.576296410855213 {'max_features': 4, 'n_estimators': 20}
4.553151967212738 {'max_features': 4, 'n_estimators': 30}
4.504727448579877 {'max_features': 4, 'n_estimators': 40}
4.496003882090672 {'max_features': 4, 'n_estimators': 50}
4.064396041011997 {'bootstrap': False, 'max_features': 3, 'n_estimators': 30}
4.027579602653438 {'bootstrap': False, 'max_features': 3, 'n_estimators': 40}
4.079688012837597 {'bootstrap': False, 'max_features': 3, 'n_estimators': 50}
4.058532621695126 {'bootstrap': False, 'max_features': 3, 'n_estimators': 60}
5.225559903561239 {'bootstrap': False, 'max_features': 4, 'n_estimators': 30}
5.222627636709599 {'bootstrap': False, 'max_features': 4, 'n_estimators': 40}
5.219667983149594 {'bootstrap': False, 'max_features': 4, 'n_estimat

In [21]:
food_supply_feature_importances = gs.best_estimator_.feature_importances_

In [22]:
sorted(zip(food_supply_feature_importances, ['other_use_supply_percentage', 'feed_supply_percentage', 'export_supply_percentage', 'import_supply_percentage']), reverse=True)

[(0.8616603989548646, 'feed_supply_percentage'),
 (0.07441377152083993, 'other_use_supply_percentage'),
 (0.03870460128767835, 'export_supply_percentage'),
 (0.02522122823661716, 'import_supply_percentage')]

In [23]:
food_supply_model = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [24]:
food_supply_model.fit(food_supply_train_features, food_supply_train_target)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [25]:
food_supply_predictions = food_supply_model.predict(food_supply_test_features)

In [26]:
food_supply_mse = mean_squared_error(food_supply_test_target, food_supply_predictions)

In [27]:
food_supply_rmse = np.sqrt(food_supply_mse)

In [28]:
food_supply_rmse

2.623906020523008

In [29]:
food_supply_scores = cross_val_score(food_supply_model, food_supply_test_features, food_supply_test_target,
                           scoring="neg_mean_squared_error", cv=10)

In [30]:
food_supply_rmse_scores = np.sqrt(-food_supply_scores)

In [31]:
print("Final Model Mean Scores: {}".format(food_supply_rmse_scores.mean()))
print("Final Model Scores STD: {}".format(food_supply_rmse_scores.std()))

Final Model Mean Scores: 6.280656022138692
Final Model Scores STD: 2.166475928013373


In [32]:
df_agriculture_cleaned.tail()

Unnamed: 0,Producer Price (LCU/tonne),Price USD,Area harvested,Yield,Domestic Supply,Pesticides,Fertilizer,food_supply_percentage,feed_supply_percentage,export_supply_percentage,other_use_supply_percentage,import_supply_percentage,Price Level
786,1000.0,3327.917,2515541.0,44941.0,7708.0,6947.9,491831.0,11.390763,85.236118,28.863335,0.038921,4.060716,4.0
787,2006.0,19368.042336,2781200.0,42466.0,10481.0,26857.0,721481.0,50.405496,44.986165,25.704851,0.0,0.5343,4.0
788,198.9,149.803325,442300.0,110445.0,10421.0,54197.0,1749149.0,0.892429,87.515594,6.526187,10.411669,54.678054,3.0
789,611.9,1164.915639,659222.0,89499.0,7053.0,39440.0,2312134.0,25.818801,58.131292,6.728814,10.562881,21.976464,4.0
790,176.0,176.0,35390550.0,99256.0,292776.0,407779.2,20994055.0,1.337883,43.727628,6.970616,46.801309,1.227901,3.0


In [42]:
increase = -20
export_supply_percentage = 0.152328
import_supply_percentage = -0.084344
feed_supply_percentage = -0.150412
food_supply_percentage = -0.158367
#'other_use_supply_percentage', 'feed_supply_percentage', 'export_supply_percentage', 'import_supply_percentage'
#food_supply_model.predict([[46.8 + increase, 43.7 + (increase*feed_supply_percentage), 8.42 + (increase*export_supply_percentage), 1.22 + (increase*import_supply_percentage)]])
food_supply_model.predict([[46.8, 43.7, 15, 0.5]])

array([7.14126168])