In [1]:
import os, sys
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import pickle
import sklearn
from sklearn.ensemble import RandomForestRegressor
sys.path.append(os.path.abspath('../SRC'))
from HomeRoots import HomeRoots

#### Data for prediction test

In [2]:
datafortest = pd.read_csv('../data/merged_data/cropweachard.csv')

In [3]:
weather_features = ['mintempC', 'maxtempC', 'sunHour', 'cloudcover', 'humidity', 
                        'precipMM', 'pressure', 'windspeedKmph', 'quant_per_tot']

In [4]:
test = datafortest.iloc[2][weather_features].to_numpy().reshape(1,-1)
test2 = datafortest.iloc[12][weather_features].to_numpy().reshape(1,-1)
test2

array([[-4.32258064516129, 1.7741935483870968, 6.138709677419352,
        51.45161290322582, 74.61290322580646, 2.0064516129032257,
        1019.3548387096774, 8.612903225806452, 0.0033603852980096023]],
      dtype=object)

Crop categories

In [5]:
vegetable = ['amaranth', 'arugula','asparagus', 'peppers','beans','beetroot', 'peas',
       'bok choy', 'gourd','broccoli', 'fennel','brussels sprouts', 'squash', 'cabbage', 
       'carrots', 'celery', 'chard', 'tomato', 'collards', 'corn',  'cucumbers','kale',
       'eggplant','garlic', 'lettuce', 'radish', 'jalapenos',
       'kohlrabi', 'leeks','okra', 'onion','potatoes', 'pumpkin', 'spinach', 'sweet potato',
       'taro', 'tomatillos', 'turnip','zucchini', 'artichoke', 'callaloo']

In [6]:
fruit = [ 'apples', 'blackberries',
       'cantaloupe','cherries','figs','grapes', 'ground cherry',
        'melon','mulberry', 'peaches', 'pear', 'plums', 'raspberry',
       'rhubarb','strawberry']

In [7]:
herbs=[ 'basil', 'chives', 'cilantro', 'dill', 'epazote',
     'lemon balm', 'mint', 'mustard', 'oregano', 'parsley', 'rosemary',
        'sage', 'scallions',  'shiso',
       'tarragon', 'thyme']

In [8]:
flowers=['borage', 'calendula', 'chamomile', 'dandelion',
        'lavender',  'marigolds','nasturtium', 'rue']   #most of the flowers had too few data points to be used so i will drop them

#### Functions for model comparison

In [None]:
def predicted_actual_plot(df_pred):
    plt.rcParams['figure.figsize']=(20,5)
    plt.title('Predicted vs Actual Values RF Regression')
    plt.plot(df_pred['Actual'], marker = '.', color = 'blue', label = 'Actual')
    plt.plot(df_pred['Predicted'], marker = '.', color = 'red', label = 'Predicted')
    plt.legend()
    plt.savefig('peppersRF.png')
    plt.show()

In [None]:
def error_plots(cv_result):
    plt.rcParams['figure.figsize']=(20,5)
    plt.title('Error Metricts')
    plt.plot(cv_result[0], color = 'red', label = 'mae')
    plt.plot(cv_result[1], color = 'blue', label = 'mse')
    plt.plot(cv_result[2], color = 'cyan', label = 'rmse')
    plt.legend()
    plt.show()
    

In [None]:
def crop_compare(model_choose, ne, maxd, sc):   #only for location dependent split but with scaled/unscaled choice 
    dict_dep = {}
    dict_dep2 = {}
    for veg in vegetable:
        myroot =   HomeRoots(veg, split='location_dependent', percent=0.2, is_scaled=sc)
        df_pred = myroot.group_cv(model=model_choose, n_estim=ne, max_d=maxd)
        dict_dep[veg] = df_pred[3]
        dict_dep2[veg] = df_pred[4]
    return dict_dep, dict_dep2

In [None]:
def compare_models(myroot1, myroot2, myroot3, myroot4, model1, model2, model3, model4, n_estim1, n_estim2, n_estim3, maxd1, maxd2, maxd3):
    df_pred_cv1 = myroot1.group_cv(model=model1, n_estim=n_estim1, max_d = maxd1)
    df_pred_cv2 = myroot2.group_cv(model=model2, n_estim=n_estim2, max_d = maxd2)
    df_pred_cv3 = myroot3.group_cv(model=model3, n_estim=n_estim3, max_d = maxd3)
    df_pred_cv4 = myroot4.group_cv(model=model4)
    plt.rcParams['figure.figsize']=(20,5)
    plt.title('Error Metricts')
    plt.plot(df_pred_cv1[1], color = 'green', label = f'mse-{model1}-$n_e${n_estim1}-$m_d${maxd1}-tomato')
    plt.plot(df_pred_cv2[1], color = 'red', label = f'mse-{model2}-$n_e${n_estim2}-$m_d${maxd2}-pepper')
    plt.plot(df_pred_cv3[1], color = 'blue', label = f'mse-{model3}-tomato')
    plt.plot(df_pred_cv4[1], color = 'orange', label = f'mse-{model4}-pepper')
    
    plt.legend()
    plt.show()

## Model comparison

In [None]:
myroot_peppers_dep = HomeRoots('peppers', split='location_dependent', percent=0.2, scaled = True)
myroot_collards_dep = HomeRoots('collards', split='location_dependent', percent=0.2, scaled = True)
myroot_lettuce_dep = HomeRoots('lettuce', split='location_dependent', percent=0.2, scaled = True)

In [None]:
myroot_zucchini_dep = HomeRoots('zucchini', split='location_dependent', percent=0.2, scaled = True)
myroot_zucchini_indep = HomeRoots('zucchini', split='location_independent', percent=0.2, scaled = True)

In [None]:
myroot_tomato_dep = HomeRoots('tomato', split='location_dependent', percent=0.2, scaled = True)
myroot_tomato_indep = HomeRoots('tomato', split='location_independent', percent=0.2, scaled = True)

#### Actual vs Predicted plots

#### Comparison of location dependend and location indipendant train/test split

In [None]:
df_pred_rf_dep = myroot_tomato_dep.random_forest_model(n_est = 50, max_d = 5, one_spot = test)
df_pred_rf_indep = myroot_tomato_indep.random_forest_model(n_est = 50, max_d = 5, one_spot = test)
df_pred_lin_dep = myroot_tomato_dep.linear_model(one_spot = test)
df_pred_lin_indep = myroot_tomato_indep.linear_model(one_spot = test)

In [None]:
df_pred_rf_dep = myroot_zucchini_dep.random_forest_model(n_est = 50, max_d = 5, one_spot = None)
df_pred_rf_indep = myroot_zucchini_indep.random_forest_model(n_est = 50, max_d = 5, one_spot = None)
df_pred_lin_dep = myroot_zucchini_dep.linear_model(one_spot = None)
df_pred_lin_indep = myroot_zucchini_indep.linear_model(one_spot = None)

In [None]:
predicted_actual_plot(df_pred_rf_dep[0])

In [None]:
predicted_actual_plot(df_pred_rf_indep[0])

In [None]:
predicted_actual_plot(df_pred_lin_dep[0])

In [None]:
predicted_actual_plot(df_pred_lin_indep[0])

## Group K fold cross validation

#### Comparison of error metrics for regression

In [None]:
cv_tomato_rf = myroot_tomato_dep.group_cv(model='RF', n_estim=50, max_d = 10)       # hight number of data points
cv_tomato_lin = myroot_tomato_dep.group_cv(model='lin')

In [None]:
cv_zucchini_rf = myroot_zucchini_dep.group_cv(model='RF', n_estim=50, max_d = 10)   #low number of data points
cv_zucchini_lin = myroot_zucchini_dep.group_cv(model='lin')

In [None]:
 error_plots(cv_tomato_rf)

In [None]:
 error_plots(cv_tomato_lin)

In [None]:
 error_plots(cv_zucchini_rf)

In [None]:
 error_plots(cv_zucchini_lin)

Going to use MSE as error meteric. Other error metrics are consistently higher or the same. 

#### Linear vs Random Forest

Linear vs RF for a couple crops. Plot of each K fold iteration erros. They seem to vary depending on the training/CV split so will need to look at average of erros. 


In [None]:
compare_models(myroot_tomato_dep, myroot_peppers_dep, myroot_tomato_dep, myroot_peppers_dep, 'RF', 'RF', 'lin', 'lin', 50, 50, None, 10, 10, None)

In [None]:
crop_compareRF = crop_compare('RF', 50, 10, True)

In [None]:
crop_comparelin = crop_compare('lin', 50, 20, True)

In [None]:
veg_rf = pd.DataFrame(data = crop_compareRF[0], index = ['mse']).T.sort_values('mse',ascending = False)
veg_rf2 = pd.DataFrame(data = crop_compareRF[1], index = ['mae']).T.sort_values('mae',ascending = False)
veg_lin = pd.DataFrame(data = crop_comparelin[0], index = ['mse']).T.sort_values('mse',ascending = False)
veg_lin2 = pd.DataFrame(data = crop_comparelin[1], index = ['mae']).T.sort_values('mae',ascending = False)

In [None]:
subsetrf= veg_rf.loc[['asparagus', 'jalapenos', 'gourd', 'carrots', 'eggplant', 'spinach', 'tomato', 'turnip']]
subsetlin =veg_lin.loc[['asparagus', 'jalapenos', 'gourd', 'carrots', 'eggplant', 'spinach', 'tomato', 'turnip']]

In [None]:
plt.rcParams['figure.figsize']=(10,5)
plt.title('Compare Veg Error MSE - RF vs Linear')
plt.plot(subsetrf, '^', color = 'green', label ='MSE for RF - n_estim=50, max-depth=10' )
plt.plot(subsetlin,'.', color = 'red', label = 'MSE for Linear')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.yscale('log')
plt.savefig('linvsrf.png')
plt.legend()
plt.show()

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - RF vs Linear')
plt.plot(veg_rf, '.', color = 'red', label ='RF mse - ns=50, maxd=10' )
plt.plot(veg_lin,'.', color = 'green', label = 'linear mse')
plt.plot(veg_rf2, '^', color = 'red', label ='RF mae - ns=50, maxd=10' )
plt.plot(veg_lin2,'^', color = 'green', label = 'linear mae')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
mse_vegRF_mean = veg_rf.mean()
mae_vegRF_mean = veg_rf2.mean()
mse_veglin_mean = veg_lin.mean()
mae_veglin_mean = veg_lin2.mean()
mse_vegRF_mean, mse_veglin_mean, mae_vegRF_mean,mae_veglin_mean

While linear MSE is slightly lower for some vegetables, it is wildly high for others (the ones with lower number of data points). RF seems to give a more stable prediction over all. 

#### Scaled vs Unscaled features for Random Forest

In [None]:
veg_rfu = pd.DataFrame(data = crop_compare('RF', 50, 10, False), index = ['mae']).T.sort_values('mae',ascending = True)
veg_rfs = pd.DataFrame(data = crop_compare('RF', 50, 10, True), index = ['mae']).T.sort_values('mae',ascending = True)

In [None]:
veg_rfu.T

In [None]:
veg_rfs.T

In [None]:
veg_rfu2 = pd.DataFrame(data = crop_compare('RF', 100, 10, False), index = ['mae']).T.sort_values('mae',ascending = True)
veg_rfs2 = pd.DataFrame(data = crop_compare('RF', 100, 10, True), index = ['mae']).T.sort_values('mae',ascending = True)

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - Scaled vs Unscaled Features')
plt.plot(veg_rfu, '^', color = 'red', label ='RF - ns=50, maxd=10 - unscaled' )
plt.plot(veg_rfs,'.', color = 'green', label ='RF - ns=50, maxd=10 - scaled')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.legend()
plt.show()

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - Scaled vs Unscaled Features')
plt.plot(veg_rfu2, '^', color = 'red', label ='RF - ns=100, maxd=10 - unscaled' )
plt.plot(veg_rfs2,'.', color = 'green', label ='RF - ns=100, maxd=10 - scaled')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.legend()
plt.show()

Scaled on average perform worse then unscaled for RF so I will used unscaled in the final app part. 

#### Optimizing Random Forest parameters - unscaled

In [None]:
veg_rf1 = pd.DataFrame(data = crop_compare('RF', 100, 10, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)
veg_rf2 = pd.DataFrame(data = crop_compare('RF', 50, 10, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)
veg_rf3 = pd.DataFrame(data = crop_compare('RF', 30, 10, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)

In [None]:
veg_rf6 = pd.DataFrame(data = crop_compare('RF', 20, 10, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)
veg_rf7 = pd.DataFrame(data = crop_compare('RF', 10, 10, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)

In [None]:
veg_rf4 = pd.DataFrame(data = crop_compare('RF', 50, 5, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)
veg_rf5 = pd.DataFrame(data = crop_compare('RF', 50, 20, False)[0], index = ['mae']).T.sort_values('mae',ascending = True)

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - RF compare n-estimators')
plt.plot(veg_rf3, 's', color = 'red', label ='RF - ns=30, maxd=10' )
plt.plot(veg_rf2,'.', color = 'green', label ='RF - ns=50, maxd=10')
plt.plot(veg_rf1,'^', color = 'blue', label ='RF - ns=100, maxd=10')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.legend()
plt.show()

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - RF compare n-estimators')
plt.plot(veg_rf3, 's', color = 'red', label ='RF - ns=30, maxd=10' )
plt.plot(veg_rf6,'.', color = 'green', label ='RF - ns=20, maxd=10')
plt.plot(veg_rf7,'^', color = 'blue', label ='RF - ns=10, maxd=10')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.legend()
plt.show()

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.title('Compare Veg Error MSE - RF compare max depth')
plt.plot(veg_rf4, 's', color = 'red', label ='RF - ns=50, maxd=5' )
plt.plot(veg_rf2,'.', color = 'green', label ='RF - ns=50, maxd=10')
plt.plot(veg_rf5,'^', color = 'blue', label ='RF - ns=50, maxd=20')
plt.xticks(rotation=90)
plt.ylabel('Average MSE for Group K-fold CV')
plt.legend()
plt.show()

Changing maximum depth doesn't seem to change much, but lower n-estimators perform sligthly better for some crops

## Prediction Test

In [None]:
def lin_predict(test_data):
    lin_dict_dep = {}
    for veg in vegetable:
        print(veg)
        myroot =   HomeRoots(veg, split='location_dependent', percent=0.2, is_scaled=True )
        prediction = myroot.linear_model(one_spot = test_data)
        if prediction[2]<1:
            lin_dict_dep[veg] = [prediction[-1][0], prediction[2]]
    return lin_dict_dep

In [9]:
def rf_predict(test_data, n, m, type_crop):
    rf_dict_dep = {}
    for veg in type_crop:
        print(veg)
        myroot =   HomeRoots(veg, split='location_dependent', percent=0.2, scaled=False)
        prediction = myroot.random_forest_model(n_est = n, max_d = m, one_spot = test_data)
        if prediction[2]<0.9:
            rf_dict_dep[veg] = [prediction[-1][0], prediction[2]]
    return rf_dict_dep

In [10]:
HomeRoots('garlic', split='location_dependent', percent=0.2, scaled=False).Y_train

array([2.89238649e-01, 4.09033831e-02, 9.91640621e-02, 7.79787898e-02,
       1.30667177e-02, 4.20720512e-02, 1.66082526e-02, 2.91666667e-01,
       1.45833333e-01, 2.29166667e-01, 3.33333333e-01, 2.16948913e-04,
       2.05334136e-03, 1.23152709e-01, 1.30169348e-03, 1.10043220e-03,
       1.37554025e-03, 6.97290356e-05, 9.76206499e-03, 1.74322589e-04,
       3.48645178e-04, 8.36748427e-04, 2.09187107e-03, 1.28557424e-03,
       4.03074222e-03, 2.52353563e-03, 5.04707126e-03, 1.59587618e-03,
       1.08474456e-04, 2.02297959e-04, 2.04413473e-02, 5.42372282e-04,
       2.84586541e-03, 4.87862641e-03, 5.69173081e-03, 3.25241761e-03,
       1.62620880e-03, 3.19175236e-04, 1.06391745e-04, 5.19390499e-03,
       1.31147541e-02, 5.24590164e-02, 1.96721311e-02, 7.50819858e-03,
       2.18989125e-03, 3.12841607e-04, 6.67389501e-03, 9.83575235e-03,
       1.61838367e-03, 6.91546345e-03, 1.61290323e-02])

In [11]:
dict_rf_n100veg  = rf_predict(test, 100, 10, vegetable)

amaranth
(13,)
[[0.002603386953540036]]
Mean Absolute Error: 0.00178138823234471
Mean Squared Error: 3.1733440343362103e-06
Root Mean Squared Error: 0.00178138823234471
arugula
(85,)
[[0.002603386953540036]]
Mean Absolute Error: 0.014023202112439865
Mean Squared Error: 0.0005885968467309273
Root Mean Squared Error: 0.024261014956735164
asparagus
(28,)
[[0.002603386953540036]]
Mean Absolute Error: 0.005006229447262969
Mean Squared Error: 2.51869263840227e-05
Root Mean Squared Error: 0.005018657826951614
peppers
(751,)
[[0.002603386953540036]]
Mean Absolute Error: 0.010336149369915124
Mean Squared Error: 0.00044673600444864115
Root Mean Squared Error: 0.0211361303092274
beans
(417,)
[[0.002603386953540036]]
Mean Absolute Error: 0.010642937758800836
Mean Squared Error: 0.0005932500718798477
Root Mean Squared Error: 0.02435672539320193
beetroot
(142,)
[[0.002603386953540036]]
Mean Absolute Error: 0.021953643277638737
Mean Squared Error: 0.003157681252192499
Root Mean Squared Error: 0.05619

In [12]:
dict_rf_n100herb  = rf_predict(test, 100, 10, herbs)

basil
(271,)
[[0.002603386953540036]]
Mean Absolute Error: 0.008953179984486201
Mean Squared Error: 0.00015522933020858663
Root Mean Squared Error: 0.012459106316609816
chives
(78,)
[[0.002603386953540036]]
Mean Absolute Error: 0.0067282838535154136
Mean Squared Error: 0.00024913417360242717
Root Mean Squared Error: 0.015783984718771973
cilantro
(66,)
[[0.002603386953540036]]
Mean Absolute Error: 0.0040579485192684455
Mean Squared Error: 2.8166685825904417e-05
Root Mean Squared Error: 0.005307229581043618
dill
(33,)
[[0.002603386953540036]]
Mean Absolute Error: 0.0051597883377679146
Mean Squared Error: 4.01083956770314e-05
Root Mean Squared Error: 0.006333118953330294
epazote
(5,)
[[0.002603386953540036]]
Mean Absolute Error: 0.002814776278234116
Mean Squared Error: 7.922965496509502e-06
Root Mean Squared Error: 0.002814776278234116
lemon balm
(26,)
[[0.002603386953540036]]
Mean Absolute Error: 0.02420141956075425
Mean Squared Error: 0.0017320870266197806
Root Mean Squared Error: 0.041

In [13]:
dict_rf_n100fruit  = rf_predict(test, 100, 10, fruit)

apples
(30,)
[[0.002603386953540036]]
Mean Absolute Error: 0.002149018151834613
Mean Squared Error: 1.1928880646301515e-05
Root Mean Squared Error: 0.0034538211659409226
blackberries
(19,)
[[0.002603386953540036]]
Mean Absolute Error: 0.0019379135455996211
Mean Squared Error: 5.634509144504236e-06
Root Mean Squared Error: 0.002373712102278673
cantaloupe
(5,)
[[0.002603386953540036]]
Mean Absolute Error: 0.028697486938962055
Mean Squared Error: 0.0008235457566118978
Root Mean Squared Error: 0.028697486938962055
cherries
(13,)
[[0.002603386953540036]]
Mean Absolute Error: 0.022360994716326226
Mean Squared Error: 0.0007456561392066789
Root Mean Squared Error: 0.027306705022881815
figs
(26,)
[[0.002603386953540036]]
Mean Absolute Error: 0.002060112100157063
Mean Squared Error: 8.809858558174436e-06
Root Mean Squared Error: 0.002968140589354628
grapes
(26,)
[[0.002603386953540036]]
Mean Absolute Error: 0.0007855197624783688
Mean Squared Error: 8.964703445389189e-07
Root Mean Squared Error: 

In [None]:
df_dict_rf_n100 = pd.DataFrame(data = dict_rf_n100, index = ['Planting Score','mae']).T.sort_values('Planting Score',ascending = False)

In [None]:
df_dict_rf_n30 = pd.DataFrame(data = dict_rf_n30, index = ['Planting Score','mae']).T.sort_values('Planting Score',ascending = False)

In [None]:

df_dict_rf_n100['mae'].mean(), df_dict_rf_n100['mae'].max()

In [None]:
tot = df_dict_rf_n100['Planting Score'].sum()

In [None]:
(df_dict_rf_n100['Planting Score'].iloc[0])

In [None]:
plt.rcParams['figure.figsize']=(20,5)
plt.plot(df_dict_rf_n30['mae'],'.', color = 'green', label ='RF - ns=30, maxd=10')
plt.plot(df_dict_rf_n10['mae'],'^', color = 'blue', label ='RF - ns=10, maxd=10')
plt.legend()
plt.xticks(rotation=90)
plt.show()

In [None]:
myroot =   HomeRoots('tomato', split='location_dependent', percent=0.2, is_scaled=False)

In [None]:
myroot.X_train.shape

In [None]:
for veg in vegetable:
    myroot =   HomeRoots(veg, split='location_dependent', percent=0.2, is_scaled=False)
    rf = RandomForestRegressor(n_estimators = 30)
    feature_list = weather_features
    rf.fit(myroot.X_train, myroot.Y_train)
    importances = rf.feature_importances_
    feature_importances = pd.DataFrame(importances, index=feature_list,
                                   columns=['Importance']).sort_values(by=['Importance'],ascending=False)
    num_features = list(range(len(importances)))

    plt.figure(figsize=(10, 5))
    plt.bar(num_features, importances, orientation = 'vertical', color = '#3d9973',
        edgecolor = 'k', linewidth = 1.2)

    plt.xticks(num_features, feature_list, rotation='vertical')

    plt.ylabel(f'{veg} Importance'); plt.xlabel('Variable'); plt.title(f'Variable Importances - {veg}');
    plt.plot()

In [None]:
df1 = myroot_tomato_dep.test
df2= myroot_peppers_dep.test
df3= myroot_collards_dep.test
df4= myroot_lettuce_dep.test
df1zip = df1.index.unique().to_numpy()
df2zip = df2.index.unique().to_numpy()
df3zip = df3.index.unique().to_numpy()
df4zip = df4.index.unique().to_numpy()

In [None]:
import sys

In [None]:
sys.version


In [None]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))


In [1]:
a = 'potatoes'
a[-1]

's'

In [2]:
if a[-1]=='s':
    print('plural')

plural
