# ML trials

In [1]:
import pandas as pd

performance_data = pd.read_csv('performance_data.csv')
submission_template = pd.read_csv('submission_template.csv')
weather_data = pd.read_csv('weather_data.csv')

In [2]:
performance_data.columns

Index(['HYBRID_ID', 'ENV_ID', 'HYBRID_MG', 'ENV_MG', 'YIELD', 'YEAR', 'LAT',
       'LONG', 'PLANT_DATE', 'HARVEST_DATE', 'IRRIGATION', 'ENV_YIELD_MEAN',
       'ENV_YIELD_STD', 'ELEVATION', 'CLAY', 'SILT', 'SAND', 'AWC', 'PH', 'OM',
       'CEC', 'KSAT'],
      dtype='object')

In [3]:
weather_data.head()

Unnamed: 0,ENV_ID,DAY_NUM,DAYL,PREC,SRAD,SWE,TMAX,TMIN,VP
0,Env_1,1,29030.400391,0,92.800003,24,-14.5,-21.0,120
1,Env_1,2,29030.400391,0,166.399994,24,-7.0,-26.0,80
2,Env_1,3,29030.400391,0,144.0,24,0.0,-12.5,240
3,Env_1,4,29030.400391,0,112.0,24,-3.5,-11.5,240
4,Env_1,5,29030.400391,1,153.600006,24,2.5,-11.5,240


## Transforming weather data into useful metrics

In [4]:
weather_data_grouped_mean = weather_data.groupby(by='ENV_ID').mean()
weather_data_grouped_std = weather_data.groupby(by='ENV_ID').std()
weather_data_grouped_mean.head()
weather_data_grouped_std.head()
weather_data_grouped = weather_data_grouped_mean.join(weather_data_grouped_std,
                                                      lsuffix='_AVG',
                                                     rsuffix='_STD')
weather_data_grouped = weather_data_grouped.drop('DAY_NUM_AVG',axis=1)
weather_data_grouped.head()

Unnamed: 0_level_0,DAYL_AVG,PREC_AVG,SRAD_AVG,SWE_AVG,TMAX_AVG,TMIN_AVG,VP_AVG,DAY_NUM_STD,DAYL_STD,PREC_STD,SRAD_STD,SWE_STD,TMAX_STD,TMIN_STD,VP_STD
ENV_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Env_1,43200.000043,1.684932,276.111781,11.167123,8.093151,-3.256164,630.356164,105.510663,10075.200862,4.544882,131.085275,16.046875,14.926959,13.574307,535.634711
Env_10,43200.946768,3.079452,294.97863,19.868493,12.09589,2.241096,874.191781,105.510663,8183.382184,5.974924,134.977729,31.810984,11.625704,9.839269,559.526111
Env_100,43200.00007,2.90137,296.714521,20.679452,12.616438,0.632877,820.054795,105.510663,8795.413435,7.152662,119.593751,30.413609,12.877466,12.410814,614.102577
Env_1000,43200.946854,2.0,352.368219,0.99726,18.323288,2.883562,824.876712,105.510663,6955.702533,6.970897,111.010912,2.447805,11.374718,10.219944,674.104452
Env_1001,43200.946854,2.561644,341.681097,2.443836,17.70137,2.924658,862.246575,105.510663,6955.702533,8.697269,112.386846,5.729262,11.039508,10.119774,629.404388


## Joining performance and weather data

In [5]:
joined_df = performance_data.join(weather_data_grouped,on='ENV_ID')
joined_df.head()
joined_df.columns

heat_stress_df = joined_df[['HYBRID_ID','ENV_ID','TMAX_AVG','TMAX_STD',
                            'TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD',
                           'SRAD_AVG','SRAD_STD','YIELD']]
drought_stress_df = joined_df[['HYBRID_ID','ENV_ID','IRRIGATION','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC','YIELD']]

In [6]:
#Setting non-numeric variables to index
heat_stress_df = heat_stress_df.set_index(['HYBRID_ID','ENV_ID'])
heat_stress_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,TMAX_AVG,TMAX_STD,TMIN_AVG,TMIN_STD,DAYL_AVG,DAYL_STD,SRAD_AVG,SRAD_STD,YIELD
HYBRID_ID,ENV_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
H2782,Env_1,8.093151,14.926959,-3.256164,13.574307,43200.000043,10075.200862,276.111781,131.085275,107.9577
H2782,Env_2,8.178082,14.860702,-3.290411,13.446089,43200.000054,10001.326165,278.706849,130.288514,85.7498
H2240,Env_3,9.805479,14.31117,-1.224658,12.90174,43200.000054,10001.326165,274.112876,122.067104,74.6116
H1527,Env_3,9.805479,14.31117,-1.224658,12.90174,43200.000054,10001.326165,274.112876,122.067104,83.8191
H1369,Env_3,9.805479,14.31117,-1.224658,12.90174,43200.000054,10001.326165,274.112876,122.067104,81.7917


In [7]:
heat_stress_df.corr()

Unnamed: 0,TMAX_AVG,TMAX_STD,TMIN_AVG,TMIN_STD,DAYL_AVG,DAYL_STD,SRAD_AVG,SRAD_STD,YIELD
TMAX_AVG,1.0,-0.766013,0.844419,-0.660018,-0.031076,-0.86514,0.689132,-0.752999,0.081646
TMAX_STD,-0.766013,1.0,-0.793189,0.893305,0.107864,0.742199,-0.4415,0.417376,-0.064415
TMIN_AVG,0.844419,-0.793189,1.0,-0.717589,-0.038231,-0.798304,0.306109,-0.616567,0.016498
TMIN_STD,-0.660018,0.893305,-0.717589,1.0,0.123271,0.560108,-0.285517,0.250184,0.014625
DAYL_AVG,-0.031076,0.107864,-0.038231,0.123271,1.0,0.028784,-0.010226,-0.046711,0.0035
DAYL_STD,-0.86514,0.742199,-0.798304,0.560108,0.028784,1.0,-0.747047,0.675689,-0.116823
SRAD_AVG,0.689132,-0.4415,0.306109,-0.285517,-0.010226,-0.747047,1.0,-0.535784,0.161162
SRAD_STD,-0.752999,0.417376,-0.616567,0.250184,-0.046711,0.675689,-0.535784,1.0,-0.160375
YIELD,0.081646,-0.064415,0.016498,0.014625,0.0035,-0.116823,0.161162,-0.160375,1.0


In [8]:
print('Total performance observations: ' + str(len(heat_stress_df)))
print('Total unique hybrids: ' + str(len(heat_stress_df.reset_index()['HYBRID_ID'].unique())))

Total performance observations: 387427
Total unique hybrids: 2452


## Setting up a Random Forest model

In [9]:
# Splitting data
import sklearn
from sklearn.model_selection import train_test_split

train, test = train_test_split(heat_stress_df, test_size = 0.3, random_state=1)
x_train = train.iloc[:,0:8]
y_train = train['YIELD']
x_test = test.iloc[:,0:8]
y_test = test['YIELD']
x_train.head()
y_train.head()

HYBRID_ID  ENV_ID  
H1155      Env_943     125.8307
H1237      Env_1274     53.5729
H1401      Env_1260    110.6046
H1146      Env_1307    124.7939
H1000      Env_396     113.9650
Name: YIELD, dtype: float64

In [10]:
# Setting up model
import numpy as np
from sklearn import linear_model
from sklearn import tree

lin_model = linear_model.LinearRegression()
tree_model = tree.DecisionTreeRegressor()

# Fitting models
lin_model.fit(x_train,y_train)
tree_model.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## Model scores

In [11]:
# Make predictions
lin_testing_predictions  = lin_model.predict(x_test)
tree_testing_predictions = tree_model.predict(x_test)

#Get scores on the training data
print(lin_model.score(x_test,y_test))
print(tree_model.score(x_test,y_test))

0.06374242628179727
0.6173379501845964


## Finding the best Decision Tree Regressor

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
     'max_depth':(5,10,15),
     'min_samples_split': (2,4,8),
     'min_samples_leaf': (4,8,12)
}

model = GridSearchCV(tree.DecisionTreeRegressor(),parameters,cv=3,iid=False)
model.fit(x_train, np.ravel(y_train)) #we should run this on ALL the data, not on the split
# here out test becomes our VALIDATION data
model.best_score_, model.best_params_