# ML trials

In [1]:
import pandas as pd

performance_data = pd.read_csv('performance_data.csv')
submission_template = pd.read_csv('submission_template.csv')
weather_data = pd.read_csv('weather_data.csv')

In [2]:
performance_data.columns

Index(['HYBRID_ID', 'ENV_ID', 'HYBRID_MG', 'ENV_MG', 'YIELD', 'YEAR', 'LAT',
       'LONG', 'PLANT_DATE', 'HARVEST_DATE', 'IRRIGATION', 'ENV_YIELD_MEAN',
       'ENV_YIELD_STD', 'ELEVATION', 'CLAY', 'SILT', 'SAND', 'AWC', 'PH', 'OM',
       'CEC', 'KSAT'],
      dtype='object')

In [3]:
weather_data.head()

Unnamed: 0,ENV_ID,DAY_NUM,DAYL,PREC,SRAD,SWE,TMAX,TMIN,VP
0,Env_1,1,29030.400391,0,92.800003,24,-14.5,-21.0,120
1,Env_1,2,29030.400391,0,166.399994,24,-7.0,-26.0,80
2,Env_1,3,29030.400391,0,144.0,24,0.0,-12.5,240
3,Env_1,4,29030.400391,0,112.0,24,-3.5,-11.5,240
4,Env_1,5,29030.400391,1,153.600006,24,2.5,-11.5,240


## Transforming weather data into useful metrics

In [4]:
weather_data_grouped_mean = weather_data.groupby(by='ENV_ID').mean()
weather_data_grouped_std = weather_data.groupby(by='ENV_ID').std()
weather_data_grouped_mean.head()
weather_data_grouped_std.head()
weather_data_grouped = weather_data_grouped_mean.join(weather_data_grouped_std,
                                                      lsuffix='_AVG',
                                                     rsuffix='_STD')
weather_data_grouped = weather_data_grouped.drop('DAY_NUM_AVG',axis=1)
weather_data_grouped.head()

Unnamed: 0_level_0,DAYL_AVG,PREC_AVG,SRAD_AVG,SWE_AVG,TMAX_AVG,TMIN_AVG,VP_AVG,DAY_NUM_STD,DAYL_STD,PREC_STD,SRAD_STD,SWE_STD,TMAX_STD,TMIN_STD,VP_STD
ENV_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Env_1,43200.000043,1.684932,276.111781,11.167123,8.093151,-3.256164,630.356164,105.510663,10075.200862,4.544882,131.085275,16.046875,14.926959,13.574307,535.634711
Env_10,43200.946768,3.079452,294.97863,19.868493,12.09589,2.241096,874.191781,105.510663,8183.382184,5.974924,134.977729,31.810984,11.625704,9.839269,559.526111
Env_100,43200.00007,2.90137,296.714521,20.679452,12.616438,0.632877,820.054795,105.510663,8795.413435,7.152662,119.593751,30.413609,12.877466,12.410814,614.102577
Env_1000,43200.946854,2.0,352.368219,0.99726,18.323288,2.883562,824.876712,105.510663,6955.702533,6.970897,111.010912,2.447805,11.374718,10.219944,674.104452
Env_1001,43200.946854,2.561644,341.681097,2.443836,17.70137,2.924658,862.246575,105.510663,6955.702533,8.697269,112.386846,5.729262,11.039508,10.119774,629.404388


## Joining performance and weather data

In [299]:
joined_df = performance_data.join(weather_data_grouped,on='ENV_ID')
joined_df.head()
joined_df.columns

heat_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','TMAX_AVG','TMAX_STD',
                            'TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD',
                           'SRAD_AVG','SRAD_STD']]
drought_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','IRRIGATION','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]

trainheat_df=heat_stress_df.loc[:100000]
testheat_df=heat_stress_df.loc[100000:100015]
testheat_df=testheat_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]


In [300]:
import statsmodels.api as sm

# heat_stress_df.columns
features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD','SRAD_AVG','SRAD_STD']
X = trainheat_df[features_]
y = trainheat_df["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
# model.summary()

# print(model.params)
print(model.summary2())
coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns
# tc=pd.concat([tc]*len(testheat_df.index))

# model.HC1_se

                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.989      
Dependent Variable: ENV_YIELD_MEAN   AIC:                778225.3837
Date:               2018-11-28 13:53 BIC:                778301.4872
No. Observations:   100001           Log-Likelihood:     -3.8910e+05
Df Model:           8                F-statistic:        1.149e+06  
Df Residuals:       99993            Prob (F-statistic): 0.00       
R-squared:          0.989            Scale:              140.36     
----------------------------------------------------------------------
             Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
----------------------------------------------------------------------
TMAX_AVG    -3.6989     0.0890   -41.5718   0.0000   -3.8733   -3.5245
TMAX_STD    -2.5394     0.1601   -15.8620   0.0000   -2.8532   -2.2256
TMIN_AVG    10.2493     0.1184    86.5893   0.0000   10.0173   10.4813
TMIN_STD     5.7191     0.1386    41.2733

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD'],
      dtype='object')

In [301]:
print(tc)
print(testheat_df)
print ('value',tc['TMAX_AVG'])

coeff_=tc['TMAX_AVG']
testheat_df['TMAX_AVG']=testheat_df['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_df['TMAX_STD']=testheat_df['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_df['TMIN_AVG']=testheat_df['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_df['TMIN_STD']=testheat_df['TMIN_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_AVG']
testheat_df['DAYL_AVG']=testheat_df['DAYL_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_STD']
testheat_df['DAYL_STD']=testheat_df['DAYL_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_AVG']
testheat_df['SRAD_AVG']=testheat_df['SRAD_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_STD']
testheat_df['SRAD_STD']=testheat_df['SRAD_STD'].apply (lambda x: coeff_ * x )


   TMAX_AVG  TMAX_STD   TMIN_AVG  TMIN_STD  DAYL_AVG  DAYL_STD  SRAD_AVG  \
0  -3.69891  -2.53942  10.249268  5.719139 -0.003567   0.01382  0.606971   

   SRAD_STD  
0 -0.308845  
       HYBRID_ID   ENV_ID     YIELD  ENV_YIELD_MEAN   TMAX_AVG   TMAX_STD  \
100000     H1019  Env_432  127.4595      132.749349  14.430137  12.309702   
100001     H1173  Env_432  133.1468      132.749349  14.430137  12.309702   
100002     H1878  Env_432  149.0262      132.749349  14.430137  12.309702   
100003     H1173  Env_432  123.2990      132.749349  14.430137  12.309702   
100004     H2169  Env_432  126.3522      132.749349  14.430137  12.309702   
100005     H1019  Env_432  131.9418      132.749349  14.430137  12.309702   
100006     H1140  Env_432  121.6329      132.749349  14.430137  12.309702   
100007     H1308  Env_432  125.9709      132.749349  14.430137  12.309702   
100008     H1536  Env_432  130.7659      132.749349  14.430137  12.309702   
100009     H1308  Env_432  123.2826      132.7493

In [354]:

print(testheat_df)
testheatreg_df=testheat_df[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]
regress_=testheatreg_df.sum(1)
pd.DataFrame(regress_,columns=['A'])
# regress_=regress_.transpose()
print(regress_)
abc=testheat_df[['ENV_YIELD_MEAN']]
# regress_.rename(columns='A',inplace=True)
# regress_
# coeff_=regress_.loc['ENV_YIELD_MEAN']
# abc['ENV_YIELD_MEAN']=abc['ENV_YIELD_MEAN'].apply (lambda x: x - coeff_ )
# abc

       HYBRID_ID   ENV_ID     YIELD  ENV_YIELD_MEAN   TMAX_AVG  TMAX_STD  \
100000     H1019  Env_432  127.4595      132.749349 -53.375776  -31.2595   
100001     H1173  Env_432  133.1468      132.749349 -53.375776  -31.2595   
100002     H1878  Env_432  149.0262      132.749349 -53.375776  -31.2595   
100003     H1173  Env_432  123.2990      132.749349 -53.375776  -31.2595   
100004     H2169  Env_432  126.3522      132.749349 -53.375776  -31.2595   
100005     H1019  Env_432  131.9418      132.749349 -53.375776  -31.2595   
100006     H1140  Env_432  121.6329      132.749349 -53.375776  -31.2595   
100007     H1308  Env_432  125.9709      132.749349 -53.375776  -31.2595   
100008     H1536  Env_432  130.7659      132.749349 -53.375776  -31.2595   
100009     H1308  Env_432  123.2826      132.749349 -53.375776  -31.2595   
100010     H1111  Env_432  123.8941      132.749349 -53.375776  -31.2595   
100011     H2139  Env_432  122.5464      132.749349 -53.375776  -31.2595   
100012     H

In [244]:
# print(tc)
# tc_=list(tc.values)
# tc_=tc_[0]
# # print('COEFF',tc_)
# # print('COEFF',tc_[0])
# # print(' ')
# print(testheat_df)
# # tc.reset_index()
# # print(tc)
# # a_=testheat_df.mul(10,'TMAX_AVG')
# # for x in range (3):
# # test_= tc*testheat_df
# # print(test_)


# print(a_)
# # test_


In [58]:
# heat_stress_df.columns
features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','SRAD_AVG','SRAD_STD']
X = trainheat_df[features_]
y = trainheat_df["YIELD"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,YIELD,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,706100.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,15:12:48,Log-Likelihood:,-428710.0
No. Observations:,100001,AIC:,857400.0
Df Residuals:,99995,BIC:,857500.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TMAX_AVG,-1.0757,0.098,-10.999,0.000,-1.267,-0.884
TMAX_STD,0.8923,0.225,3.967,0.000,0.451,1.333
TMIN_AVG,6.7553,0.094,72.113,0.000,6.572,6.939
TMIN_STD,4.5688,0.193,23.665,0.000,4.190,4.947
SRAD_AVG,0.2530,0.007,36.706,0.000,0.239,0.266
SRAD_STD,-0.2084,0.009,-22.641,0.000,-0.226,-0.190

0,1,2,3
Omnibus:,2874.927,Durbin-Watson:,1.076
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3858.573
Skew:,-0.33,Prob(JB):,0.0
Kurtosis:,3.7,Cond. No.,1750.0


In [None]:
#REMOVING DayL as it has a strong multicolinearity with Solar R
# heat_stress_df.columns
features_=['TMAX_AVG','TMIN_AVG','SRAD_AVG']
X = heat_stress_df[features_]
y = heat_stress_df["YIELD"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

In [47]:
#REMOVING DayL as it has a strong multicolinearity with Solar R
# heat_stress_df.columns
features_=['TMAX_AVG','TMIN_AVG','SRAD_AVG']
X = heat_stress_df[features_]
y = heat_stress_df["YIELD"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,YIELD,R-squared:,0.969
Model:,OLS,Adj. R-squared:,0.969
Method:,Least Squares,F-statistic:,3985000.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,14:42:33,Log-Likelihood:,-1742700.0
No. Observations:,387427,AIC:,3485000.0
Df Residuals:,387424,BIC:,3485000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TMAX_AVG,-1.8333,0.033,-55.542,0.000,-1.898,-1.769
TMIN_AVG,1.1020,0.034,32.112,0.000,1.035,1.169
SRAD_AVG,0.4549,0.001,358.645,0.000,0.452,0.457

0,1,2,3
Omnibus:,25355.37,Durbin-Watson:,0.773
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36039.167
Skew:,-0.569,Prob(JB):,0.0
Kurtosis:,3.968,Cond. No.,424.0


In [48]:

features_=['PREC_AVG','KSAT','SWE_AVG','VP_AVG','AWC']
X = drought_stress_df[features_]
y = drought_stress_df["YIELD"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,YIELD,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.967
Method:,Least Squares,F-statistic:,2279000.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,14:42:39,Log-Likelihood:,-1751700.0
No. Observations:,387427,AIC:,3503000.0
Df Residuals:,387422,BIC:,3503000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PREC_AVG,5.2577,0.072,73.476,0.000,5.117,5.398
KSAT,5.1713,0.021,248.382,0.000,5.131,5.212
SWE_AVG,-0.3902,0.005,-81.876,0.000,-0.399,-0.381
VP_AVG,-0.0030,0.000,-9.580,0.000,-0.004,-0.002
AWC,4.2386,0.010,411.781,0.000,4.218,4.259

0,1,2,3
Omnibus:,13359.358,Durbin-Watson:,0.738
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18076.657
Skew:,-0.373,Prob(JB):,0.0
Kurtosis:,3.751,Cond. No.,2010.0


In [49]:
#Removing VP due to multicolinearity
features_=['PREC_AVG','KSAT','SWE_AVG','AWC']
X = drought_stress_df[features_]
y = drought_stress_df["YIELD"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,YIELD,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.967
Method:,Least Squares,F-statistic:,2848000.0
Date:,"Tue, 27 Nov 2018",Prob (F-statistic):,0.0
Time:,14:42:44,Log-Likelihood:,-1751800.0
No. Observations:,387427,AIC:,3504000.0
Df Residuals:,387423,BIC:,3504000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PREC_AVG,4.8234,0.055,87.117,0.000,4.715,4.932
KSAT,5.1111,0.020,257.494,0.000,5.072,5.150
SWE_AVG,-0.3680,0.004,-88.298,0.000,-0.376,-0.360
AWC,4.1627,0.007,633.725,0.000,4.150,4.176

0,1,2,3
Omnibus:,14379.126,Durbin-Watson:,0.738
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19575.302
Skew:,-0.391,Prob(JB):,0.0
Kurtosis:,3.776,Cond. No.,39.6
