# ML trials

In [1]:
import pandas as pd

performance_data = pd.read_csv('performance_data.csv')
submission_template = pd.read_csv('submission_template.csv')
weather_data = pd.read_csv('weather_data.csv')

In [2]:
performance_data.columns

Index(['HYBRID_ID', 'ENV_ID', 'HYBRID_MG', 'ENV_MG', 'YIELD', 'YEAR', 'LAT',
       'LONG', 'PLANT_DATE', 'HARVEST_DATE', 'IRRIGATION', 'ENV_YIELD_MEAN',
       'ENV_YIELD_STD', 'ELEVATION', 'CLAY', 'SILT', 'SAND', 'AWC', 'PH', 'OM',
       'CEC', 'KSAT'],
      dtype='object')

In [3]:
weather_data.head()

Unnamed: 0,ENV_ID,DAY_NUM,DAYL,PREC,SRAD,SWE,TMAX,TMIN,VP
0,Env_1,1,29030.400391,0,92.800003,24,-14.5,-21.0,120
1,Env_1,2,29030.400391,0,166.399994,24,-7.0,-26.0,80
2,Env_1,3,29030.400391,0,144.0,24,0.0,-12.5,240
3,Env_1,4,29030.400391,0,112.0,24,-3.5,-11.5,240
4,Env_1,5,29030.400391,1,153.600006,24,2.5,-11.5,240


## Transforming weather data into useful metrics

In [4]:
weather_data_grouped_mean = weather_data.groupby(by='ENV_ID').mean()
weather_data_grouped_std = weather_data.groupby(by='ENV_ID').std()
weather_data_grouped_mean.head()
weather_data_grouped_std.head()
weather_data_grouped = weather_data_grouped_mean.join(weather_data_grouped_std,
                                                      lsuffix='_AVG',
                                                     rsuffix='_STD')
weather_data_grouped = weather_data_grouped.drop('DAY_NUM_AVG',axis=1)
weather_data_grouped.head()

Unnamed: 0_level_0,DAYL_AVG,PREC_AVG,SRAD_AVG,SWE_AVG,TMAX_AVG,TMIN_AVG,VP_AVG,DAY_NUM_STD,DAYL_STD,PREC_STD,SRAD_STD,SWE_STD,TMAX_STD,TMIN_STD,VP_STD
ENV_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Env_1,43200.000043,1.684932,276.111781,11.167123,8.093151,-3.256164,630.356164,105.510663,10075.200862,4.544882,131.085275,16.046875,14.926959,13.574307,535.634711
Env_10,43200.946768,3.079452,294.97863,19.868493,12.09589,2.241096,874.191781,105.510663,8183.382184,5.974924,134.977729,31.810984,11.625704,9.839269,559.526111
Env_100,43200.00007,2.90137,296.714521,20.679452,12.616438,0.632877,820.054795,105.510663,8795.413435,7.152662,119.593751,30.413609,12.877466,12.410814,614.102577
Env_1000,43200.946854,2.0,352.368219,0.99726,18.323288,2.883562,824.876712,105.510663,6955.702533,6.970897,111.010912,2.447805,11.374718,10.219944,674.104452
Env_1001,43200.946854,2.561644,341.681097,2.443836,17.70137,2.924658,862.246575,105.510663,6955.702533,8.697269,112.386846,5.729262,11.039508,10.119774,629.404388


## Joining performance and weather data

In [399]:
joined_df = performance_data.join(weather_data_grouped,on='ENV_ID')
joined_df.head()
joined_df.columns

heat_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','TMAX_AVG','TMAX_STD',
                            'TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD',
                           'SRAD_AVG','SRAD_STD']]
drought_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','IRRIGATION','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]

trainheat_df=heat_stress_df.loc[:10000]
testheat_df=heat_stress_df.loc[10000:]
testheat_df=testheat_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]


In [400]:
import statsmodels.api as sm

# heat_stress_df.columns
features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD','SRAD_AVG','SRAD_STD']
X = trainheat_df[features_]
y = trainheat_df["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
# model.summary()

# print(model.params)
print(model.summary2())
coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns
# tc=pd.concat([tc]*len(testheat_df.index))

# model.HC1_se

                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.990     
Dependent Variable: ENV_YIELD_MEAN   AIC:                75882.0763
Date:               2018-11-28 20:00 BIC:                75939.7598
No. Observations:   10001            Log-Likelihood:     -37933.   
Df Model:           8                F-statistic:        1.193e+05 
Df Residuals:       9993             Prob (F-statistic): 0.00      
R-squared:          0.990            Scale:              115.44    
---------------------------------------------------------------------
            Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
---------------------------------------------------------------------
TMAX_AVG   -0.7617     0.2642    -2.8826   0.0040   -1.2796   -0.2437
TMAX_STD    4.8163     0.4639    10.3825   0.0000    3.9070    5.7256
TMIN_AVG   11.2381     0.3121    36.0131   0.0000   10.6264   11.8498
TMIN_STD    2.7699     0.3543     7.8171   0.0000    2

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD'],
      dtype='object')

In [401]:
# print(tc)
# print(testheat_df)
# print ('value',tc['TMAX_AVG'])

coeff_=tc['TMAX_AVG']
testheat_df['TMAX_AVG']=testheat_df['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_df['TMAX_STD']=testheat_df['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_df['TMIN_AVG']=testheat_df['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_df['TMIN_STD']=testheat_df['TMIN_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_AVG']
testheat_df['DAYL_AVG']=testheat_df['DAYL_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_STD']
testheat_df['DAYL_STD']=testheat_df['DAYL_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_AVG']
testheat_df['SRAD_AVG']=testheat_df['SRAD_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_STD']
testheat_df['SRAD_STD']=testheat_df['SRAD_STD'].apply (lambda x: coeff_ * x )


In [405]:

testheatreg_df=testheat_df[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]
real_=testheat_df["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testheatreg_df.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
print(regress_)
print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)


          YIELDREG
0       105.572494
1       105.572494
2       105.572494
3       105.572494
4       105.572494
5       105.572494
6       105.572494
7       105.572494
8       105.572494
9       105.572494
10      105.572494
11      105.572494
12      105.572494
13      105.572494
14      105.572494
15      105.572494
16      105.572494
17      105.572494
18      105.572494
19      105.572494
20      105.572494
21      105.572494
22      111.314187
23      111.314187
24      111.314187
25      111.314187
26      111.314187
27      111.314187
28      111.314187
29      111.314187
...            ...
377397  186.287730
377398  186.287730
377399  186.287730
377400  179.305926
377401  179.305926
377402  179.305926
377403  179.305926
377404  179.305926
377405  179.305926
377406  179.305926
377407  179.305926
377408  179.305926
377409  179.305926
377410  179.305926
377411  179.305926
377412  179.305926
377413  179.305926
377414  179.305926
377415  179.305926
377416  170.865860
377417  170.