# Linear Regressions for HEAT , DROUGHT

In [1]:
import pandas as pd

performance_data = pd.read_csv('performance_data.csv')
submission_template = pd.read_csv('submission_template.csv')
weather_data = pd.read_csv('weather_data.csv')

In [2]:
performance_data.columns

Index(['HYBRID_ID', 'ENV_ID', 'HYBRID_MG', 'ENV_MG', 'YIELD', 'YEAR', 'LAT',
       'LONG', 'PLANT_DATE', 'HARVEST_DATE', 'IRRIGATION', 'ENV_YIELD_MEAN',
       'ENV_YIELD_STD', 'ELEVATION', 'CLAY', 'SILT', 'SAND', 'AWC', 'PH', 'OM',
       'CEC', 'KSAT'],
      dtype='object')

In [3]:
weather_data.head()

Unnamed: 0,ENV_ID,DAY_NUM,DAYL,PREC,SRAD,SWE,TMAX,TMIN,VP
0,Env_1,1,29030.400391,0,92.800003,24,-14.5,-21.0,120
1,Env_1,2,29030.400391,0,166.399994,24,-7.0,-26.0,80
2,Env_1,3,29030.400391,0,144.0,24,0.0,-12.5,240
3,Env_1,4,29030.400391,0,112.0,24,-3.5,-11.5,240
4,Env_1,5,29030.400391,1,153.600006,24,2.5,-11.5,240


## Transforming weather data into useful metrics

In [4]:
weather_data_grouped_mean = weather_data.groupby(by='ENV_ID').mean()
weather_data_grouped_std = weather_data.groupby(by='ENV_ID').std()
weather_data_grouped_mean.head()
weather_data_grouped_std.head()
weather_data_grouped = weather_data_grouped_mean.join(weather_data_grouped_std,
                                                      lsuffix='_AVG',
                                                     rsuffix='_STD')
weather_data_grouped = weather_data_grouped.drop('DAY_NUM_AVG',axis=1)
weather_data_grouped.head()

Unnamed: 0_level_0,DAYL_AVG,PREC_AVG,SRAD_AVG,SWE_AVG,TMAX_AVG,TMIN_AVG,VP_AVG,DAY_NUM_STD,DAYL_STD,PREC_STD,SRAD_STD,SWE_STD,TMAX_STD,TMIN_STD,VP_STD
ENV_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Env_1,43200.000043,1.684932,276.111781,11.167123,8.093151,-3.256164,630.356164,105.510663,10075.200862,4.544882,131.085275,16.046875,14.926959,13.574307,535.634711
Env_10,43200.946768,3.079452,294.97863,19.868493,12.09589,2.241096,874.191781,105.510663,8183.382184,5.974924,134.977729,31.810984,11.625704,9.839269,559.526111
Env_100,43200.00007,2.90137,296.714521,20.679452,12.616438,0.632877,820.054795,105.510663,8795.413435,7.152662,119.593751,30.413609,12.877466,12.410814,614.102577
Env_1000,43200.946854,2.0,352.368219,0.99726,18.323288,2.883562,824.876712,105.510663,6955.702533,6.970897,111.010912,2.447805,11.374718,10.219944,674.104452
Env_1001,43200.946854,2.561644,341.681097,2.443836,17.70137,2.924658,862.246575,105.510663,6955.702533,8.697269,112.386846,5.729262,11.039508,10.119774,629.404388


## Joining performance and weather data

In [399]:
joined_df = performance_data.join(weather_data_grouped,on='ENV_ID')
joined_df.head()
joined_df.columns

heat_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','TMAX_AVG','TMAX_STD',
                            'TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD',
                           'SRAD_AVG','SRAD_STD']]
drought_stress_df = joined_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','ENV_YIELD_STD','IRRIGATION','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]

trainheat_df=heat_stress_df.loc[:10000]
testheat_df=heat_stress_df.loc[10000:]
testheat_df=testheat_df[['HYBRID_ID','ENV_ID','YIELD','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]


## First Linear Regression Test

In [400]:
import statsmodels.api as sm

# heat_stress_df.columns
features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD','SRAD_AVG','SRAD_STD']
X = trainheat_df[features_]
y = trainheat_df["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
# model.summary()

# print(model.params)
print(model.summary2())
coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns
# tc=pd.concat([tc]*len(testheat_df.index))

# model.HC1_se

                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.990     
Dependent Variable: ENV_YIELD_MEAN   AIC:                75882.0763
Date:               2018-11-28 20:00 BIC:                75939.7598
No. Observations:   10001            Log-Likelihood:     -37933.   
Df Model:           8                F-statistic:        1.193e+05 
Df Residuals:       9993             Prob (F-statistic): 0.00      
R-squared:          0.990            Scale:              115.44    
---------------------------------------------------------------------
            Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
---------------------------------------------------------------------
TMAX_AVG   -0.7617     0.2642    -2.8826   0.0040   -1.2796   -0.2437
TMAX_STD    4.8163     0.4639    10.3825   0.0000    3.9070    5.7256
TMIN_AVG   11.2381     0.3121    36.0131   0.0000   10.6264   11.8498
TMIN_STD    2.7699     0.3543     7.8171   0.0000    2

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD'],
      dtype='object')

In [401]:
# print(tc)
# print(testheat_df)
# print ('value',tc['TMAX_AVG'])

coeff_=tc['TMAX_AVG']
testheat_df['TMAX_AVG']=testheat_df['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_df['TMAX_STD']=testheat_df['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_df['TMIN_AVG']=testheat_df['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_df['TMIN_STD']=testheat_df['TMIN_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_AVG']
testheat_df['DAYL_AVG']=testheat_df['DAYL_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_STD']
testheat_df['DAYL_STD']=testheat_df['DAYL_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_AVG']
testheat_df['SRAD_AVG']=testheat_df['SRAD_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_STD']
testheat_df['SRAD_STD']=testheat_df['SRAD_STD'].apply (lambda x: coeff_ * x )


In [406]:

testheatreg_df=testheat_df[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]
real_=testheat_df["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testheatreg_df.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)


sqmeanerror YIELDREG    24.743205
dtype: float64
sqstderror YIELDREG    42.383336
dtype: float64


#### Grouping by Env to create new Linear Regression by Enviroment instead of Crop

In [413]:
heat_stress_env = joined_df[['ENV_ID','ENV_YIELD_MEAN','ENV_YIELD_STD','TMAX_AVG','TMAX_STD',
                            'TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD',
                           'SRAD_AVG','SRAD_STD']]
drought_stress_env = joined_df[['ENV_ID','ENV_YIELD_MEAN','ENV_YIELD_STD','IRRIGATION','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]



In [436]:
#Converting dataframes to create linear regressions based only on enviroments

heat_stress_env=heat_stress_env.drop_duplicates(subset=None, keep='first', inplace=False)
heat_stress_env=heat_stress_env.reset_index(level=0, drop=True)
# print(heat_stress_env)
print('HeatStressLen>',heat_stress_env.index)

drought_stress_env=drought_stress_env.drop_duplicates(subset=None, keep='first', inplace=False)
drought_stress_env=drought_stress_env.reset_index(level=0, drop=True)
# print(heat_stress_env)
print('DroughtStressLen>',drought_stress_env.index)

HeatStressLen> RangeIndex(start=0, stop=1560, step=1)
DroughtStressLen> RangeIndex(start=0, stop=1560, step=1)


# HEAT LR

In [496]:
trainheat_env=heat_stress_env.loc[:400]
testheat_env=heat_stress_env.loc[401:]
testheat_env=testheat_env[['ENV_ID','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]

features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','DAYL_AVG','DAYL_STD','SRAD_AVG','SRAD_STD']
X = trainheat_env[features_]
y = trainheat_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns


                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.982    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3308.6060
Date:               2018-11-30 10:47 BIC:                3340.5577
No. Observations:   401              Log-Likelihood:     -1646.3  
Df Model:           8                F-statistic:        2762.    
Df Residuals:       393              Prob (F-statistic): 0.00     
R-squared:          0.983            Scale:              219.92   
--------------------------------------------------------------------
            Coef.    Std.Err.      t      P>|t|     [0.025    0.975]
--------------------------------------------------------------------
TMAX_AVG   -0.8813     1.4884   -0.5921   0.5541   -3.8076    2.0450
TMAX_STD    4.6596     2.5594    1.8206   0.0694   -0.3722    9.6914
TMIN_AVG    8.3948     2.0279    4.1396   0.0000    4.4079   12.3817
TMIN_STD    2.1579     2.1022    1.0265   0.3053   -1.9750    6.2908

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD'],
      dtype='object')

In [497]:
#Test coefficients with test data

coeff_=tc['TMAX_AVG']
testheat_env['TMAX_AVG']=testheat_env['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_env['TMAX_STD']=testheat_env['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_env['TMIN_AVG']=testheat_env['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_env['TMIN_STD']=testheat_env['TMIN_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_AVG']
testheat_env['DAYL_AVG']=testheat_env['DAYL_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['DAYL_STD']
testheat_env['DAYL_STD']=testheat_env['DAYL_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_AVG']
testheat_env['SRAD_AVG']=testheat_env['SRAD_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_STD']
testheat_env['SRAD_STD']=testheat_env['SRAD_STD'].apply (lambda x: coeff_ * x )

testheatreg_env=testheat_env[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]
real_=testheat_env["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testheatreg_env.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)


sqmeanerror YIELDREG    37.84038
dtype: float64
sqstderror YIELDREG    56.95416
dtype: float64


In [515]:
#Taking Out DAYL as P Value is too large
trainheat_env=heat_stress_env.loc[:400]
testheat_env=heat_stress_env.loc[401:]
testheat_env=testheat_env[['ENV_ID','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]

features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD','SRAD_AVG','SRAD_STD']
X = trainheat_env[features_]
y = trainheat_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns


                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.982    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3304.8131
Date:               2018-11-30 11:40 BIC:                3328.7769
No. Observations:   401              Log-Likelihood:     -1646.4  
Df Model:           6                F-statistic:        3700.    
Df Residuals:       395              Prob (F-statistic): 0.00     
R-squared:          0.983            Scale:              218.92   
--------------------------------------------------------------------
            Coef.    Std.Err.      t      P>|t|     [0.025    0.975]
--------------------------------------------------------------------
TMAX_AVG   -0.4855     1.1132   -0.4362   0.6630   -2.6741    1.7030
TMAX_STD    5.0057     2.1721    2.3045   0.0217    0.7354    9.2760
TMIN_AVG    7.6073     1.0241    7.4281   0.0000    5.5939    9.6207
TMIN_STD    1.8440     1.9743    0.9340   0.3509   -2.0374    5.7255

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'SRAD_AVG', 'SRAD_STD'], dtype='object')

In [516]:
#Test coefficients with test data

coeff_=tc['TMAX_AVG']
testheat_env['TMAX_AVG']=testheat_env['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_env['TMAX_STD']=testheat_env['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_env['TMIN_AVG']=testheat_env['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_env['TMIN_STD']=testheat_env['TMIN_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_AVG']
testheat_env['SRAD_AVG']=testheat_env['SRAD_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['SRAD_STD']
testheat_env['SRAD_STD']=testheat_env['SRAD_STD'].apply (lambda x: coeff_ * x )


testheatreg_env=testheat_env[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD',
       'SRAD_AVG', 'SRAD_STD']]
real_=testheat_env["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testheatreg_env.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)

sqmeanerror YIELDREG    37.799961
dtype: float64
sqstderror YIELDREG    56.677163
dtype: float64


In [534]:
#Taking Out SRAD due to multicolinearity and no improvement in previous iteration
trainheat_env=heat_stress_env.loc[:400]
testheat_env=heat_stress_env.loc[401:]
testheat_env=testheat_env[['ENV_ID','ENV_YIELD_MEAN','TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD', 'DAYL_AVG', 'DAYL_STD',
       'SRAD_AVG', 'SRAD_STD']]



features_=['TMAX_AVG','TMAX_STD','TMIN_AVG','TMIN_STD']
X = trainheat_env[features_]
y = trainheat_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns


                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.982    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3311.7646
Date:               2018-11-30 12:07 BIC:                3327.7404
No. Observations:   401              Log-Likelihood:     -1651.9  
Df Model:           4                F-statistic:        5425.    
Df Residuals:       397              Prob (F-statistic): 0.00     
R-squared:          0.982            Scale:              223.85   
--------------------------------------------------------------------
              Coef.    Std.Err.     t      P>|t|     [0.025   0.975]
--------------------------------------------------------------------
TMAX_AVG      2.1037     0.5767   3.6478   0.0003    0.9699   3.2374
TMAX_STD      2.1553     1.9356   1.1135   0.2662   -1.6500   5.9606
TMIN_AVG      5.7322     0.8525   6.7239   0.0000    4.0562   7.4082
TMIN_STD      4.1380     1.8688   2.2142   0.0274    0.4639   7.8120

Index(['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD'], dtype='object')

In [535]:
#Test coefficients with test data

coeff_=tc['TMAX_AVG']
testheat_env['TMAX_AVG']=testheat_env['TMAX_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMAX_STD']
testheat_env['TMAX_STD']=testheat_env['TMAX_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_AVG']
testheat_env['TMIN_AVG']=testheat_env['TMIN_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['TMIN_STD']
testheat_env['TMIN_STD']=testheat_env['TMIN_STD'].apply (lambda x: coeff_ * x )

testheatreg_env=testheat_env[['TMAX_AVG', 'TMAX_STD', 'TMIN_AVG', 'TMIN_STD']]
real_=testheat_env["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testheatreg_env.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)



sqmeanerror YIELDREG    37.473368
dtype: float64
sqstderror YIELDREG    56.642275
dtype: float64


In [521]:
heat_coeff=tc
heat_coeff

Unnamed: 0,TMAX_AVG,TMAX_STD,TMIN_AVG,TMIN_STD
0,2.103662,2.155283,5.732214,4.137954


# DROUGHT LR

In [504]:
#DROUGHT

traindr_env=drought_stress_env.loc[:400]
testdr_env=drought_stress_env.loc[401:]
testdr_env=testdr_env[['ENV_ID','ENV_YIELD_MEAN','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]


features_=['PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']
X = traindr_env[features_]
y = traindr_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns


                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.981    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3334.7336
Date:               2018-11-30 11:03 BIC:                3366.6853
No. Observations:   401              Log-Likelihood:     -1659.4  
Df Model:           8                F-statistic:        2585.    
Df Residuals:       393              Prob (F-statistic): 0.00     
R-squared:          0.981            Scale:              234.73   
--------------------------------------------------------------------
            Coef.    Std.Err.      t      P>|t|     [0.025    0.975]
--------------------------------------------------------------------
PREC_AVG   -4.7865     3.0649   -1.5617   0.1192   -10.8121   1.2391
PREC_STD    4.4281     1.0583    4.1843   0.0000     2.3476   6.5087
KSAT        3.0154     0.4163    7.2433   0.0000     2.1969   3.8338
SWE_AVG    -0.6899     0.3668   -1.8806   0.0608    -1.4111   0.0313

Index(['PREC_AVG', 'PREC_STD', 'KSAT', 'SWE_AVG', 'SWE_STD', 'VP_AVG',
       'VP_STD', 'AWC'],
      dtype='object')

In [505]:
traindr_env=drought_stress_env.loc[:400]
testdr_env=drought_stress_env.loc[401:]
testdr_env=testdr_env[['ENV_ID','ENV_YIELD_MEAN','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]

#REMOVING SWE due to p value and low coeff

features_=['PREC_AVG','PREC_STD','KSAT','VP_AVG','VP_STD','AWC']
X = traindr_env[features_]
y = traindr_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns


                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.981    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3342.2176
Date:               2018-11-30 11:15 BIC:                3366.1814
No. Observations:   401              Log-Likelihood:     -1665.1  
Df Model:           6                F-statistic:        3364.    
Df Residuals:       395              Prob (F-statistic): 0.00     
R-squared:          0.981            Scale:              240.32   
-------------------------------------------------------------------
              Coef.    Std.Err.     t     P>|t|    [0.025    0.975]
-------------------------------------------------------------------
PREC_AVG     -12.1006    1.8652  -6.4875  0.0000  -15.7676  -8.4336
PREC_STD       5.8866    0.9426   6.2449  0.0000    4.0334   7.7398
KSAT           2.8337    0.4177   6.7839  0.0000    2.0125   3.6549
VP_AVG         0.1073    0.0133   8.0929  0.0000    0.0813   0.1334
VP_STD

Index(['PREC_AVG', 'PREC_STD', 'KSAT', 'VP_AVG', 'VP_STD', 'AWC'], dtype='object')

In [506]:
#Test coefficients with test data


'PREC_AVG','PREC_STD','KSAT','VP_AVG','VP_STD','AWC'

coeff_=tc['PREC_AVG']
testdr_env['PREC_AVG']=testdr_env['PREC_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['PREC_STD']
testdr_env['PREC_STD']=testdr_env['PREC_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['KSAT']
testdr_env['KSAT']=testdr_env['KSAT'].apply (lambda x: coeff_ * x )
coeff_=tc['VP_AVG']
testdr_env['VP_AVG']=testdr_env['VP_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['VP_STD']
testdr_env['VP_STD']=testdr_env['VP_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['AWC']
testdr_env['AWC']=testdr_env['AWC'].apply (lambda x: coeff_ * x )


testdrreg_env=testdr_env[['PREC_AVG','PREC_STD','KSAT','VP_AVG','VP_STD','AWC']]
real_=testdr_env["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testdrreg_env.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)


sqmeanerror YIELDREG    38.425607
dtype: float64
sqstderror YIELDREG    62.835857
dtype: float64


In [522]:
traindr_env=drought_stress_env.loc[:400]
testdr_env=drought_stress_env.loc[401:]
testdr_env=testdr_env[['ENV_ID','ENV_YIELD_MEAN','PREC_AVG','PREC_STD','KSAT',
                              'SWE_AVG','SWE_STD','VP_AVG','VP_STD','AWC']]

#REMOVING VP due to low coeff

features_=['PREC_AVG','PREC_STD','KSAT','AWC']
X = traindr_env[features_]
y = traindr_env["ENV_YIELD_MEAN"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
print(model.summary2())

coeff_df=pd.DataFrame(model.params)
print(coeff_df)
tc=coeff_df.transpose()
tc.columns

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.977    
Dependent Variable: ENV_YIELD_MEAN   AIC:                3403.6910
Date:               2018-11-30 11:46 BIC:                3419.6668
No. Observations:   401              Log-Likelihood:     -1697.8  
Df Model:           4                F-statistic:        4293.    
Df Residuals:       397              Prob (F-statistic): 0.00     
R-squared:          0.977            Scale:              281.52   
--------------------------------------------------------------------
            Coef.    Std.Err.      t      P>|t|     [0.025    0.975]
--------------------------------------------------------------------
PREC_AVG   -3.4617     1.4986   -2.3099   0.0214   -6.4079   -0.5155
PREC_STD    5.7836     0.8061    7.1748   0.0000    4.1989    7.3684
KSAT        3.5621     0.4047    8.8025   0.0000    2.7665    4.3576
AWC         3.1812     0.1914   16.6218   0.0000    2.8049    3.5574

Index(['PREC_AVG', 'PREC_STD', 'KSAT', 'AWC'], dtype='object')

In [523]:
#Test coefficients with test data


# 'PREC_AVG','PREC_STD','KSAT','VP_AVG','VP_STD','AWC'

coeff_=tc['PREC_AVG']
testdr_env['PREC_AVG']=testdr_env['PREC_AVG'].apply (lambda x: coeff_ * x )
coeff_=tc['PREC_STD']
testdr_env['PREC_STD']=testdr_env['PREC_STD'].apply (lambda x: coeff_ * x )
coeff_=tc['KSAT']
testdr_env['KSAT']=testdr_env['KSAT'].apply (lambda x: coeff_ * x )


coeff_=tc['AWC']
testdr_env['AWC']=testdr_env['AWC'].apply (lambda x: coeff_ * x )


testdrreg_env=testdr_env[['PREC_AVG','PREC_STD','KSAT','AWC']]
real_=testdr_env["ENV_YIELD_MEAN"]
regress_=[]
regress_=list(testdrreg_env.sum(1))
regress_=pd.DataFrame(regress_, columns=['YIELDREG'])
real_=pd.DataFrame(list(real_), columns=['YIELDREG'])
# print(regress_)
# print(real_)
error_=(real_-regress_)**2
# print(error_)
print('sqmeanerror',error_.mean()**0.5)
print('sqstderror',error_.std()**0.5)


sqmeanerror YIELDREG    26.160086
dtype: float64
sqstderror YIELDREG    33.444837
dtype: float64


In [524]:
drought_coeff=tc
drought_coeff

Unnamed: 0,PREC_AVG,PREC_STD,KSAT,AWC
0,-3.461698,5.783615,3.56207,3.181174
