This dataset is part of the [Farming Systems Project](https://www.ars.usda.gov/northeast-area/beltsville-md-barc/beltsville-agricultural-research-center/sustainable-agricultural-systems-laboratory/docs/farming-systems-project/) at USDA, Beltsville MD.  This data is not available online on the USDA website but can be found on my [GitHub](https://github.com/mmtokay/DATA606/tree/master/datasets).


The data is split into two files: one contains crop information and the other contains weather data.

Crop file:
* Crop - wheat, corn or soybean           
* GrowingSeason - year crop was cultivated 
* SystemName - crop management (traditional: NT and CT; organic: Org2, Org3 and Org6)
* GrainYield - grain yield measured in kg/ha     
* PlantingDate - date seeds were planted  
* HarvestDate - date crop was harvested


Weather file:
* Year 
* Julian Day 
* Month
* Day
* Date
* avgtTempC - average temperature in C
* maxTempC - maximum temperature in C
* minTempC - minimum temperature in C
* maxHumPct - maximum humidity in %
* minHumPct - minimum humidity in %
* avgRadWm-2 - average radiation in W/m²
* meanWindMs-1 - mean wind in m/s
* PrecipitationMm - precipitation/snow melt in mm

# Feature Engineering

In [None]:
import io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import warnings
import time

from datetime import datetime, timedelta
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifier
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, RobustScaler, Normalizer, MinMaxScaler, StandardScaler, Binarizer
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.utils import shuffle
from time import time

# Silence FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Crop Data
Import crop data file.

In [None]:
# Load the crop yield records and print the column names, dtypes and
# non-null counts.
data = pd.read_csv('./dataset/FSPGrainYieldsV3Clean.csv')
data.info()

In [None]:
# Convert PlantingDate and HarvestDate from object to datetime so that they
# can be subtracted and compared with the weather dates later on.
data['PlantingDate'] = pd.to_datetime(data.PlantingDate)
data['HarvestDate'] = pd.to_datetime(data.HarvestDate)

print("\nLet's check if there is any data missing on the dataset.\n")
# Count of missing values per column (last expression of the cell).
data.isna().sum()

In [None]:
# Calculate duration between PlantingDate and HarvestDate
data['weekDuration'] = data['HarvestDate'] - data['PlantingDate']
# Dividing the timedelta by one week expresses the duration in (fractional) weeks.
data['weekDuration'] = data['weekDuration']/np.timedelta64(1,'W')
print('\nCheck unique values for Crop, GrowingSeason and SystemName columns.\n')
print("Crop", data.Crop.unique())
print("\nGrowing Season", data.GrowingSeason.unique())
print("\nCrop Management Type", data.SystemName.unique())
print("\nThere are duplicate values for SystemManagement because column values are case sensitive. I will convert SystemName column to uppercase.")
# Upper-case every SystemName so values that differ only by case collapse
# into one category.
data['SystemName'] = data['SystemName'].str.upper()
print("\nCrop Management Type", data.SystemName.unique())

In [None]:
# Encode the management system as a binary flag stored as a string:
# '1' for conventional (SystemName is NT or CT)
# '0' for organic (every other SystemName)
data['SystemNameType'] = ((data.SystemName == "NT") | (data.SystemName == "CT")).map({True:'1', False:'0'})
# Drop SystemName column, which is now represented by SystemNameType
data.drop('SystemName', axis=1, inplace=True)
data.head()

I will separate the data by crop (corn, soybean and wheat) and display basic statistics for each crop.

# Corn dataset - Statistics

In [None]:
# Select the corn rows (Crop == "CRN") and renumber them from 0.
# reset_index() is chained on the filtered result instead of being applied
# in place, so no in-place modification is performed on a selection of
# `data`.  As before, the previous index is kept as an 'index' column.
data_corn = data.loc[data['Crop'] == "CRN"].reset_index()
data_corn.describe(include="all")

# Soybean dataset - Statistics

In [None]:
# Select the soybean rows (Crop == "SOY") and renumber them from 0.
# reset_index() is chained on the filtered result instead of being applied
# in place, so no in-place modification is performed on a selection of
# `data`.  As before, the previous index is kept as an 'index' column.
data_soy = data.loc[data['Crop'] == "SOY"].reset_index()
data_soy.describe(include="all")

# Wheat dataset - Statistics

In [None]:
# Select the wheat rows (Crop == "WHT") and renumber them from 0.
# reset_index() is chained on the filtered result instead of being applied
# in place, so no in-place modification is performed on a selection of
# `data`.  As before, the previous index is kept as an 'index' column.
data_wheat = data.loc[data['Crop'] == "WHT"].reset_index()
data_wheat.describe(include="all")

# Week Duration For Each Crop
I will use the minimum week duration to generate weather features:

corn = 16

soybeans = 15

wheat = 31


# Weather Data

Import weather data.

In [None]:
# Load the daily weather records and parse the Date column as datetime so it
# can be compared with the crop planting dates.
weather_data = pd.read_csv('./dataset/FSPWeather1996-2019V2.csv')
weather_data['Date'] = pd.to_datetime(weather_data.Date)
print(weather_data.describe(include="all"))
print("\nLet's check if there is any data missing on the dataset.\n")
# Count of missing values per column.
print(weather_data.isna().sum())

In [None]:
# Remove the calendar breakdown columns and the radiation column; the Date
# column is kept for date-range selection.
# NOTE(review): ' Day' is spelled with a leading space -- presumably this
# matches the CSV header exactly; verify against the file.
weather_data.drop(['Year','JulianDay','Month',' Day','avgRadWm-2'], axis=1, inplace=True)
print(weather_data.describe(include="all"))

I will calculate growing degree days (GDD), which "are used to estimate the growth and development of plants and insects during the growing season. The basic concept is that development will only occur if the temperature exceeds some minimum development threshold, or base temperature (TBASE). The base temperatures are determined experimentally and are different for each organism" [1].

GDD formula for corn and soybean:

GDD = (Daily Max Temp °C + Daily Min Temp °C) / 2 - 10

GDD formula for wheat:

GDD = (Daily Max Temp °C + Daily Min Temp °C) / 2 - 4.4

In [None]:
def calcGDD(df,startDate,endDate,factor):
    """Accumulate growing degree days over an inclusive date range.

    Every row of ``df`` whose ``Date`` lies in ``[startDate, endDate]``
    contributes ``(maxTempC + minTempC) / 2 - factor`` to a running total.

    Parameters
    ----------
    df : DataFrame with ``Date``, ``maxTempC`` and ``minTempC`` columns.
    startDate, endDate : first and last date of the range (both included).
    factor : base temperature subtracted from each daily mean.

    Returns
    -------
    The accumulated total, or ``0`` when no row falls inside the range.

    NOTE(review): daily contributions are not clamped at zero, so days whose
    mean temperature is below ``factor`` reduce the total -- confirm that
    this is intended.
    """
    inRange = (df.Date >= startDate) & (df.Date <= endDate)
    window = df.loc[inRange]
    total = 0
    for high, low in zip(window['maxTempC'], window['minTempC']):
        total = total + (((high + low) / 2) - factor)
    return total

def calcAverage(df,startDate,endDate,var):
    """Return the arithmetic mean of column ``var`` over an inclusive date range.

    Parameters
    ----------
    df : DataFrame with a ``Date`` column and a numeric column named ``var``.
    startDate, endDate : first and last date of the range (both included).
    var : name of the column to average.

    Returns
    -------
    The mean of the selected values, or ``0`` when no row falls inside the
    range.  Missing values are skipped by ``Series.mean``.
    """
    inRange = (df.Date >= startDate) & (df.Date <= endDate)
    values = df.loc[inRange, var]
    if values.empty:
        return 0
    # Divide by the number of selected rows (not by an index label), and
    # average negative or zero totals as well, e.g. sub-zero temperatures.
    return values.mean()

def calcMax(df,startDate,endDate,var):
    """Return the largest value of column ``var`` over an inclusive date range.

    Parameters
    ----------
    df : DataFrame with a ``Date`` column and a numeric column named ``var``.
    startDate, endDate : first and last date of the range (both included).
    var : name of the column to inspect.

    Returns
    -------
    The maximum of the selected values.  Missing values are skipped by
    ``Series.max``, so the result does not depend on where a NaN sits.

    Raises
    ------
    ValueError
        If no row of ``df`` falls inside the range.
    """
    inRange = (df.Date >= startDate) & (df.Date <= endDate)
    values = df.loc[inRange, var]
    if values.empty:
        raise ValueError(
            "calcMax: no rows between %s and %s" % (startDate, endDate))
    return values.max()

def calcMin(df,startDate,endDate,var):
    """Return the smallest value of column ``var`` over an inclusive date range.

    Parameters
    ----------
    df : DataFrame with a ``Date`` column and a numeric column named ``var``.
    startDate, endDate : first and last date of the range (both included).
    var : name of the column to inspect.

    Returns
    -------
    The minimum of the selected values.  Missing values are skipped by
    ``Series.min``, so the result does not depend on where a NaN sits.

    Raises
    ------
    ValueError
        If no row of ``df`` falls inside the range.
    """
    inRange = (df.Date >= startDate) & (df.Date <= endDate)
    values = df.loc[inRange, var]
    if values.empty:
        raise ValueError(
            "calcMin: no rows between %s and %s" % (startDate, endDate))
    return values.min()

def calcSum(df,startDate,endDate,var):
    """Add up column ``var`` over an inclusive date range.

    Parameters
    ----------
    df : DataFrame with a ``Date`` column and a numeric column named ``var``.
    startDate, endDate : first and last date of the range (both included).
    var : name of the column to total.

    Returns
    -------
    The running total of the selected values, accumulated in row order, or
    ``0`` when no row falls inside the range.
    """
    inRange = (df.Date >= startDate) & (df.Date <= endDate)
    selected = df.loc[inRange]
    total = 0
    for value in selected[var]:
        total = total + value
    return total

def createFeaturesMatrix(cropData,weatherData,numWeeks,GDDFactor):
    """Build one row of weekly weather features per crop record.

    For every row of ``cropData`` the ``numWeeks`` consecutive 7-day windows
    that start at that row's ``PlantingDate`` are summarised from
    ``weatherData`` with ``calcAverage``, ``calcMax``, ``calcMin`` and
    ``calcSum``.  Week ``k`` (1-based) contributes, in this order, the columns
    ``avgTemp<k>``, ``maxTemp<k>``, ``minTemp<k>``, ``maxHum<k>``,
    ``minHum<k>``, ``meanWind<k>`` and ``Precip<k>``.  After the last week
    each row is completed with ``GDD`` (``calcGDD`` from the row's planting
    date to the end of its last window), ``SystemNameType`` and
    ``GrainYield``.

    Parameters
    ----------
    cropData : DataFrame with ``PlantingDate``, ``SystemNameType`` and
        ``GrainYield`` columns.
    weatherData : DataFrame with a ``Date`` column and the columns
        ``avgtTempC``, ``maxTempC``, ``minTempC``, ``maxHumPct``,
        ``minHumPct``, ``meanWindMs-1`` and ``PrecipitationMm``.
    numWeeks : number of weekly windows to generate per crop record.
    GDDFactor : base temperature handed to ``calcGDD``.

    Returns
    -------
    DataFrame with one row per row of ``cropData`` and the columns described
    above.
    """
    # (column prefix, aggregation helper, weather column) for each weekly feature.
    weeklyStats = (
        ('avgTemp', calcAverage, 'avgtTempC'),
        ('maxTemp', calcMax, 'maxTempC'),
        ('minTemp', calcMin, 'minTempC'),
        ('maxHum', calcMax, 'maxHumPct'),
        ('minHum', calcMin, 'minHumPct'),
        ('meanWind', calcAverage, 'meanWindMs-1'),
        ('Precip', calcSum, 'PrecipitationMm'),
    )

    # Column names depend only on numWeeks, so they are built up front rather
    # than while visiting the row labelled 0 (which may not exist).
    colNames = [prefix + str(w + 1)
                for w in range(numWeeks)
                for prefix, _, _ in weeklyStats]
    if numWeeks > 0:
        colNames += ['GDD', 'SystemNameType', 'GrainYield']

    rows = []
    for _, crop in cropData.iterrows():
        # Every record is measured from its own planting date, including GDD.
        plantingDate = crop['PlantingDate']
        features = []
        for w in range(numWeeks):
            beginWeek = plantingDate + timedelta(days=7 * w)
            # The helpers include both bounds, so +6 days yields exactly
            # seven days and consecutive windows do not share a day.
            endWeek = beginWeek + timedelta(days=6)
            features.extend(
                aggregate(weatherData, beginWeek, endWeek, column)
                for _, aggregate, column in weeklyStats)
        if numWeeks > 0:
            features.append(calcGDD(weatherData, plantingDate, endWeek, GDDFactor))
            features.append(crop['SystemNameType'])
            features.append(crop['GrainYield'])
        rows.append(tuple(features))

    return pd.DataFrame(rows, columns=colNames)

# Corn

In [None]:
# Drop the columns that createFeaturesMatrix does not read.
data_corn.drop(['Crop','GrowingSeason','HarvestDate','weekDuration'], axis=1, inplace=True)

In [None]:
# Corn features for 16 weekly windows after planting (GDD base 10), saved as CSV.
new_df16 = createFeaturesMatrix(data_corn,weather_data,16,10)
new_df16.to_csv(r'./dataset/cornFeatures16w.csv', index = False, header=True)

In [None]:
# Corn features for 15 weekly windows after planting (GDD base 10), saved as CSV.
new_df15 = createFeaturesMatrix(data_corn,weather_data,15,10)
new_df15.to_csv(r'./dataset/cornFeatures15w.csv', index = False, header=True)

In [None]:
# Corn features for 14 weekly windows after planting (GDD base 10), saved as CSV.
new_df14 = createFeaturesMatrix(data_corn,weather_data,14,10)
new_df14.to_csv(r'./dataset/cornFeatures14w.csv', index = False, header=True)

In [None]:
# Corn features for 13 weekly windows after planting (GDD base 10), saved as CSV.
new_df13 = createFeaturesMatrix(data_corn,weather_data,13,10)
new_df13.to_csv(r'./dataset/cornFeatures13w.csv', index = False, header=True)

# Soybean

In [None]:
# Drop the columns that createFeaturesMatrix does not read.
data_soy.drop(['Crop','GrowingSeason','HarvestDate','weekDuration'], axis=1, inplace=True)

In [None]:
# Soybean features for 15 weekly windows after planting (GDD base 10), saved as CSV.
new_soy_df15 = createFeaturesMatrix(data_soy,weather_data,15,10)
new_soy_df15.to_csv(r'./dataset/soyFeatures15w.csv', index = False, header=True)

In [None]:
# Soybean features for 14 weekly windows after planting (GDD base 10), saved as CSV.
new_soy_df14 = createFeaturesMatrix(data_soy,weather_data,14,10)
new_soy_df14.to_csv(r'./dataset/soyFeatures14w.csv', index = False, header=True)

In [None]:
# Soybean features for 13 weekly windows after planting (GDD base 10), saved as CSV.
new_soy_df13 = createFeaturesMatrix(data_soy,weather_data,13,10)
new_soy_df13.to_csv(r'./dataset/soyFeatures13w.csv', index = False, header=True)

In [None]:
# Soybean features for 12 weekly windows after planting (GDD base 10), saved as CSV.
new_soy_df12 = createFeaturesMatrix(data_soy,weather_data,12,10)
new_soy_df12.to_csv(r'./dataset/soyFeatures12w.csv', index = False, header=True)

# Wheat

In [None]:
# Drop the columns that createFeaturesMatrix does not read.
data_wheat.drop(['Crop','GrowingSeason','HarvestDate','weekDuration'], axis=1, inplace=True)

In [None]:
# Wheat features for 31 weekly windows after planting (GDD base 4.4), saved as CSV.
new_wheat_df31 = createFeaturesMatrix(data_wheat,weather_data,31,4.4)
new_wheat_df31.to_csv(r'./dataset/wheatFeatures31w.csv', index = False, header=True)

In [None]:
# Wheat features for 30 weekly windows after planting (GDD base 4.4), saved as CSV.
new_wheat_df30 = createFeaturesMatrix(data_wheat,weather_data,30,4.4)
new_wheat_df30.to_csv(r'./dataset/wheatFeatures30w.csv', index = False, header=True)

In [None]:
# Wheat features for 29 weekly windows after planting (GDD base 4.4), saved as CSV.
new_wheat_df29 = createFeaturesMatrix(data_wheat,weather_data,29,4.4)
new_wheat_df29.to_csv(r'./dataset/wheatFeatures29w.csv', index = False, header=True)

In [None]:
# Wheat features for 28 weekly windows after planting (GDD base 4.4), saved as CSV.
new_wheat_df28 = createFeaturesMatrix(data_wheat,weather_data,28,4.4)
new_wheat_df28.to_csv(r'./dataset/wheatFeatures28w.csv', index = False, header=True)