In [1]:
# Import all needed libraries and sublibraries

import tensorflow as tf

import numpy as np 
    
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


import pandas as pd

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
dataframe = pd.read_csv("aafc_data.csv")  # importing the dataset
# summarize shape
print(dataframe.shape)
print("\n")
# summarize first few lines
dataframe.head()

(42007, 166)




Unnamed: 0.1,Unnamed: 0,TWP_ID,ECODISTRICT_ID,YEAR,YieldKgAcre,SumPcpn18_20,SumPcpn19_21,SumPcpn20_22,SumPcpn21_23,SumPcpn22_24,...,SoilMoisture29_31,SoilMoisture30_32,SoilMoisture31_33,SoilMoisture32_34,SoilMoisture33_35,SoilMoisture34_36,SoilMoisture35_37,SoilMoisture36_38,SoilMoisture37_39,SoilMoisture38_40
0,0,00101E1,852.0,2010,867.766846,53.6,111.1,109.7,117.9,46.4,...,16.960125,18.766207,17.186998,15.461519,19.738222,22.958089,27.206203,26.480087,28.678156,26.308484
1,1,00101W1,852.0,2010,673.685028,57.2,114.7,110.5,114.0,46.2,...,16.32852,17.926029,16.787544,14.779726,20.245149,23.608204,28.56099,27.324254,29.079177,26.927224
2,2,00101W2,796.0,2010,824.303864,39.0,96.4,109.8,101.2,111.4,...,13.117879,12.869142,12.831834,14.126196,16.385776,18.650751,20.287069,20.514132,19.564788,16.681692
3,3,00102E1,853.0,2010,1006.708496,37.5,158.2,157.8,161.4,46.9,...,17.060778,18.699156,17.345822,15.998957,20.091525,22.761273,26.33743,25.559602,27.611729,25.575794
4,4,00102W1,852.0,2010,869.040283,57.2,114.7,110.5,114.0,46.2,...,16.050993,17.55686,16.612026,14.48015,20.467884,23.893858,29.156274,27.695178,29.255386,27.199097


In [3]:
# dropping those columns that are not useful
dataframe.drop(['Unnamed: 0', 'TWP_ID', 'YEAR'], axis=1, inplace=True)

In [4]:
# looking at the dataset after dropping the columns 
dataframe.head()

Unnamed: 0,ECODISTRICT_ID,YieldKgAcre,SumPcpn18_20,SumPcpn19_21,SumPcpn20_22,SumPcpn21_23,SumPcpn22_24,SumPcpn23_25,SumPcpn24_26,SumPcpn25_27,...,SoilMoisture29_31,SoilMoisture30_32,SoilMoisture31_33,SoilMoisture32_34,SoilMoisture33_35,SoilMoisture34_36,SoilMoisture35_37,SoilMoisture36_38,SoilMoisture37_39,SoilMoisture38_40
0,852.0,867.766846,53.6,111.1,109.7,117.9,46.4,69.3,60.0,44.6,...,16.960125,18.766207,17.186998,15.461519,19.738222,22.958089,27.206203,26.480087,28.678156,26.308484
1,852.0,673.685028,57.2,114.7,110.5,114.0,46.2,68.1,55.9,34.9,...,16.32852,17.926029,16.787544,14.779726,20.245149,23.608204,28.56099,27.324254,29.079177,26.927224
2,796.0,824.303864,39.0,96.4,109.8,101.2,111.4,153.0,163.6,98.8,...,13.117879,12.869142,12.831834,14.126196,16.385776,18.650751,20.287069,20.514132,19.564788,16.681692
3,853.0,1006.708496,37.5,158.2,157.8,161.4,46.9,79.5,67.5,40.4,...,17.060778,18.699156,17.345822,15.998957,20.091525,22.761273,26.33743,25.559602,27.611729,25.575794
4,852.0,869.040283,57.2,114.7,110.5,114.0,46.2,68.1,55.9,34.9,...,16.050993,17.55686,16.612026,14.48015,20.467884,23.893858,29.156274,27.695178,29.255386,27.199097


In [5]:
eco_district_ids_list = [748, 826, 752, 745, 808, 792, 849, 729, 753, 709]

In [6]:
msetest = []
msetrain = []
mae = []
acc = []
for i in eco_district_ids_list:
    df1 = dataframe[dataframe['ECODISTRICT_ID']==i]   # created a dataframe for one of the eco district
    df1.drop(['ECODISTRICT_ID'], axis=1, inplace=True)  # dropped the 'ECODISTRICT_ID' column before modeling

    # split data into X and y
    x = pd.DataFrame(df1.drop(labels=['YieldKgAcre'], axis=1)) # x contains the predictors (not the target value 'YieldKgAcre')
    y = pd.DataFrame(df1['YieldKgAcre'])                       # y contains the dependent variable ('YieldKgAcre').
    
    
    # splitting the x and y datasets to train and test with the ratio 80:20 ratio.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

# Run standardization on X variables
# So that it centers around 0 and has standard deviation of 1.
    X_train = scale(X_train)
    X_test = scale(X_test)
    
    
    
    # Defines "deep" model and its structure
    model = Sequential()
    model.add(Dense(15, input_shape=(161,), activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(15, activation='relu'))    
    model.add(Dense(1,))
    model.compile(Adam(lr=0.003), 'mean_squared_error')

# Pass several parameters to 'EarlyStopping' function and assigns it to 'earlystopper'
    earlystopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto')

# Fits model over 2000 iterations with 'earlystopper' callback, and assigns it to history
    history = model.fit(X_train, y_train, epochs = 2000, validation_split = 0.2,shuffle = True, verbose = 0, 
                    callbacks = [earlystopper])

# Plots 'history'
# history_dict=history.history
# loss_values = history_dict['loss']
# val_loss_values=history_dict['val_loss']
# plt.plot(loss_values,'bo',label='training loss')
# plt.plot(val_loss_values,'r',label='training loss val')

# Runs model with its current weights on the training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    msetest.append(np.sqrt(mean_squared_error(y_test,y_test_pred)))
    msetrain.append(np.sqrt(mean_squared_error(y_train,y_train_pred)))
# Calculates and prints r2 score of training and testing data
    print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred)))
    print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_test_pred)))
    
    # Calculate the absolute errors
    errors = abs(y_test_pred - y_test)

#  # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', np.mean(errors), 'degrees.')
    mae.append(np.mean(errors))


# Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / y_test)

# Calculate and display the accuracy
    accuracy = 100 - np.mean(mape)
    #print('Accuracy:', round(accuracy, 2), '%.')
    acc.append(round(accuracy, 2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 88: early stopping
The R2 score on the Train set is:	0.859
The R2 score on the Test set is:	0.722


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 51: early stopping
The R2 score on the Train set is:	0.756
The R2 score on the Test set is:	0.682


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 61: early stopping
The R2 score on the Train set is:	0.867
The R2 score on the Test set is:	0.795


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 76: early stopping
The R2 score on the Train set is:	0.910
The R2 score on the Test set is:	0.850


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 58: early stopping
The R2 score on the Train set is:	0.767
The R2 score on the Test set is:	0.678


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 55: early stopping
The R2 score on the Train set is:	0.817
The R2 score on the Test set is:	0.638


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 69: early stopping
The R2 score on the Train set is:	0.915
The R2 score on the Test set is:	0.900


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 51: early stopping
The R2 score on the Train set is:	0.753
The R2 score on the Test set is:	0.625


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 43: early stopping
The R2 score on the Train set is:	0.880
The R2 score on the Test set is:	0.731


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  super(Adam, self).__init__(name, **kwargs)


Epoch 66: early stopping
The R2 score on the Train set is:	0.881
The R2 score on the Test set is:	0.816


In [7]:
import re
accn=[]
for i in range(len(acc)):
    acc[i] = acc[i].to_string()
    sentence = acc[i]
    s = [float(s) for s in re.findall('[0-9.]+', sentence)]
    accn.append(s)

In [8]:
maen=[]
for i in range(len(mae)):
    mae[i] = mae[i].to_string()
    sentence = mae[i]
    s = [float(s) for s in re.findall('[0-9.]+', sentence)]
    maen.append(s)

In [9]:
mae = [item for sublist in maen for item in sublist]
acc = [item for sublist in accn for item in sublist]

In [10]:
results = pd.DataFrame(list(zip(msetrain, msetest, mae, acc)),
               columns =['MSE Train', 'MSE Test', 'Mean Absolute Error', 'Accuracy'])

In [11]:
results

Unnamed: 0,MSE Train,MSE Test,Mean Absolute Error,Accuracy
0,74.110524,102.131297,82.878678,88.1
1,101.478231,114.354444,87.908937,86.56
2,76.431874,94.539204,72.183839,89.42
3,68.938673,88.434558,67.704378,91.59
4,96.541757,120.209507,95.298198,87.07
5,87.720692,121.956178,96.095012,87.65
6,74.637193,82.104415,64.682261,91.73
7,84.472468,102.762788,79.520072,91.48
8,60.206442,96.71421,73.650977,91.33
9,82.431152,113.937669,87.070471,89.61


In [13]:
results["Accuracy"].mean()

89.454