In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold, cross_val_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor

In [56]:
class pcr_rf_ecodistrict_model_fit():
    def __init__(self,aafc_data, ecodistrict):
        self.aafc_data=aafc_data
        self.ecodistrict=ecodistrict
        
    def model_train_test(self):
        pca = PCA()
        data=self.aafc_data[self.aafc_data['ECODISTRICT_ID']==self.ecodistrict]
        records=len(data)
        unique_twnships=data['TWP_ID'].nunique()
        labels = data['YieldKgAcre']
        features= data.drop(['YieldKgAcre'], axis = 1)
        
        train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.20, random_state = 42)
        scaler = StandardScaler().fit(train_features.loc[:, ~train_features.columns.isin(['TWP_ID', 'ECODISTRICT_ID', 'YEAR'])])

        train_index=train_features
        test_index=test_features
        
        X_train_scaled = scale(train_features.loc[:, ~features.columns.isin(['TWP_ID', 'ECODISTRICT_ID', 'YEAR'])])
        X_test_scaled = scale(test_features.loc[:, ~features.columns.isin(['TWP_ID', 'ECODISTRICT_ID', 'YEAR'])])
        
        X_train_pc = pca.fit_transform(X_train_scaled)

        result = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100) ## List of cummulative variances by all the principal components in %
        for j in range(0, len(result)):
            if result[j] > 98:
                index = j
                variance = result[j]
                break
                
        # determine optimal number of principal components
        best_pc_num = index + 1
        
        
        rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
        rf.fit(X_train_pc[:,:best_pc_num], train_labels);
        X_test_pc = pca.transform(X_test_scaled)[:,:best_pc_num]
        pred_train = rf.predict(X_train_pc[:,:best_pc_num])
        pred_test = rf.predict(X_test_pc[:,:best_pc_num])

        return scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance
        
        
        

In [57]:
class pcr_rf_ecodistrict_model_validation_scoring(pcr_rf_ecodistrict_model_fit):
    
    def __init__(self,aafc_data, ecodistrict):
        pcr_rf_ecodistrict_model_fit.__init__(self,aafc_data, ecodistrict)
    
    def validation_metrics(self):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        
        mse_train = round(mean_squared_error(train_labels, pred_train,squared=False),2)
        # Test data
        mse_test =round(mean_squared_error(test_labels, pred_test,squared=False),2)
        
        
 
        errors = abs(pred_test - test_labels)

        mae=round(np.mean(errors), 2)
        

        mape = 100 * (errors / test_labels)
        # Calculate and display accuracy
        accuracy = round(100 - np.mean(mape),2)
        
        print("Mean Squared Error Train: ",mse_train)
        print("Mean Squared Error Test: ",mse_test)

        print("Mean Absolute Error: ",mae)
        print("Accuracy: ",accuracy)
    
    def predicted_train_dataset(self):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        train_index['Predicted_Yield']=pred_train
        return train_index
    
    def predicted_test_dataset(self):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        test_index['Predicted_Yield']=pred_test
        return test_index
    
    def number_principal_components(self):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        print("Number of Principal Components is:",best_pc_num)

    def cummulative_explained_variance(self):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        print("Cummulative Explained Variance is:",round(variance,2),"%")
    
    def score(self,data):
        scaler,rf,train_labels,test_labels,train_index,test_index,pred_train,pred_test,best_pc_num,variance=pcr_rf_ecodistrict_model_fit.model_train_test(self)
        features=scaler.transform(data.loc[:, ~data.columns.isin(['TWP_ID', 'ECODISTRICT_ID', 'YEAR'])])
        
        predicted_yield = rf.predict(features)
        data['Predicted_Yield']=predicted_yield        
        return data

    


In [58]:
import pandas as pd
data=pd.read_csv("aafc_data.csv",index_col='Unnamed: 0')

In [59]:
test=pcr_rf_ecodistrict_model_validation_scoring(data,748)
test.validation_metrics()

Mean Squared Error Train:  30.68
Mean Squared Error Test:  87.13
Mean Absolute Error:  65.59
Accuracy:  90.62


In [46]:
test.predicted_train_dataset()

Unnamed: 0,TWP_ID,ECODISTRICT_ID,YEAR,SumPcpn18_20,SumPcpn19_21,SumPcpn20_22,SumPcpn21_23,SumPcpn22_24,SumPcpn23_25,SumPcpn24_26,...,SoilMoisture30_32,SoilMoisture31_33,SoilMoisture32_34,SoilMoisture33_35,SoilMoisture34_36,SoilMoisture35_37,SoilMoisture36_38,SoilMoisture37_39,SoilMoisture38_40,Predicted_Yield
13295,02213W2,748.0,2013,6.1,2.3,20.2,36.9,59.2,71.8,85.9,...,17.981174,16.765533,16.276152,16.673688,17.371735,17.082472,18.084177,19.738484,21.282022,961.884567
13831,02914W2,748.0,2013,15.1,16.2,13.2,13.0,40.3,78.4,77.0,...,21.885810,19.811717,19.821398,20.094774,20.900401,19.940595,19.568532,20.794136,22.447396,879.706169
2361,02908W2,748.0,2010,31.0,53.0,68.0,93.5,80.5,91.0,134.5,...,25.642627,26.917807,28.146627,28.435319,29.932915,30.308479,31.161005,30.483541,29.885650,413.298236
29253,03118W2,748.0,2017,7.9,17.6,13.1,16.1,38.9,44.0,46.6,...,21.074314,20.242132,20.079251,19.196583,18.761889,19.484885,20.396568,21.120691,22.933938,1080.909099
5880,02511W2,748.0,2011,31.4,53.0,66.0,69.1,99.6,130.6,119.2,...,31.077518,32.096615,31.934064,31.750426,30.942115,30.284993,29.332133,27.532984,28.078072,610.418754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36658,02807W2,748.0,2019,10.1,6.0,4.2,7.5,52.5,74.6,72.7,...,12.861590,12.986172,13.241977,15.142962,16.697468,18.759806,20.120438,20.806290,21.384953,1104.868079
40089,02308W2,748.0,2020,16.8,9.1,6.5,17.0,22.3,22.1,21.1,...,11.924898,11.098456,11.819164,11.887890,12.729781,13.376161,14.056077,14.345916,14.828639,796.611542
40331,02610W2,748.0,2020,16.8,9.1,6.5,17.0,22.3,22.1,21.1,...,16.141255,15.989338,16.071511,15.328163,15.435112,15.717559,16.385579,16.719314,17.341310,933.443792
29236,03110W2,748.0,2017,7.9,17.6,13.1,16.1,38.9,44.0,46.6,...,19.187773,18.196467,17.062438,14.442647,14.311135,15.391966,17.002844,18.023474,19.315273,1004.069556


In [47]:
test.predicted_test_dataset()

Unnamed: 0,TWP_ID,ECODISTRICT_ID,YEAR,SumPcpn18_20,SumPcpn19_21,SumPcpn20_22,SumPcpn21_23,SumPcpn22_24,SumPcpn23_25,SumPcpn24_26,...,SoilMoisture30_32,SoilMoisture31_33,SoilMoisture32_34,SoilMoisture33_35,SoilMoisture34_36,SoilMoisture35_37,SoilMoisture36_38,SoilMoisture37_39,SoilMoisture38_40,Predicted_Yield
40624,03009W2,748.0,2020,16.8,9.1,6.5,17.0,22.3,22.1,21.1,...,15.235482,15.024330,15.154617,15.091999,15.760560,15.797961,15.685274,15.835196,16.251589,891.858101
29246,03115W2,748.0,2017,7.9,17.6,13.1,16.1,38.9,44.0,46.6,...,19.703098,18.753291,18.160012,16.979312,16.967661,18.719555,20.784561,21.444098,22.505679,970.838347
2513,03115W2,748.0,2010,52.0,65.0,80.7,102.3,87.0,84.5,52.5,...,25.273404,27.452747,29.750311,30.857499,34.211875,33.929237,34.615255,32.269810,31.282000,273.938887
17112,02212W2,748.0,2014,28.5,22.5,33.3,46.3,67.2,93.3,173.5,...,23.944748,23.024555,23.008966,23.793041,24.948404,26.628201,26.766013,25.853900,25.614269,642.384252
2232,02715W2,748.0,2010,52.0,65.0,80.7,102.3,87.0,84.5,52.5,...,23.729485,25.900685,28.545644,30.516925,33.451371,33.482113,33.563329,32.577507,31.869235,515.657359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13682,02712W2,748.0,2013,15.1,16.2,13.2,13.0,40.3,78.4,77.0,...,26.523946,24.373960,23.490339,23.784885,24.500236,23.392279,23.569345,26.491220,29.270419,840.181412
29250,03117W2,748.0,2017,7.9,17.6,13.1,16.1,38.9,44.0,46.6,...,20.083118,19.395305,19.420294,18.698961,18.301215,18.972054,20.003949,20.828671,23.045577,972.578097
17853,03215W2,748.0,2014,17.9,12.4,55.1,74.7,92.7,118.3,147.6,...,26.227639,25.174253,25.634867,26.523125,28.321356,29.222553,29.352997,28.956514,29.386362,660.195556
40705,03116W2,748.0,2020,18.4,16.7,8.5,27.5,44.7,53.6,43.8,...,16.784715,15.641812,15.584498,14.952186,15.879456,15.685356,15.599220,16.200028,17.269666,952.267172


In [48]:
test.number_principal_components()

Number of Principal Components is: 20


In [49]:
test.cummulative_explained_variance()

Cummulative Explained Variance is: 95.48 %


In [60]:
data_to_score=pd.read_csv("scoring_test_df.csv",index_col='Unnamed: 0')
test.score(data_to_score)

ValueError: X has 161 features, but RandomForestRegressor is expecting 28 features as input.