In [None]:
# MODEL 4: random forest with tuning, using all predictors
#
# Runs a randomized hyperparameter search over a preprocessing + random-forest
# pipeline and reports the selected parameters and the test-set accuracy.
# numerical_cols, categorical_cols, X_train, y_train, X_test and y_test are
# expected to be defined by earlier cells.

# set predictors
numerical_cols_all = numerical_cols
categorical_cols_all = categorical_cols

# set tuning space
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Number of trees in random forest
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so the old list held a
# single effective value); it was deprecated in scikit-learn 1.1 and removed
# in 1.3, where it makes the fit fail. 'log2' gives a genuine second option.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Maximum number of levels in tree
max_depth.append(None) # None lets trees grow until the leaf/split constraints stop them
min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node
bootstrap = [True, False] # Method of selecting samples for training each tree

# Create the random grid.
# Keys follow the pipeline's step names: '<step>__<param>', nested through the
# ColumnTransformer ('preprocessor') -> 'num' transformer -> 'imputer'.
random_grid = {'preprocessor__num__imputer__strategy': ['mean', 'median'],
               'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}

# preprocessing pipelines for both numeric and categorical data
# (the numeric imputation strategy is left at its default here because the
# search overrides it through random_grid)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_all = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_all),
        ('cat', categorical_transformer, categorical_cols_all)])

# Append classifier to preprocessing pipeline.
# The forest is seeded so that every candidate (and the refit best model) is
# reproducible; seeding only the search fixes which candidates are drawn, not
# how each forest is grown.
pipeline_all = Pipeline(steps=[('preprocessor', preprocessor_all),
                      ('rf', RandomForestClassifier(random_state=42))])

# 100 sampled candidates x 3 folds; this cell is expensive to run.
rf_all_tuned = RandomizedSearchCV(estimator = pipeline_all, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_all_tuned.fit(X_train, y_train)

print(rf_all_tuned.best_params_)

# Note: this is the held-out test accuracy of the refit best candidate from
# the randomized search (not the cross-validated score).
print(("best accuracy from grid search, with all features: %.3f"
       % rf_all_tuned.score(X_test, y_test)))

Best hyperparameters: {'rf__n_estimators': 1600, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_features': 'sqrt', 'rf__max_depth': None, 'rf__bootstrap': False, 'preprocessor__num__imputer__strategy': 'mean'}

With all predictors, the tuned model has an accuracy of 0.88; this is slightly higher than the untuned model's accuracy of 0.84, but the gain from tuning when using all predictors is smaller than the gain when avoiding geographic predictors.

What if we used geographic predictors?

In [None]:

# use it to make predictions of irrigation growth in other years
# use only predictors that appear in all years
#
# Refits a single random forest with fixed hyperparameters (the values reported
# as best by the randomized search) on the reduced predictor set, then prints
# its test accuracy. mod_ts_cols_allyears, phenospectral_cols, missingness_cols,
# categorical_cols and the train/test splits come from earlier cells.

# set predictors
numerical_cols_allyears = list(mod_ts_cols_allyears) + phenospectral_cols + missingness_cols
# NOTE: this rebinds numerical_cols_all, which the tuning cell set to the full
# numeric predictor list; later cells see the all-years subset.
numerical_cols_all = numerical_cols_allyears
categorical_cols_all = categorical_cols

# preprocessing pipelines for both numeric and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_all = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_all),
        ('cat', categorical_transformer, categorical_cols_all)])

# Append classifier to preprocessing pipeline.
# random_state pins the per-split feature sampling so the reported accuracy and
# the yearly predictions made with this model are reproducible; n_jobs=-1 only
# parallelises tree building/prediction and does not change the results.
tuned_rf = RandomForestClassifier(n_estimators = 1600, min_samples_split = 2,
                                   min_samples_leaf = 1, max_features = 'sqrt', max_depth = None,
                                   bootstrap = False, random_state = 42, n_jobs = -1)

pipeline_all = Pipeline(steps=[('preprocessor', preprocessor_all),
                      ('rf', tuned_rf)])

# training
pipeline_all.fit(X_train, y_train)

# prediction (mean accuracy on the held-out test split)
acc_all = pipeline_all.score(X_test, y_test)
print(acc_all)

In [None]:
# predictions in new years - using all predictors
#
# For every year in years_list, loads that year's prediction points, applies
# the fitted pipeline_all, and records the mean predicted label nationally and
# per region. The result is one row per year in predicted_irrig_all.
# NOTE(review): the means are only "share irrigated" if the predicted labels
# are 0/1 - confirm against the target encoding.

# data frame to save predicted percent irrigation in each region
columns_list = ['year', 'Brazil', 'SUL', 'CENTRO-OESTE', 'SUDESTE', 'NORDESTE', 'NORTE']
# The regional columns double as the values looked up in data_year.region.
region_names = columns_list[2:]

# 2014 is deliberately absent from this range.
years_list = list(range(2004, 2014)) + list(range(2015, 2019))

# Same predictor list for every year, so build it once outside the loop.
chosen_predictors = numerical_cols_all + categorical_cols_all

# Collect plain rows and build the DataFrame once at the end; appending to an
# empty frame with .loc inside the loop is slow and can leave object-dtype columns.
yearly_rows = []
for year in years_list:
    data_year = pd.read_csv('predictionpts_cleaned_' + str(year) + '.csv', index_col = 0)
    X_year = data_year[chosen_predictors]

    prediction = pipeline_all.predict(X_year)
    data_year['prediction'] = prediction

    # mean prediction by region; reindex fixes the column order and yields NaN
    # for any region with no points in this year (as an empty filter would)
    regional_means = (data_year.groupby('region')['prediction']
                      .mean()
                      .reindex(region_names))

    yearly_rows.append([year, prediction.mean()] + regional_means.tolist())
    print(year)

predicted_irrig_all = pd.DataFrame(yearly_rows, columns = columns_list)

print(predicted_irrig_all)

In [None]:
# plot predictions

# Reshape the wide year-by-region table into long form: one row per
# (year, region) pair, which is the layout seaborn's hue grouping expects.
tidy_predictions_all = predicted_irrig_all.melt(
    id_vars = ['year'],
    var_name = 'region',
    value_name = 'percent_irrigated',
)

# One line per region; work on the Axes seaborn returns rather than on
# pyplot's implicit "current axes".
trend_ax = sns.lineplot(
    data = tidy_predictions_all,
    x = 'year',
    y = 'percent_irrigated',
    hue = 'region',
)
trend_ax.set_title('Percent irrigation, predicted; using geographic features')
# Anchor the legend just outside the right edge so it does not cover the lines.
trend_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

In [None]:
Although the classifier that is trained with geographic information has a high test accuracy, it doesn't reflect the time trends in irrigation expansion as well as a classifier that doesn't include geographic information. The percent irrigation predicted across time looks very similar to what was observed in 2014, indicating that the geographic spread of irrigation in 2014 has an overly strong pull on predicted irrigation in other years.