# Harvest Regressions using Satellite Imagery

### Libraries

In [None]:
import os
import pandas as pd
import geopandas as gpd
import geowombat as gw

import matplotlib.pyplot as plt
import numpy as np
from geowombat.ml import fit, predict, fit_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE

### Declare relevant dirs and files

In [None]:
# Input locations, given relative to the notebook's working directory.
tif_dir, harvest_dir = (
    "./data/processed/tif",          # walked recursively for band rasters (*.tif)
    "./data/processed/csv/harvest",  # walked recursively for harvest tables (*.csv)
)

### Categorize and store all band TIFs

In [None]:
# Index every band raster found under `tif_dir` by the year encoded in its
# file name. Result:
#   years[<int year>] = {'file_names': [<path>, ...], 'band_names': [<name>, ...]}
# where the two lists are parallel (entry i of each describes the same file).
#
# NOTE(review): the parsing below presumably targets names shaped like
# `<prefix>__<band>__<year>.tif` -- confirm against the files on disk. A
# ".tif" file without "_" or "__" raises ValueError, as it did before.
years = {}
for (root, dirs, files) in os.walk(tif_dir):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # (dirs in place, so the traversal itself is ordered) makes the band order
    # inside each year and the year insertion order reproducible across
    # machines; later cells rely on positional order.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".tif"):
            continue

        # The year is the text between the last "_" and the extension dot.
        year = int(file[file.rindex("_") + 1:file.rindex(".")])

        if year not in years:
            years[year] = {'file_names': [], 'band_names': []}

        # Register the file path and its band name under that year.
        years[year]['file_names'].append(os.path.join(root, file))
        # The band name is the text between the first and the last "__".
        years[year]['band_names'].append(
            file[file.index("__") + 2:file.rindex("__")])

### For each year, open all bands into one stack and store it

In [None]:
# Open each year's band rasters as a single array stacked along the 'band'
# dimension, labelled with the band names collected for that year, and keep
# the result under years[year]['stack'].
for year, entry in years.items():
    # NOTE(review): the stack is stored and used after the `with` block has
    # exited -- confirm geowombat keeps the data readable once the context
    # manager has closed.
    with gw.open(entry['file_names'], stack_dim='band',
                 band_names=entry['band_names']) as stack:
        entry['stack'] = stack

### Read in all harvest data

In [None]:
# Load every CSV found (recursively) under `harvest_dir` into its own
# DataFrame; the frames are collected in `harvest_dfs`.
harvest_dfs = []
for (root, dirs, files) in os.walk(harvest_dir):
    # os.walk order is filesystem-dependent; sort so the frames (and therefore
    # the row order of the combined table) are loaded in a reproducible order.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".csv"):
            harvest_dfs.append(pd.read_csv(os.path.join(root, file)))

### Combine all harvest data into one DataFrame

In [None]:
# Stack the per-file frames row-wise into a single table. The original
# per-file row labels are discarded in favour of a fresh 0..n-1 index.
harvest = pd.concat(
    objs=harvest_dfs,
    axis=0,
    ignore_index=True,
)

### Convert DataFrame to GeoDataFrame

In [None]:
# Turn each harvest record into a point feature: x is taken from the
# 'Longitude' column and y from the 'Latitude' column, and the coordinates are
# declared to be in EPSG:4326.
lon, lat = harvest['Longitude'], harvest['Latitude']
geometry = gpd.points_from_xy(x=lon, y=lat)
harvest = gpd.GeoDataFrame(harvest, geometry=geometry, crs='EPSG:4326')

### Add year attribute to each row

In [None]:
# Derive an integer 'Year' column from the last four characters of each 'Date'
# value. NOTE(review): this assumes every date string ends with a four-digit
# year -- confirm against the CSV date format.
harvest['Year'] = harvest['Date'].map(lambda stamp: int(stamp[-4:]))

### Add harvest data to corresponding year and product category

In [None]:
# For every year that has imagery, split that year's harvest records by
# product and store the per-product GeoDataFrames in years[year]['Products'].
for year in years:
    # `drop` returns a new frame. The previous in-place drop operated on a
    # boolean-mask slice of `harvest`, which triggers pandas'
    # SettingWithCopy / chained-assignment behaviour. The helper column
    # 'Year' is only needed for this selection, so it is removed here.
    gdf = harvest[harvest['Year'] == year].drop(columns=['Year'])

    # One frame per distinct product, in order of first appearance.
    years[year]['Products'] = [
        gdf[gdf['Product'] == product]
        for product in gdf['Product'].unique()
    ]

### Perform data extraction of the appropriate stack on each year/product AOI

In [None]:
# Sample each year's band stack at the harvest points of every product and
# collect the results. Each entry of `extracted` is a dict:
#   {'Year': <year>, 'Product': <product of the first row>,
#    'Data': <band columns followed by the yield column>}
# Year/product combinations for which extraction returns no rows are skipped.
extracted = []
for year, entry in years.items():
    stack = entry['stack']

    # The band list only depends on the year, so compute it once per year.
    bands = stack.band.values.tolist()

    # Columns kept for analysis: every band plus the yield column (last).
    # Build a new list instead of aliasing `bands`, so the list passed as
    # `band_names` is never mutated.
    num = bands + ['Yld Mass(Dry)(lb/ac)']

    for gdf in entry['Products']:
        # Extract the stack values at this product's point locations.
        sampled = gw.extract(stack, gdf, band_names=bands)

        if len(sampled.index) > 0:
            # Add data to the list together with its metadata.
            extracted.append({'Year': year,
                              'Product': sampled['Product'].values[0],
                              'Data': sampled[num]})

### Create a random forest object to hold the decision trees

In [None]:
# Base regressor for the grid search. Trees are grown on bootstrap samples
# using the squared-error split criterion; an out-of-bag score is requested
# (this needs bootstrap=True) and n_jobs=-1 asks for all available cores.
rf = RandomForestRegressor(
    n_jobs=-1,
    oob_score=True,
    bootstrap=True,
    criterion="squared_error",
)

### Define the hyperparameter grid and the cross-validated grid search

In [None]:
# Candidate values explored by the grid search (every combination is tried).
hyperparameter_space = {
    # Maximum tree depth; None lets the trees grow without a depth limit.
    'max_depth': [None, 4, 6, 8, 10, 12, 15, 20],
    # Minimum number of samples required in a leaf.
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
    # Features considered per split. 1.0 must be a float (the fraction "all
    # features"); the string '1.0' is not an accepted value and makes every
    # fit that uses it fail.
    'max_features': [1.0, 'sqrt', 'log2'],
}


# Exhaustive search over `hyperparameter_space` with 5-fold cross-validation.
# Candidates are ranked by negative mean squared error (higher is better), and
# training-fold scores are recorded alongside the validation scores.
gs = GridSearchCV(
    estimator=rf,
    param_grid=hyperparameter_space,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

### Build the feature matrix and target, then split into training and test sets

In [None]:
# Build the modelling table from one extracted year/product dataset.
# NOTE(review): the dataset is chosen by position (index 2) in `extracted`;
# confirm this is the intended year/product combination.
data = extracted[2]['Data']
target_col = 'Yld Mass(Dry)(lb/ac)'

# Rows without a measured yield cannot be used for supervised training. Drop
# those rows rather than the whole target column.
data = data.dropna(subset=[target_col])

# Features: every band column. Bands with any missing value are discarded.
# `dropna` returns a new frame, so no slice is modified in place.
X_gdf = data.drop(columns=[target_col]).dropna(axis=1)

# Target: the yield column only. Previously the first band column was also
# included in the target, leaking a feature into what the model predicts.
y_gdf = data[target_col]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_gdf, y_gdf, test_size=0.30, random_state=50)

### Fit the grid search and evaluate the best model on the test set

In [None]:
# Run the cross-validated grid search on the training split.
gs.fit(X_train, y_train)

print("Optimal hyperparameter combination: ", gs.best_params_)
# best_score_ is the mean cross-validated *negative* MSE of the best candidate;
# negate it and take the square root to report it in the target's units.
print("Root of the mean cross-validated MSE of the best_estimator: ",
      np.sqrt(-gs.best_score_))

# The grid search was built with the default refit behaviour, so
# best_estimator_ is already fitted on the full training split with the best
# parameters -- fitting it a second time is unnecessary.
y_pred = gs.best_estimator_.predict(X_test)

# Root mean squared error on the held-out test split.
print("Test score: ", np.round(np.sqrt(MSE(y_test, y_pred)), 2))

# Coefficient of determination (R^2) on the held-out test split, computed
# from the same predictions.
print(r2_score(y_test, y_pred))