In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import seaborn as sns
import math
from IPython.display import Markdown, display
import importlib
import json
from tabulate import tabulate
import colorsys
from scipy.ndimage.filters import gaussian_filter
import sklearn
import matplotlib
import xgboost as xgb
import random
from tqdm import tqdm as tqdm


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import src.cropnet as cropnet


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def printmd(string):
    """Render ``string`` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)
    
# Show the installed TensorFlow version as this cell's output
# (TensorFlow is re-imported here so the cell can be run on its own).
import tensorflow as tf
tf.version.VERSION

  from scipy.ndimage.filters import gaussian_filter
2024-12-05 10:59:30.970524: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-05 10:59:30.995459: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'2.16.1'

In [2]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# On a CUDA machine this prints a CUDA device.
print(device)

cuda:0


In [3]:
# Load the processed feature table together with its companion JSON file
# (same path, ".json" extension), which holds the correlation-filtered column lists.
path = "data/full_data/processed/processed_merged.feather"
df = pd.read_feather(path)
# The context manager closes the file even when JSON parsing raises.
with open(path.replace(".feather", ".json"), "r") as f:
    correlation_filters = json.load(f)

# Drop any 'level_0' column both before and after reset_index():
# presumably the first drop keeps reset_index() from colliding with an existing
# 'level_0', and the second removes the one reset_index() may create itself.
if 'level_0' in df.columns:
    df = df.drop(columns=['level_0'])
df = df.reset_index()
if 'level_0' in df.columns:
    df = df.drop(columns=['level_0'])

# Keep only the rows whose 'yield' value is present (not NaN).
df = df[df['yield'].notna()]

In [11]:
# Column to predict.
target = 'yield_1'

# Model inputs: every column of the 'base' correlation filter except bookkeeping
# columns and anything whose name mentions "yield" or "_pid".
predictors = [
    col
    for col in correlation_filters['base']
    if col not in ('index', 'x', 'y', 'x_int', 'y_int', 'polygon_id', 'grdkod_mar', 'year')
    and "yield" not in col
    and "_pid" not in col
]


In [12]:

# Per-column mean and population standard deviation (ddof=0) of the predictors.
mX = df[predictors].mean(axis=0)
sX = df[predictors].std(axis=0, ddof=0)
# Constant columns get a unit scale so that standardising them never divides by zero.
sX = sX.mask(sX == 0, 1)

# Same statistics for the target column.
mY = df[target].mean(axis=0)
sY = df[target].std(axis=0, ddof=0)

# Bundle of normalisation statistics handed to the cropnet evaluation helpers.
norm_info = dict(
    X=dict(mean=mX, std=sX),
    y=dict(mean=mY, std=sY),
)


In [13]:

# Build one (X, y, info) sample per distinct 'matching_pid' value.
# X holds the standardised predictors, y the standardised target as a column vector;
# info keeps the group's 'x'/'y' columns (presumably coordinates) and its pid.
dataset = []
for pid in tqdm(set(df['matching_pid'])):
    # Filter the group's rows once. `pdf` already contains exactly these rows, so
    # X and y are taken from it directly instead of re-applying a full-frame mask.
    pdf = df[df['matching_pid'] == pid]
    X = torch.tensor(((pdf[predictors] - mX) / sX).values, device=device, dtype=torch.float)
    y = torch.tensor(((pdf[target] - mY) / sY).values.reshape((-1, 1)), device=device, dtype=torch.float)
    info = {
        'x': pdf.x.values,
        'y': pdf.y.values,
        'pid': pid
    }
    dataset.append((X, y, info))

100%|██████████| 152/152 [00:06<00:00, 22.60it/s]


In [14]:
def _draw_groups(pool, total_rows, fraction, rng):
    """Pop random groups from ``pool`` until they hold at least ``fraction`` of ``total_rows`` rows."""
    drawn = []
    drawn_rows = 0
    # `pool` is checked first so an exhausted pool ends the draw instead of raising.
    while pool and drawn_rows / total_rows < fraction:
        idx = rng.choice(range(len(pool)))
        group = pool.pop(idx)
        drawn.append(group)
        drawn_rows += group[0].shape[0]
    return drawn


def split_data(dataset, p_val, p_test, seed=-1):
    """Randomly split whole groups into train / validation / test sets.

    Groups are never cut: complete ``(X, y, info)`` entries are moved into the
    validation set until it holds at least ``p_val`` of all rows, then into the
    test set until it holds at least ``p_test``. Everything left is the training set.

    Parameters
    ----------
    dataset : sequence of (X, y, info)
        Only ``X.shape[0]`` (the number of rows of the group) is read.
    p_val, p_test : float
        Minimum fraction of the total row count for validation and test.
    seed : int, optional
        A non-negative value makes the split reproducible through a private
        ``random.Random(seed)``. A negative value (the default) or ``None``
        draws from the global ``random`` state.

    Returns
    -------
    (train, val, test) : tuple of lists
        New lists; ``dataset`` itself is not modified. If the groups run out
        before a requested fraction is reached, the affected split simply
        contains whatever was left.
    """
    rng = random.Random(seed) if seed is not None and seed >= 0 else random

    rem = list(dataset)
    total_rows = sum(X.shape[0] for X, y, info in dataset)
    if total_rows == 0:
        # Nothing to distribute (and no meaningful fraction to compute).
        return rem, [], []

    data_val = _draw_groups(rem, total_rows, p_val, rng)
    data_test = _draw_groups(rem, total_rows, p_test, rng)

    return rem, data_val, data_test


In [18]:
class Pointwize(torch.nn.Module):
    """Fully connected network applied to every input row independently.

    Parameters
    ----------
    sizes : sequence of int
        Layer widths, input first and output last; needs at least two entries.
        ``sizes[i] -> sizes[i+1]`` becomes one ``nn.Linear``. All but the last
        are followed by a ReLU; the last one is the linear output layer.
    dropout : float, optional
        Dropout probability applied after each hidden ReLU while the module is
        in training mode. ``0.0`` (the default) disables dropout entirely.

    Raises
    ------
    ValueError
        If ``sizes`` has fewer than two entries.
    """

    def __init__(self, sizes, dropout=0.0):
        super().__init__()

        if len(sizes) < 2:
            raise ValueError("sizes must contain at least an input and an output width")

        linears = [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)]

        # Attribute names `layers` / `output` are kept so parameter names stay stable.
        self.layers = torch.nn.ModuleList(linears[:-1])
        self.output = linears[-1]
        # Dropout owns no parameters, so the state_dict layout is unaffected.
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

    def forward(self, data):
        """Return the network output for ``data``; the last dimension must be ``sizes[0]``."""
        # getattr: instances unpickled from checkpoints written before the
        # `dropout` attribute existed must keep working.
        dropout = getattr(self, 'dropout', None)
        x = data
        for layer in self.layers:
            x = torch.relu(layer(x))
            if dropout is not None:
                x = dropout(x)
        return self.output(x)

In [19]:


def run_model(model, data_train, data_val, data_test, epochs=100, lr=0.001):
    """Train ``model`` and return the checkpoint with the lowest validation RMSE.

    Each entry of the data lists is one ``(X, y, info)`` group and is used as
    one optimisation step (Adam, MSE loss on the normalised target). After
    every epoch the model is evaluated on the validation and test groups with
    ``cropnet.eval_dataset``. Whenever the validation RMSE improves, the whole
    model is saved to ``models/current.pth`` and the test scores are recorded
    for the progress bar.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train; it is updated in place during training.
    data_train, data_val, data_test : list of (X, y, info)
        ``data_train`` is not modified; a private copy is shuffled each epoch.
    epochs : int
        Number of passes over ``data_train``.
    lr : float
        Adam learning rate.

    Returns
    -------
    torch.nn.Module
        The model reloaded from the best checkpoint written during this call,
        or ``model`` itself if no epoch produced an improvement (for example
        ``epochs == 0`` or a validation RMSE that is always NaN).

    Notes
    -----
    Relies on the notebook-level ``norm_info`` and the ``cropnet`` helpers.
    """
    import os  # local import: only needed to make sure the checkpoint folder exists

    checkpoint_path = "/".join(["models", "current.pth"])
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # Per-epoch RMSE curves; collected but currently not returned.
    history = {
        'train': [],
        'val': [],
        'test': []
    }

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    scores = {}
    top_score = float('inf')
    checkpoint_saved = False

    # Shuffle a private copy so the caller's list keeps its order.
    batches = list(data_train)

    model.train()

    pbar = tqdm(range(epochs))
    for epoch in pbar:

        random.shuffle(batches)
        # Seeded with an empty array so an empty training set still concatenates.
        epoch_errors = [np.zeros(0)]
        total_los = []
        for X, y, info in batches:
            optimizer.zero_grad()
            pred = model(X)
            # `y` already lives on the same device as `pred`; no host round trip needed.
            loss = criterion(pred, y)
            loss.backward()
            total_los.append(loss.item())
            optimizer.step()
            epoch_errors.append(cropnet.eval_error(pred, y, info, norm_info))
        err = np.concatenate(epoch_errors)
        train_rmse = np.sqrt(np.nanmean(err**2))
        history['train'].append(train_rmse)

        optimizer.zero_grad()

        # Evaluate without autograd bookkeeping and with train-only layers
        # (dropout, batch norm) in inference mode, then switch back to training.
        model.eval()
        with torch.no_grad():
            val_rmse, val_err = cropnet.eval_dataset(data_val, model, norm_info)
            test_rmse, test_err = cropnet.eval_dataset(data_test, model, norm_info)
        model.train()

        history['val'].append(val_rmse)
        history['test'].append(test_rmse)
        metric = val_rmse

        # Share of non-NaN test errors whose magnitude is below 0.5 / 1 / 2.
        test_valid = ~np.isnan(test_err)
        acc5 = np.mean((np.abs(test_err) < 0.5)[test_valid])
        acc10 = np.mean((np.abs(test_err) < 1)[test_valid])
        acc20 = np.mean((np.abs(test_err) < 2)[test_valid])

        if metric < top_score:
            top_score = metric
            torch.save(model, checkpoint_path)
            checkpoint_saved = True
            scores = {
                'test_rmse': test_rmse,
                'test_acc_0.5': acc5,
                'test_acc_1.0': acc10,
                'test_acc_2.0': acc20,
                'test_rel_rmse': test_rmse/norm_info['y']['mean'],
            }

        pbar.set_description(f'Loss: {np.round(np.mean(total_los), 3)}, train rmse: {np.round(train_rmse, 3)}, val rmse: {np.round(val_rmse, 3)}, test rmse: {np.round(test_rmse, 3)}, top: [{np.round(metric, 3)} >= {np.round(top_score, 3)} {[np.round(scores[s], 3) for s in scores]}]')

    if not checkpoint_saved:
        # Nothing was written during this call; do not pick up a stale file from an earlier run.
        return model

    # The checkpoint is a full pickled module written a few lines above by this
    # function, so it is trusted. weights_only=False is required because newer
    # PyTorch releases default to weights-only loading, which rejects it.
    return torch.load(checkpoint_path, weights_only=False)

In [23]:
import random
epochs = 100


def _score_errors(errors, y_true):
    """Return [RMSE, share of |error| < 1, mean |error / y_true|, R^2] for one error vector."""
    valid = ~np.isnan(errors)
    return [
        np.sqrt(np.nanmean(errors**2)),
        np.nanmean((np.abs(errors) < 1)[valid]),
        np.nanmean(np.abs(errors/y_true)),
        1-np.sum(errors**2)/np.sum((y_true-np.mean(y_true))**2),
    ]


# Repeat the experiment on ten independent random splits and collect the scores.
score_list = []
M = dataset[0][0].shape[1]  # number of predictor columns
for i in range(10):
    data_train, data_val, data_test = split_data(dataset, 0.15, 0.15)
    model = Pointwize([M, M//4, 64, 1])
    model.to(device)
    model = run_model(model, data_train, data_val, data_test, epochs=epochs, lr=0.0001)

    # Per-group pieces; each list starts with an empty float array so the
    # concatenation below also works for an empty test set.
    yt_parts = [np.zeros(0)]
    yp_parts = [np.zeros(0)]
    err_parts = [np.zeros(0)]
    err_norm_parts = [np.zeros(0)]
    pid_parts = [np.zeros(0)]

    # Pure inference: no autograd graph, train-only layers switched off.
    model.eval()
    with torch.no_grad():
        for X, y, info in data_test:
            pred = model(X)

            yp = pred.cpu().detach().numpy().reshape(-1)
            yt = y.cpu().detach().numpy().reshape(-1)
            # Error rescaled by the target's std (the mean cancels in the difference).
            err = (yt - yp)*norm_info['y']['std']

            yt_parts.append(yt)
            yp_parts.append(yp)
            err_parts.append(err)
            # Same error with this group's mean error removed.
            err_norm_parts.append(err - np.nanmean(err))
            pid_parts.append(np.ones(err.shape)*info['pid'])

    y_tot = np.concatenate(yt_parts)
    y_pred = np.concatenate(yp_parts)
    test_err = np.concatenate(err_parts)
    test_err_norm = np.concatenate(err_norm_parts)
    pids = np.concatenate(pid_parts)

    # NOTE(review): y_tot is mapped back to the original target scale here, but
    # y_pred is stored still normalised - confirm that consumers of the JSON expect this.
    y_tot = y_tot*norm_info['y']['std'] + norm_info['y']['mean']
    scores_unnorm = _score_errors(test_err, y_tot)
    scores_norm = _score_errors(test_err_norm, y_tot)
    result = [scores_unnorm, scores_norm, [list(y_tot), list(y_pred), list(test_err), list(test_err_norm), list(pids)]]
    score_list.append(result)


cropnet.save_score_to_json(score_list, "results_final/other_models/pointwize.json")

Loss: 0.131, train rmse: 0.821, val rmse: 1.179, test rmse: 1.069, top: [1.179 >= 1.145 [1.094, 0.424, 0.678, 0.933, 0.133]]: 100%|██████████| 100/100 [00:05<00:00, 18.60it/s]
Loss: 0.149, train rmse: 0.906, val rmse: 1.06, test rmse: 0.917, top: [1.06 >= 0.968 [0.876, 0.527, 0.81, 0.965, 0.106]]: 100%|██████████| 100/100 [00:05<00:00, 18.31it/s]  
Loss: 0.143, train rmse: 0.89, val rmse: 1.086, test rmse: 0.995, top: [1.086 >= 1.044 [0.976, 0.495, 0.762, 0.946, 0.118]]: 100%|██████████| 100/100 [00:05<00:00, 18.78it/s]
Loss: 0.146, train rmse: 0.877, val rmse: 1.155, test rmse: 0.952, top: [1.155 >= 1.116 [0.948, 0.401, 0.725, 0.963, 0.115]]: 100%|██████████| 100/100 [00:04<00:00, 20.20it/s]
Loss: 0.133, train rmse: 0.857, val rmse: 1.142, test rmse: 1.022, top: [1.142 >= 1.122 [1.017, 0.381, 0.716, 0.954, 0.123]]: 100%|██████████| 100/100 [00:05<00:00, 19.54it/s]
Loss: 0.133, train rmse: 0.813, val rmse: 1.288, test rmse: 1.104, top: [1.288 >= 1.174 [1.239, 0.342, 0.631, 0.895, 0.15]