In [None]:
!pip install pygam

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
from xgboost import XGBRegressor
import os
from glob import glob
import matplotlib.pyplot as plt
from skimage import data
from scipy.stats import gmean

# GAM
from pygam import LinearGAM, s, f, PoissonGAM,te

In [None]:
# Paths to the training ground-truth CSV and the wavelengths CSV.
gt_path, wavelength_path = (
    '../input/esahyber/train_data/train_data/train_gt.csv',
    '../input/esahyber/train_data/train_data/wavelengths.csv',
)

In [None]:
# Path to a single .npz sample (number 1000) of the training set
# (presumably one hyperspectral image, going by the variable name).
hsi_path = '../input/esahyber/train_data/train_data/train_data/1000.npz'

# Read both metadata tables; the order of the paths matches the assignment targets.
gt_df, wavelength_df = (
    pd.read_csv(csv_path) for csv_path in (gt_path, wavelength_path)
)

## Load the data

In [None]:
def load_gt(file_path: str):
    """Read the soil-property targets from the ground truth file.

    Args:
        file_path (str): Location of the ground truth .csv file.
    Returns:
        numpy.ndarray: 2D array with one row per row of the file and the
        columns P, K, Mg and pH, in that order.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    ground_truth = pd.read_csv(file_path)
    return ground_truth[target_columns].values


In [None]:
# Upstream model predictions for the train set and for the test set
# (the test file name suggests a geometric-mean ensemble of several models).
tr_preds, te_preds = map(
    pd.read_csv,
    (
        '../input/postprocess-gam/train_preds.csv',
        '../input/postprocess-gam/submissions_gmean_cat_rf_lgbm_xgbm_et.csv',
    ),
)

# Ground-truth targets as a DataFrame with one named column per soil property.
y = pd.DataFrame(
    load_gt("../input/esahyber/train_data/train_data/train_gt.csv"),
    columns=["P", "K", "Mg", "pH"],
)

In [None]:
# Column order shared by the feature matrices and the target vectors.
_target_cols = ['P', 'K', 'Mg', 'pH']

# Feature matrices: the four upstream predictions per sample (train, then test).
X, X_te = (np.array(frame[_target_cols]) for frame in (tr_preds, te_preds))

# One 1-D ground-truth vector per soil property.
y_p, y_K, y_Mg, y_pH = (np.array(y[col]) for col in _target_cols)


### Post-processing: refine the upstream predictions with per-target GAMs

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import random

In [None]:
# Evaluation metric
class BaselineRegressor:
    """
    Baseline regressor, which calculates the mean value of the target from the training
    data and returns it for each testing sample.

    Supports a single target (1-D ``y_train``) as well as several targets
    (2-D ``y_train`` with one column per target).
    """
    def __init__(self):
        # Defaults so that predict() is usable before fit(): it then returns
        # a single column of zeros.
        self.mean = 0
        self.classes_count = 1

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        """Store the per-target mean of ``y_train``.

        Args:
            X_train: Training features; unused, accepted for API symmetry.
            y_train: Training targets, shape (n,) or (n, n_targets).
        Returns:
            BaselineRegressor: ``self``, to allow call chaining.
        """
        y_train = np.asarray(y_train)
        self.mean = np.mean(y_train, axis=0)
        # One output column per target; a 1-D target vector counts as one column.
        self.classes_count = y_train.shape[1] if y_train.ndim > 1 else 1
        return self

    def predict(self, X_test: np.ndarray):
        """Return the stored mean for every row of ``X_test``.

        Returns:
            np.ndarray: Array of shape (len(X_test), classes_count).
        """
        return np.full((len(X_test), self.classes_count), self.mean)


In [None]:
def EvaluationMetric(baseline_reg, x_val, y_val, val_preds):
    """Score predictions as their MSE divided by the baseline model's MSE.

    Baseline predictions and ``val_preds`` are first brought to the shape of
    ``y_val`` when they differ from it only by singleton axes (e.g. ``(n, 1)``
    vs ``(n,)``). Without this, NumPy would silently broadcast such a pair to
    an ``(n, n)`` matrix and the resulting score would be wrong.

    Args:
        baseline_reg: Fitted object exposing ``predict(x_val)``.
        x_val: Validation features, passed to ``baseline_reg.predict``.
        y_val: Validation targets.
        val_preds: Model predictions for the validation samples.
    Returns:
        MSE ratio reduced over axis 0: a scalar for 1-D targets, otherwise one
        value per target column. A baseline MSE of zero yields inf/nan, per
        NumPy division semantics.
    """
    y_true = np.asarray(y_val)

    # squeeze() first keeps the previous handling of (n, 1) baseline output in
    # the cases where the shapes cannot be matched to the targets.
    baseline_predictions = np.asarray(baseline_reg.predict(x_val)).squeeze()
    baseline_predictions = _align_to_targets(baseline_predictions, y_true)
    val_preds = _align_to_targets(np.asarray(val_preds), y_true)

    baselines = np.mean((y_true - baseline_predictions) ** 2, axis=0)
    mse = np.mean((y_true - val_preds) ** 2, axis=0)
    score = mse / baselines

    return score


def _align_to_targets(values, targets):
    """Reshape ``values`` to ``targets.shape`` if they differ only by singleton axes."""
    if np.squeeze(values).shape == np.squeeze(targets).shape:
        return values.reshape(targets.shape)
    # Genuinely different shapes: leave untouched and let NumPy broadcast or raise.
    return values

In [None]:
def regressor(x_train, y_train, x_val, y_val, X_test):
    """Fit a GAM on one train/validation split and predict the test set.

    The model is a LinearGAM with one ``s`` term for each of the four input
    columns and one ``te`` term for each of the six column pairs, tuned with
    ``gridsearch`` (default search settings) on the training split.

    Args:
        x_train: Training features (columns 0-3 are used by the model terms).
        y_train: Training target values.
        x_val: Validation features.
        y_val: Validation target values.
        X_test: Test features to predict with the fitted model.
    Returns:
        tuple: ``(score, te_preds)`` - the EvaluationMetric value on the
        validation split and the model's predictions for ``X_test``.
    """
    model_terms = (
        s(0) + s(1) + s(2) + s(3)
        + te(0, 1) + te(0, 2) + te(0, 3)
        + te(1, 2) + te(1, 3) + te(2, 3)
    )
    model = LinearGAM(model_terms)
    model.gridsearch(x_train, y_train)

    # Reference model for the metric: predicts the training-target mean.
    mean_baseline = BaselineRegressor().fit(x_train, y_train)

    fold_score = EvaluationMetric(mean_baseline, x_val, y_val, model.predict(x_val))
    return fold_score, model.predict(X_test)

In [None]:
np.random.seed(2022)

kf = KFold(n_splits =5,shuffle=True,random_state=2022)

# (label printed in the log, target vector). The order fixes the prediction
# columns: 0 = P, 1 = K, 2 = Mg, 3 = pH.
fold_targets = (('P', y_p), ('k', y_K), ('Mg', y_Mg), ('pH', y_pH))

final_scores = []       # mean score over the targets, one entry per fold
final_predictions = []  # test predictions (n_test, n_targets), one array per fold


for i, (tr_index, val_index) in enumerate(kf.split(X, y_p)):
    print(f'######### FOLD {i+1} / {kf.n_splits}')
    scores = []
    preds = np.zeros((X_te.shape[0], len(fold_targets)))

    # The feature split is the same for every target within a fold.
    x_train, x_val = X[tr_index], X[val_index]

    for col, (label, target) in enumerate(fold_targets):
        y_train, y_val = target[tr_index], target[val_index]
        score, target_preds = regressor(x_train, y_train, x_val, y_val, X_te)
        scores.append(score)
        print(f'{label} Score: {score}')
        preds[:, col] = target_preds

    final_score = np.mean(scores)
    print(f'Overall score: {final_score} ')

    final_scores.append(final_score)
    final_predictions.append(preds)


print('mean scores: {} '.format(np.mean(final_scores)))

In [None]:
# Combine the per-fold test predictions into one array: (n_folds, n_test, n_targets).
fold_predictions = np.stack(final_predictions)

# Geometric mean over the folds. It is only meaningful for strictly positive
# values, and LinearGAM predictions are not constrained to be positive, so the
# NaN/0 results are silenced here and repaired below.
with np.errstate(divide='ignore', invalid='ignore'):
    final_test_predictions = gmean(fold_predictions, axis=0)

non_positive = (fold_predictions <= 0).any(axis=0)
if non_positive.any():
    print(f'Warning: {int(non_positive.sum())} test predictions have a non-positive '
          'fold value; using the arithmetic mean for them.')
    final_test_predictions = np.where(
        non_positive, fold_predictions.mean(axis=0), final_test_predictions
    )

submission = pd.DataFrame(data = final_test_predictions, columns=["P", "K", "Mg", "pH"])
submission.to_csv("./PostProcessing_GAM_gmean.csv", index_label="sample_index")