In [50]:
# Let's import the needed packages

# Find the El Nino peaks in the signal
from scipy.signal import find_peaks

# To deal with arrays 
import xarray as xr
import pandas as pd 
pd.options.mode.chained_assignment = None
import numpy as np

# To plot the Nino 3.4 index
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# To plot the map
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 0. Useful functions

In [54]:
def get_data(Path):
    """
    Load the inputs and outputs stored in a preprocessed ``.npz`` archive.

    Path: Path of the preprocessed data you want to get. The archive must
          contain the arrays "X" (inputs) and "y" (outputs).

    Returns the tuple (X, y). The shapes of both arrays and a short preview
    of y are printed as a sanity check.
    """
    # Arrays in an .npz archive are read lazily on key access, so both are
    # extracted inside the `with` block; the file handle is closed on exit.
    with np.load(Path) as data:
        X = data["X"]
        y = data["y"]

    # Check dimensions
    print('==== GET THE DATA ====')
    print("Shape of X:", X.shape)
    print("Shape of y:", y.shape)

    # The label is built from the real preview length so it cannot drift
    # from the slice that is actually printed.
    n_preview = min(25, len(y))
    print(f"First {n_preview} elements of y:", y[:n_preview])
    return (X, y)


In [55]:
def split_scale_data(X, y, RANDOM_SPLIT=False, train_size=374):
    """
    Split (X, y) into train and test subsets, then standardize the inputs.

    X, y: Inputs and outputs
    RANDOM_SPLIT: Boolean. If True, the split is drawn by train_test_split
                  with random_state=42. If False, the first train_size
                  entries form the train set and the remaining entries
                  form the test set (original order is kept).
    train_size: number of data in the train sample (everything that is left
                goes to the test sample)

    Returns (X_train, X_test, y_train, y_test). The scaler is fitted on the
    train inputs only and then applied to both train and test inputs; the
    outputs y are returned unscaled.
    """
    if not RANDOM_SPLIT:
        # Ordered split: cut the arrays at train_size
        X_train, y_train = X[:train_size], y[:train_size]
        X_test, y_test = X[train_size:], y[train_size:]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, random_state=42
        )

    # Fit the standardization on the train inputs only, then apply it to both sets
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Check dimensions
    print("==== SPLIT & SCALE THE DATA ====")
    subsets = (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
    for subset_name, subset in subsets:
        print(f"Shape of {subset_name}:", subset.shape)

    return (X_train, X_test, y_train, y_test)


# 1. One-month prediction

In [56]:
# 1-month prediction: load the preprocessed dataset, then split it in order
# (no shuffling) and standardize the inputs
X, y = get_data("data/preprocessed_data/nino_dataset_1m.npz")
X_train, X_test, y_train, y_test = split_scale_data(
    X,
    y,
    RANDOM_SPLIT=False,
    train_size=374,
)

==== GET THE DATA ====
Shape of X: (474, 64800)
Shape of y: (474,)
First 5 elements of y: [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
==== SPLIT & SCALE THE DATA ====
Shape of X_train: (374, 64800)
Shape of y_train: (374,)
Shape of X_test: (100, 64800)
Shape of y_test: (100,)


The problem here is that we have very few samples (474) compared with the dimension of the data (64,800 features per sample).
**=> High risk of overfitting**

Let us first try a simple model so that we have a baseline.

## 1.1. The baseline

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score, accuracy_score

In [40]:
# Baseline classifier: build the LDA model and fit it on the train data
# (fit returns the estimator itself, so the call can be chained)
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Predicted labels for the test data
y_test_pred_lda = lda.predict(X_test)

# Test metrics: accuracy is computed from the predicted labels, the AUC from
# the second column of predict_proba
accuracy_score_lda = accuracy_score(y_test, y_test_pred_lda)
roc_auc_score_lda = roc_auc_score(y_test, lda.predict_proba(X_test)[:, 1])

# Print the results
print(f"Accuracy for test data: {accuracy_score_lda:.3f}")
print(f"     AUC for test data: {roc_auc_score_lda:.3f}")

Accuracy for test data: 0.860
     AUC for test data: 0.947
