# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import confusion_matrix
import tensorflow as tf

# Read data


In [None]:
# Load the full measurement table. The path is relative, so it is resolved
# against the current working directory.
data_file = 'data.csv'
data = pd.read_csv(data_file)

# Columns used as model input features (they are standardized and fed to the
# network in exactly this order).
# NOTE(review): 'Temp_[Â°C]' looks like a mis-decoded '°C'. The string has to
# match the CSV header exactly, so confirm the file's encoding before editing it.
col_names = [
    'year',
    'month',
    'Longitude_[deg]',
    'Latitude_[deg]',
    'Depth_[m]',
    'Temp_[Â°C]',
    'gradientD_T',
]

# Preprocessing


In [None]:
# Exclude year 2006 entirely: a lot of bad data is located in that year, and
# keeping it would skew the class balance.
data = data[data['year'] != 2006]

# Rows flagged as suspect gradient (flag 2) or spike (flag 4), which are the
# only two failure types we want the model to learn.
#   Sebastian: rows flagged 2 or 4 in the QF_trad2 column.
#   Mohamed:   rows flagged 2, 4, 12 or 14 in the QF column.
# Both are taken BEFORE the flag-3 rows are removed below, as in the original
# workflow.
Sebastian = data[data['QF_trad2'].isin([2, 4])]
Mohamed = data[data['QF'].isin([2, 4, 12, 14])]

# The density inversion check (flag 3) is applied after the suspect gradient
# check (flag 2), so rows carrying flag 3 are dropped from the training data.
data = data[~((data['QF_trad2'] == 3) | (data['QF'] == 3))]

# Keep every row of each profile that contains at least one flagged row in
# either selection. The explicit .copy() gives data2_4 its own memory so the
# assignments below really modify it (no chained assignment on a slice, which
# triggers SettingWithCopyWarning and is a no-op under pandas Copy-on-Write).
flagged_profiles = np.union1d(Mohamed['Prof_no'].unique(), Sebastian['Prof_no'].unique())
data2_4 = data[data['Prof_no'].isin(flagged_profiles)].copy()

# Binary target: 1 for suspect gradient / spike flags, 0 for everything else.
# Building the label with isin() also guarantees that a row whose original QF
# value happened to be 1 is not mistaken for a positive sample.
data2_4['QF'] = data2_4['QF'].isin([2, 4, 12, 14]).astype('int64')

# Remove rows holding the -999 sentinel in any feature column, then rows with
# a NaN in any column.
has_sentinel = (data2_4[col_names] == -999).any(axis=1)
data2_4 = data2_4[~has_sentinel].dropna()


# Model Training


In [None]:
# Each seed produces a different profile-level train/validation/test split, so
# training one model per seed shows how sensitive performance is to the split.
seeds = [555, 187, 29]

# Training hyper-parameters, identical for every seed.
epochs = 150
batch_size = 512

for model_nb, seed in enumerate(seeds, start=1):
    print(f"Training Model {model_nb} with seed {seed}")

    # Split by profile number (70% train / 15% validation / 15% test) so that
    # all rows of one profile end up in the same subset.
    Prof_no = data2_4.Prof_no.unique()
    np.random.seed(seed)
    np.random.shuffle(Prof_no)
    n_profiles = len(Prof_no)
    train_no, validate_no, test_no = np.split(
        Prof_no, [int(n_profiles * 0.7), int(n_profiles * 0.85)]
    )

    train_data = data2_4[data2_4['Prof_no'].isin(train_no)]
    validation_data = data2_4[data2_4['Prof_no'].isin(validate_no)]
    test_data = data2_4[data2_4['Prof_no'].isin(test_no)]

    # Standardization: the scaler is fitted on the training rows only and then
    # applied unchanged to the validation and test rows.
    scaler = StandardScaler()
    scaler.fit(train_data[col_names].values)
    train_scaled_features = scaler.transform(train_data[col_names].values)
    val_scaled_features = scaler.transform(validation_data[col_names].values)
    test_scaled_features = scaler.transform(test_data[col_names].values)

    # Define X and y
    X_train, y_train = train_scaled_features, train_data['QF']
    X_val, y_val = val_scaled_features, validation_data['QF']
    X_test, y_test = test_scaled_features, test_data['QF']

    # Build the model: four ReLU hidden layers with dropout, followed by a
    # single sigmoid unit that outputs the probability of the flagged class.
    input_dim = X_train.shape[1]
    model = Sequential([
        Dense(512, kernel_initializer='glorot_normal', input_dim=input_dim, activation='relu'),
        Dropout(0.1),
        Dense(256, kernel_initializer='glorot_normal', activation='relu'),
        Dropout(0.1),
        Dense(128, kernel_initializer='glorot_normal', activation='relu'),
        Dropout(0.2),
        Dense(64, kernel_initializer='glorot_normal', activation='relu'),
        Dropout(0.2),
        Dense(1, kernel_initializer='glorot_normal', activation='sigmoid'),
    ])

    model.summary()

    # Class weights address class imbalance: when one class has far more
    # samples than the other, an unweighted model tends to favour the majority
    # class and perform poorly on the minority class. Giving minority-class
    # samples a higher weight (and majority-class samples a lower one) makes
    # the loss pay more attention to the minority class, which leads to a more
    # balanced decision boundary.
    # sklearn's 'balanced' mode sets each weight inversely proportional to the
    # class frequency in y_train.
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    # Plain Python ints/floats keep the mapping independent of numpy scalar types.
    class_weight = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

    # Train the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # One checkpoint file per seed; a fixed file name would let every run
    # overwrite the best weights of the previous one.
    filepath = f"model_checkpoint_seed_{seed}.h5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        validation_data=(X_val, y_val),
        shuffle=True,
        callbacks=callbacks_list,
        class_weight=class_weight,
    )

    # Save model and history
    model.save(f'new_model_full_{model_nb}_seed_{seed}_binary_crossentropy.h5')
    np.save(f'new_model_full_{model_nb}_seed_{seed}_history_binary_crossentropy.npy', history.history)

    # Evaluate the model.
    # NOTE: `model` holds the weights of the last epoch here; the weights with
    # the lowest validation loss are the ones stored in the checkpoint file.
    print("Evaluating Model on Test Data")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
    # Sequential.predict_classes was removed from Keras/TensorFlow; for a
    # single sigmoid output it is equivalent to thresholding predict() at 0.5.
    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype("int32").ravel()
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)
