# 3. Knowledge Embedding Based on CGAN-Bi-LSTM

# 3.1 MLR-in

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import (classification_report, confusion_matrix, precision_recall_curve,
                             average_precision_score, roc_curve, auc, mean_absolute_error, log_loss)
from tensorflow.keras.layers import Bidirectional

# Global Matplotlib styling shared by every figure in this cell:
# Times New Roman, bold text, and explicit sizes for titles, labels, ticks and legends.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
    'font.size': 12,
    'font.weight': 'bold',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.title_fontsize': 12,
})

# Load the CGAN-augmented Well B table.
data = pd.read_csv(r'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/Well_B_cgan_best.csv')

# MLR-in variant: the MLR column is fed to the network as one more input feature.
X = data[[
    'WellType', 'WellDepth', 'TVD', 'Layer', 'Lithology', 'FormationStructure',
    'WOB', 'RPM', 'TOR', 'PumpPressure', 'Displacement', 'Density', 'ECD',
    'HookLoad', 'ROP', 'DrillTime', 'DC', 'LagTime', 'OutletFlow',
    'FormationPressureGradient', 'FormationRupturePressureGradient',
    'TheoreticalMaximumDisplacement', 'MLR',
]]
y = data['LostCirculation']

# Rescale every column to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including those that later
# serve as validation folds.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# 70/30 hold-out split. The cross-validation loop rebinds X_train / y_train,
# so only X_test / y_test keep the values assigned here.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Five shuffled folds; one metrics row per fold and split is collected in performance_metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
performance_metrics = []

fold = 1  # 1-based fold counter used in figure titles and file names
# The helpers below do not depend on the fold, so they are defined once
# instead of being re-created on every iteration.

def get_metrics(y_true, y_pred, y_prob):
    """Collect the evaluation metrics reported for one split of one fold.

    Args:
        y_true: Ground-truth binary labels (0 = normal, 1 = lost).
        y_pred: Hard 0/1 predictions obtained by thresholding ``y_prob``.
        y_prob: Predicted probability of class 1; any shape that flattens
            to one value per sample.

    Returns:
        dict mapping metric name to value: accuracy, class-1 precision,
        recall and F1, MAE of the hard predictions, log-loss, average
        precision and ROC AUC.
    """
    # model.predict returns an (n, 1) column; hand the scorers a 1-D vector.
    y_prob = np.ravel(y_prob)
    report = classification_report(y_true, y_pred, output_dict=True, labels=[0, 1])
    positive = report['1']
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {
        'Accuracy': report['accuracy'],
        'Precision': positive['precision'],
        'Recall': positive['recall'],
        'F1-score': positive['f1-score'],
        'MAE': mean_absolute_error(y_true, y_pred),
        # labels keeps log_loss defined even if a split holds a single class.
        'Log-Loss': log_loss(y_true, y_prob, labels=[0, 1]),
        'AP': average_precision_score(y_true, y_prob),
        'AUC': auc(fpr, tpr)
    }


def plot_confusion_matrix(cm, title, ax):
    """Draw a 2x2 confusion matrix on ``ax`` as a heat map with cell counts."""
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    ax.set_xticks(np.arange(2))
    ax.set_yticks(np.arange(2))
    ax.set_xticklabels(['Normal', 'Lost'])
    ax.set_yticklabels(['Normal', 'Lost'])
    # Cells above half the maximum count get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")


def plot_curve(y_true, y_prob, curve_type, ax, label, color):
    """Add a ROC (``'roc'``) or precision-recall (``'pr'``) curve to ``ax``.

    A dashed reference line is drawn as well: the diagonal for ROC, the
    positive-class ratio for PR. Any other ``curve_type`` draws nothing.
    """
    if curve_type == 'roc':
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        ax.plot(fpr, tpr, color=color, lw=2, label=f'{label} (AUC = {auc(fpr, tpr):.4f})')
        ax.plot([0, 1], [0, 1], linestyle='--', color='red', lw=2)
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])

    elif curve_type == 'pr':
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        ap = average_precision_score(y_true, y_prob)
        ax.plot(recall, precision, color=color, lw=2, label=f'{label} (AP = {ap:.4f})')
        positive_ratio = np.sum(y_true) / len(y_true)
        ax.plot([0, 1], [positive_ratio, positive_ratio], linestyle='--', color='gray', lw=2)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])


# 5-fold cross-validation: a fresh Bi-LSTM is trained per fold. The held-out
# fold is labelled 'Test' in the metrics table and in the figures.
for train_index, val_index in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Each row becomes a sequence with a single timestep: (samples, 1, features).
    X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_val_lstm = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

    model = Sequential()
    model.add(Bidirectional(LSTM(64, activation='relu'), input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_lstm, y_train, epochs=100, batch_size=64, verbose=1)

    y_train_prob = model.predict(X_train_lstm)
    y_val_prob = model.predict(X_val_lstm)
    y_train_pred = (y_train_prob > 0.5).astype(int).flatten()
    y_val_pred = (y_val_prob > 0.5).astype(int).flatten()

    # Fix the label set so the matrix is always 2x2, matching the tick labels.
    cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
    cm_val = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

    train_metrics = get_metrics(y_train, y_train_pred, y_train_prob)
    val_metrics = get_metrics(y_val, y_val_pred, y_val_prob)

    performance_metrics.append({'Fold': fold, 'Dataset': 'Train', **train_metrics})
    performance_metrics.append({'Fold': fold, 'Dataset': 'Test', **val_metrics})

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_confusion_matrix(cm_train, f'Confusion Matrix - Train (Fold {fold})', axs[0])
    plot_confusion_matrix(cm_val, f'Confusion Matrix - Test (Fold {fold})', axs[1])
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_in-bi-lstm/confusion_matrix_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)  # release the figure; otherwise figures pile up across folds

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_curve(y_train, y_train_prob, 'roc', axs[0], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'roc', axs[0], 'Test', 'green')
    plot_curve(y_train, y_train_prob, 'pr', axs[1], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'pr', axs[1], 'Test', 'green')
    axs[0].set_title(f'ROC Curve (Fold {fold})')
    axs[1].set_title(f'PR Curve (Fold {fold})')
    axs[0].legend(loc='lower right')
    axs[1].legend(loc='lower left')
    plt.tight_layout()

    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_in-bi-lstm/roc_pr_curves_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)

    fold += 1

# One row per (fold, split) with all metrics.
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_in-bi-lstm/cross_validation_performance_cgan.csv', index=False)

# Average every metric over the folds, separately for Train and Test.
# 'Fold' is an identifier, not a metric, so it is dropped before averaging
# instead of being reported as a meaningless mean.
mean_metrics = performance_df.drop(columns='Fold').groupby('Dataset').mean().reset_index()
mean_metrics.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_in-bi-lstm/mean_model_performance_cgan.csv', index=False)

print(mean_metrics)

def plot_metric(metric_name):
    """Plot one metric per fold as grouped Train/Test bars and save it as PNG.

    Reads the module-level ``performance_df`` (columns 'Fold', 'Dataset' and
    the metric columns), shows the figure and writes
    ``<metric_name>.png`` into the mlr_in-bi-lstm picture folder.

    Args:
        metric_name: Name of a metric column in ``performance_df``.
    """
    folds = performance_df['Fold'].unique()
    bar_width = 0.35
    index = np.arange(len(folds))

    fig = plt.figure(figsize=(10, 6))
    # Train bars sit on the fold position, Test bars one bar width to the right.
    for dataset, color, offset in (('Train', '#004488', 0), ('Test', '#ffbb78', bar_width)):
        subset = performance_df[performance_df['Dataset'] == dataset]
        plt.bar(index + offset,
                subset[metric_name],
                bar_width,
                label=f'{dataset} {metric_name}',
                color=color)

        # Print each value just above its bar.
        for idx, value in enumerate(subset[metric_name]):
            plt.text(idx + offset,
                     value + 0.0001,
                     f'{value:.4f}',
                     fontsize=9,
                     ha='center')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} of MLR-in_CGAN_Bi-LSTM per fold')
    plt.xticks(index + bar_width / 2, folds)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(loc='lower right')
    plt.tight_layout()

    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_in-bi-lstm/{metric_name}.png', dpi=300)
    plt.show()
    plt.close(fig)  # free the figure once it has been saved and shown
# Draw and save one per-fold bar chart for every metric column.
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'MAE', 'Log-Loss', 'AP', 'AUC']
for metric in metrics_to_plot:
    plot_metric(metric)

# NOTE(review): `model` is whatever the cross-validation loop bound last, i.e.
# the network trained on the final fold's training split only.
model.save('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/model/mlr_in-bi-lstm_cgan.h5')

# 3.2 MLR-at

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Input, Attention, Concatenate
from tensorflow.keras import Model
from sklearn.metrics import (classification_report, confusion_matrix, precision_recall_curve,
                             average_precision_score, roc_curve, auc, mean_absolute_error, log_loss)
import tensorflow as tf

# Global Matplotlib styling shared by every figure in this cell:
# Times New Roman, bold text, and explicit sizes for titles, labels, ticks and legends.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
    'font.size': 12,
    'font.weight': 'bold',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.title_fontsize': 12,
})

# Load the CGAN-augmented Well B table.
data = pd.read_csv(r'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/Well_B_cgan_best.csv')

# MLR-at variant: MLR is kept out of the feature matrix and handled separately.
X = data[[
    'WellType', 'WellDepth', 'TVD', 'Layer', 'Lithology', 'FormationStructure',
    'WOB', 'RPM', 'TOR', 'PumpPressure', 'Displacement', 'Density', 'ECD',
    'HookLoad', 'ROP', 'DrillTime', 'DC', 'LagTime', 'OutletFlow',
    'FormationPressureGradient', 'FormationRupturePressureGradient',
    'TheoreticalMaximumDisplacement',
]]
y = data['LostCirculation']

# Unscaled MLR values as a NumPy vector, one entry per row of `data`.
MLR = data['MLR'].values

# Rescale every feature column to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including those that later
# serve as validation folds.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# 70/30 hold-out split of features, labels and MLR. The cross-validation loop
# rebinds X_train / y_train / MLR_train, so only the *_test parts keep these values.
X_train, X_test, y_train, y_test, MLR_train, MLR_test = train_test_split(X_scaled, y, MLR, test_size=0.3, random_state=42)

class AttentionLayer(tf.keras.layers.Layer):
    """Weight LSTM outputs by a sigmoid of MLR and sum over the time axis.

    The weight is ``sigmoid(MLR)``, multiplied by ``boost`` wherever MLR
    exceeds ``threshold``. The layer has no trainable parameters.

    Args:
        threshold: MLR value above which the weight is boosted (default 0.55).
        boost: Factor applied to the weight above the threshold (default 2.0).
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets Keras rebuild the layer from a saved config.
    """

    def __init__(self, threshold=0.55, boost=2.0, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.threshold = threshold
        self.boost = boost

    def call(self, inputs, MLR):
        """Return ``sum(inputs * weights[..., None], axis=1)``.

        Args:
            inputs: Sequence tensor; axis 1 is summed away.
            MLR: Values the weights are derived from. After a trailing axis is
                appended it must broadcast against ``inputs``.
                # NOTE(review): for a per-sample weighting this should be a
                # (batch, timesteps) tensor fed alongside the features.
        """
        # Match the dtype of `inputs`; a float64 MLR cannot be multiplied
        # with float32 activations.
        MLR = tf.cast(MLR, inputs.dtype)
        attention_weights = tf.sigmoid(MLR)
        attention_weights = tf.where(MLR > self.threshold, attention_weights * self.boost, attention_weights)

        context_vector = tf.reduce_sum(inputs * tf.expand_dims(attention_weights, axis=-1), axis=1)
        return context_vector

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(AttentionLayer, self).get_config()
        config.update({'threshold': self.threshold, 'boost': self.boost})
        return config

# Five shuffled folds; one metrics row per fold and split is collected in performance_metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
performance_metrics = []

fold = 1  # 1-based fold counter used in figure titles and file names


# The helpers below do not depend on the fold, so they are defined once
# instead of being re-created on every iteration.

def get_metrics(y_true, y_pred, y_prob):
    """Collect the evaluation metrics reported for one split of one fold.

    Args:
        y_true: Ground-truth binary labels (0 = normal, 1 = lost).
        y_pred: Hard 0/1 predictions obtained by thresholding ``y_prob``.
        y_prob: Predicted probability of class 1; any shape that flattens
            to one value per sample.

    Returns:
        dict mapping metric name to value: accuracy, class-1 precision,
        recall and F1, MAE of the hard predictions, log-loss, average
        precision and ROC AUC.
    """
    # model.predict returns an (n, 1) column; hand the scorers a 1-D vector.
    y_prob = np.ravel(y_prob)
    report = classification_report(y_true, y_pred, output_dict=True, labels=[0, 1])
    positive = report['1']
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {
        'Accuracy': report['accuracy'],
        'Precision': positive['precision'],
        'Recall': positive['recall'],
        'F1-score': positive['f1-score'],
        'MAE': mean_absolute_error(y_true, y_pred),
        # labels keeps log_loss defined even if a split holds a single class.
        'Log-Loss': log_loss(y_true, y_prob, labels=[0, 1]),
        'AP': average_precision_score(y_true, y_prob),
        'AUC': auc(fpr, tpr)
    }


def plot_confusion_matrix(cm, title, ax):
    """Draw a 2x2 confusion matrix on ``ax`` as a heat map with cell counts."""
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    ax.set_xticks(np.arange(2))
    ax.set_yticks(np.arange(2))
    ax.set_xticklabels(['Normal', 'Lost'])
    ax.set_yticklabels(['Normal', 'Lost'])
    # Cells above half the maximum count get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")


def plot_curve(y_true, y_prob, curve_type, ax, label, color):
    """Add a ROC (``'roc'``) or precision-recall (``'pr'``) curve to ``ax``.

    A dashed reference line is drawn as well: the diagonal for ROC, the
    positive-class ratio for PR. Any other ``curve_type`` draws nothing.
    """
    if curve_type == 'roc':
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        ax.plot(fpr, tpr, color=color, lw=2, label=f'{label} (AUC = {auc(fpr, tpr):.4f})')
        ax.plot([0, 1], [0, 1], linestyle='--', color='red', lw=2)
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])

    elif curve_type == 'pr':
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        ap = average_precision_score(y_true, y_prob)
        ax.plot(recall, precision, color=color, lw=2, label=f'{label} (AP = {ap:.4f})')
        positive_ratio = np.sum(y_true) / len(y_true)
        ax.plot([0, 1], [positive_ratio, positive_ratio], linestyle='--', color='gray', lw=2)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])


# 5-fold cross-validation: a fresh model is trained per fold. The held-out
# fold is labelled 'Test' in the metrics table and in the figures.
for train_index, val_index in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    MLR_train, MLR_val = MLR[train_index], MLR[val_index]  # split MLR along with the rows

    # Each row becomes a sequence with a single timestep: (samples, 1, features).
    X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_val_lstm = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

    # MLR travels with its sample as a second model input of shape (samples, 1),
    # i.e. one value per timestep. Passing the whole MLR_train array to the
    # layer would bake it into the graph as a constant, so the weighting would
    # not follow the rows of a batch nor the rows being predicted.
    MLR_train_in = MLR_train.astype('float32').reshape(-1, 1)
    MLR_val_in = MLR_val.astype('float32').reshape(-1, 1)

    input_layer = Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]))
    mlr_input = Input(shape=(1,), name='mlr')
    lstm_out = Bidirectional(LSTM(64, return_sequences=True))(input_layer)

    attention_layer = AttentionLayer()(lstm_out, mlr_input)
    output_layer = Dense(1, activation='sigmoid')(attention_layer)
    model = Model(inputs=[input_layer, mlr_input], outputs=output_layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit([X_train_lstm, MLR_train_in], y_train, epochs=100, batch_size=64, verbose=1)

    y_train_prob = model.predict([X_train_lstm, MLR_train_in])
    y_val_prob = model.predict([X_val_lstm, MLR_val_in])
    y_train_pred = (y_train_prob > 0.5).astype(int).flatten()
    y_val_pred = (y_val_prob > 0.5).astype(int).flatten()

    # Fix the label set so the matrix is always 2x2, matching the tick labels.
    cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
    cm_val = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

    train_metrics = get_metrics(y_train, y_train_pred, y_train_prob)
    val_metrics = get_metrics(y_val, y_val_pred, y_val_prob)

    performance_metrics.append({'Fold': fold, 'Dataset': 'Train', **train_metrics})
    performance_metrics.append({'Fold': fold, 'Dataset': 'Test', **val_metrics})

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_confusion_matrix(cm_train, f'Confusion Matrix - Train (Fold {fold})', axs[0])
    plot_confusion_matrix(cm_val, f'Confusion Matrix - Test (Fold {fold})', axs[1])
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_at-bi-lstm/confusion_matrix_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)  # release the figure; otherwise figures pile up across folds

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_curve(y_train, y_train_prob, 'roc', axs[0], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'roc', axs[0], 'Test', 'green')
    plot_curve(y_train, y_train_prob, 'pr', axs[1], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'pr', axs[1], 'Test', 'green')
    axs[0].set_title(f'ROC Curve (Fold {fold})')
    axs[1].set_title(f'PR Curve (Fold {fold})')
    axs[0].legend(loc='lower right')
    axs[1].legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_at-bi-lstm/roc_pr_curves_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)

    fold += 1

# One row per (fold, split) with all metrics.
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_at-bi-lstm/cross_validation_performance_cgan.csv', index=False)

# Average every metric over the folds, separately for Train and Test.
# 'Fold' is an identifier, not a metric, so it is dropped before averaging
# instead of being reported as a meaningless mean.
mean_metrics = performance_df.drop(columns='Fold').groupby('Dataset').mean().reset_index()
mean_metrics.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_at-bi-lstm/mean_model_performance_cgan.csv', index=False)
print(mean_metrics)

def plot_metric(metric_name):
    """Plot one metric per fold as grouped Train/Test bars and save it as PNG.

    Reads the module-level ``performance_df`` (columns 'Fold', 'Dataset' and
    the metric columns), shows the figure and writes
    ``<metric_name>.png`` into the mlr_at-bi-lstm picture folder.

    Args:
        metric_name: Name of a metric column in ``performance_df``.
    """
    folds = performance_df['Fold'].unique()
    bar_width = 0.35
    index = np.arange(len(folds))

    fig = plt.figure(figsize=(10, 6))
    # Train bars sit on the fold position, Test bars one bar width to the right.
    for dataset, color, offset in (('Train', '#4b4b4b', 0), ('Test', '#87ceeb', bar_width)):
        subset = performance_df[performance_df['Dataset'] == dataset]
        plt.bar(index + offset,
                subset[metric_name],
                bar_width,
                label=f'{dataset} {metric_name}',
                color=color)
        # Print each value just above its bar.
        for idx, value in enumerate(subset[metric_name]):
            plt.text(idx + offset,
                     value + 0.0001,
                     f'{value:.4f}',
                     fontsize=9,
                     ha='center')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} of MLR-at_CGAN_Bi-LSTM per fold')
    plt.xticks(index + bar_width / 2, folds)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_at-bi-lstm/{metric_name}.png', dpi=300)
    plt.show()
    plt.close(fig)  # free the figure once it has been saved and shown
# Draw and save one per-fold bar chart for every metric column.
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'MAE', 'Log-Loss', 'AP', 'AUC']
for metric in metrics_to_plot:
    plot_metric(metric)

# NOTE(review): `model` is whatever the cross-validation loop bound last, i.e.
# the network trained on the final fold's training split only. It contains the
# custom AttentionLayer, so reloading presumably needs `custom_objects` - confirm.
model.save('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/model/mlr_at-bi-lstm_cgan.h5')

# 3.3 MLR-mt

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Input, Concatenate
from tensorflow.keras import Model
from sklearn.metrics import (classification_report, confusion_matrix, precision_recall_curve,
                             average_precision_score, roc_curve, auc, mean_absolute_error, log_loss)
import tensorflow as tf

# Global Matplotlib styling shared by every figure in this cell:
# Times New Roman, bold text, and explicit sizes for titles, labels, ticks and legends.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
    'font.size': 12,
    'font.weight': 'bold',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.title_fontsize': 12,
})

# Load the CGAN-augmented Well B table.
data = pd.read_csv(r'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/Well_B_cgan_best.csv')

# MLR-mt variant: MLR is kept out of the feature matrix and used as a second target.
X = data[[
    'WellType', 'WellDepth', 'TVD', 'Layer', 'Lithology', 'FormationStructure',
    'WOB', 'RPM', 'TOR', 'PumpPressure', 'Displacement', 'Density', 'ECD',
    'HookLoad', 'ROP', 'DrillTime', 'DC', 'LagTime', 'OutletFlow',
    'FormationPressureGradient', 'FormationRupturePressureGradient',
    'TheoreticalMaximumDisplacement',
]]
y = data['LostCirculation']

# Unscaled MLR values as a NumPy vector, one entry per row of `data`.
MLR = data['MLR'].values

# Rescale every feature column to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including those that later
# serve as validation folds.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# 70/30 hold-out split of features, labels and MLR. The cross-validation loop
# rebinds X_train / y_train / MLR_train, so only the *_test parts keep these values.
X_train, X_test, y_train, y_test, MLR_train, MLR_test = train_test_split(X_scaled, y, MLR, test_size=0.3, random_state=42)

# Five shuffled folds; one metrics row per fold and split is collected in performance_metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
performance_metrics = []

fold = 1  # 1-based fold counter used in figure titles and file names
# The helpers below do not depend on the fold, so they are defined once
# instead of being re-created on every iteration.

def get_metrics(y_true, y_pred, y_prob):
    """Collect the evaluation metrics reported for one split of one fold.

    Args:
        y_true: Ground-truth binary labels (0 = normal, 1 = lost).
        y_pred: Hard 0/1 predictions obtained by thresholding ``y_prob``.
        y_prob: Predicted probability of class 1; any shape that flattens
            to one value per sample.

    Returns:
        dict mapping metric name to value: accuracy, class-1 precision,
        recall and F1, MAE of the hard predictions, log-loss, average
        precision and ROC AUC.
    """
    # model.predict returns an (n, 1) column; hand the scorers a 1-D vector.
    y_prob = np.ravel(y_prob)
    report = classification_report(y_true, y_pred, output_dict=True, labels=[0, 1])
    positive = report['1']
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {
        'Accuracy': report['accuracy'],
        'Precision': positive['precision'],
        'Recall': positive['recall'],
        'F1-score': positive['f1-score'],
        'MAE': mean_absolute_error(y_true, y_pred),
        # labels keeps log_loss defined even if a split holds a single class.
        'Log-Loss': log_loss(y_true, y_prob, labels=[0, 1]),
        'AP': average_precision_score(y_true, y_prob),
        'AUC': auc(fpr, tpr)
    }


def plot_confusion_matrix(cm, title, ax):
    """Draw a 2x2 confusion matrix on ``ax`` as a heat map with cell counts."""
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    ax.set_xticks(np.arange(2))
    ax.set_yticks(np.arange(2))
    ax.set_xticklabels(['Normal', 'Lost'])
    ax.set_yticklabels(['Normal', 'Lost'])
    # Cells above half the maximum count get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")


def plot_curve(y_true, y_prob, curve_type, ax, label, color):
    """Add a ROC (``'roc'``) or precision-recall (``'pr'``) curve to ``ax``.

    A dashed reference line is drawn as well: the diagonal for ROC, the
    positive-class ratio for PR. Any other ``curve_type`` draws nothing.
    """
    if curve_type == 'roc':
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        ax.plot(fpr, tpr, color=color, lw=2, label=f'{label} (AUC = {auc(fpr, tpr):.4f})')
        ax.plot([0, 1], [0, 1], linestyle='--', color='red', lw=2)
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])

    elif curve_type == 'pr':
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        ap = average_precision_score(y_true, y_prob)
        ax.plot(recall, precision, color=color, lw=2, label=f'{label} (AP = {ap:.4f})')
        positive_ratio = np.sum(y_true) / len(y_true)
        ax.plot([0, 1], [positive_ratio, positive_ratio], linestyle='--', color='gray', lw=2)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])


# 5-fold cross-validation: a fresh two-headed model is trained per fold. The
# held-out fold is labelled 'Test' in the metrics table and in the figures.
for train_index, val_index in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    MLR_train, MLR_val = MLR[train_index], MLR[val_index]

    # Each row becomes a sequence with a single timestep: (samples, 1, features).
    X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_val_lstm = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

    input_layer = Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]))
    lstm_out = Bidirectional(LSTM(64, return_sequences=False))(input_layer)

    # Shared Bi-LSTM trunk with two heads: the lost-circulation classifier
    # and an auxiliary regression of MLR.
    output_layer_main = Dense(1, activation='sigmoid', name='main_output')(lstm_out)
    output_layer_aux = Dense(1, name='aux_output')(lstm_out)

    model = Model(inputs=input_layer, outputs=[output_layer_main, output_layer_aux])

    model.compile(optimizer='adam',
                  loss={'main_output': 'binary_crossentropy', 'aux_output': 'mean_squared_error'},
                  metrics={'main_output': 'accuracy', 'aux_output': 'mean_absolute_error'})

    # Targets are keyed by output name, like the loss and metrics above, so the
    # pairing does not depend on list order.
    model.fit(X_train_lstm,
              {'main_output': y_train.to_numpy(), 'aux_output': MLR_train},
              epochs=100, batch_size=64, verbose=1)

    y_train_prob, MLR_train_pred = model.predict(X_train_lstm)
    y_val_prob, MLR_val_pred = model.predict(X_val_lstm)
    y_train_pred = (y_train_prob > 0.5).astype(int).flatten()
    y_val_pred = (y_val_prob > 0.5).astype(int).flatten()

    # Fix the label set so the matrix is always 2x2, matching the tick labels.
    cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
    cm_val = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

    train_metrics = get_metrics(y_train, y_train_pred, y_train_prob)
    val_metrics = get_metrics(y_val, y_val_pred, y_val_prob)

    performance_metrics.append({'Fold': fold, 'Dataset': 'Train', **train_metrics})
    performance_metrics.append({'Fold': fold, 'Dataset': 'Test', **val_metrics})

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_confusion_matrix(cm_train, f'Confusion Matrix - Train (Fold {fold})', axs[0])
    plot_confusion_matrix(cm_val, f'Confusion Matrix - Test (Fold {fold})', axs[1])
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_mt-bi-lstm/confusion_matrix_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)  # release the figure; otherwise figures pile up across folds

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_curve(y_train, y_train_prob, 'roc', axs[0], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'roc', axs[0], 'Test', 'green')
    plot_curve(y_train, y_train_prob, 'pr', axs[1], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'pr', axs[1], 'Test', 'green')
    axs[0].set_title(f'ROC Curve (Fold {fold})')
    axs[1].set_title(f'PR Curve (Fold {fold})')
    axs[0].legend(loc='lower right')
    axs[1].legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_mt-bi-lstm/roc_pr_curves_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)

    fold += 1

# One row per (fold, split) with all metrics.
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_mt-bi-lstm/cross_validation_performance_cgan.csv', index=False)
# Average every metric over the folds, separately for Train and Test.
# 'Fold' is an identifier, not a metric, so it is dropped before averaging
# instead of being reported as a meaningless mean.
mean_metrics = performance_df.drop(columns='Fold').groupby('Dataset').mean().reset_index()
mean_metrics.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_mt-bi-lstm/mean_model_performance_cgan.csv', index=False)
print(mean_metrics)

def plot_metric(metric_name):
    """Plot one metric per fold as grouped Train/Test bars and save it as PNG.

    Reads the module-level ``performance_df`` (columns 'Fold', 'Dataset' and
    the metric columns), shows the figure and writes
    ``<metric_name>.png`` into the mlr_mt-bi-lstm picture folder.

    Args:
        metric_name: Name of a metric column in ``performance_df``.
    """
    folds = performance_df['Fold'].unique()
    bar_width = 0.35
    index = np.arange(len(folds))

    fig = plt.figure(figsize=(10, 6))
    # Train bars sit on the fold position, Test bars one bar width to the right.
    for dataset, color, offset in (('Train', '#000000', 0), ('Test', '#9e7b3e', bar_width)):
        subset = performance_df[performance_df['Dataset'] == dataset]
        plt.bar(index + offset,
                subset[metric_name],
                bar_width,
                label=f'{dataset} {metric_name}',
                color=color)
        # Print each value just above its bar.
        for idx, value in enumerate(subset[metric_name]):
            plt.text(idx + offset,
                     value + 0.0001,
                     f'{value:.4f}',
                     fontsize=9,
                     ha='center')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} of MLR-mt_CGAN_Bi-LSTM per fold')
    plt.xticks(index + bar_width / 2, folds)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_mt-bi-lstm/{metric_name}.png', dpi=300)
    plt.show()
    plt.close(fig)  # free the figure once it has been saved and shown
# Draw and save one per-fold bar chart for every metric column.
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'MAE', 'Log-Loss', 'AP', 'AUC']
for metric in metrics_to_plot:
    plot_metric(metric)

# NOTE(review): `model` is whatever the cross-validation loop bound last, i.e.
# the two-output network trained on the final fold's training split only.
model.save('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/model/mlr_mt-bi-lstm_cgan.h5')

# 3.4 MLR-Loss

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Input
from tensorflow.keras import Model
from sklearn.metrics import (classification_report, confusion_matrix, precision_recall_curve,
                             average_precision_score, roc_curve, auc, mean_absolute_error, log_loss)
import tensorflow as tf

# Global Matplotlib styling shared by every figure in this cell:
# Times New Roman, bold text, and explicit sizes for titles, labels, ticks and legends.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
    'font.size': 12,
    'font.weight': 'bold',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.title_fontsize': 12,
})

# Load the CGAN-augmented Well B table.
data = pd.read_csv(r'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/Well_B_cgan_best.csv')

# MLR-Loss variant: MLR is kept out of the feature matrix and enters through the loss.
X = data[[
    'WellType', 'WellDepth', 'TVD', 'Layer', 'Lithology', 'FormationStructure',
    'WOB', 'RPM', 'TOR', 'PumpPressure', 'Displacement', 'Density', 'ECD',
    'HookLoad', 'ROP', 'DrillTime', 'DC', 'LagTime', 'OutletFlow',
    'FormationPressureGradient', 'FormationRupturePressureGradient',
    'TheoreticalMaximumDisplacement',
]]
y = data['LostCirculation']
# Unscaled MLR values as a NumPy vector, one entry per row of `data`.
MLR = data['MLR'].values

# Rescale every feature column to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including those that later
# serve as validation folds.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# 70/30 hold-out split of features, labels and MLR. The cross-validation loop
# rebinds X_train / y_train / MLR_train, so only the *_test parts keep these values.
X_train, X_test, y_train, y_test, MLR_train, MLR_test = train_test_split(X_scaled, y, MLR, test_size=0.3, random_state=42)

def custom_loss(y_true, y_pred, MLR):
    """Binary cross-entropy plus a squared-error pull towards high MLR values.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted probabilities.
        MLR: MLR values. They are combined with ``y_pred`` element-wise under
            broadcasting, so for a per-sample penalty they must have the same
            shape as ``y_pred``.

    Returns:
        The binary cross-entropy with the scalar penalty added to it. The
        penalty is the mean of ``(y_pred - MLR) ** 2`` with entries whose MLR
        is not above 0.55 counted as zero.
    """
    # 1.0 where MLR exceeds 0.55, 0.0 elsewhere: only those entries are penalised.
    high_mlr_mask = tf.where(MLR > 0.55, 1.0, 0.0)
    squared_gap = (y_pred - MLR) ** 2
    penalty = tf.reduce_mean(high_mlr_mask * squared_gap)

    return tf.keras.losses.binary_crossentropy(y_true, y_pred) + penalty

# Five shuffled folds; one metrics row per fold and split is collected in performance_metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
performance_metrics = []

fold = 1  # 1-based fold counter used in figure titles and file names


# The helpers below do not depend on the fold, so they are defined once
# instead of being re-created on every iteration.

def mlr_constrained_loss(y_packed, y_pred):
    """Unpack ``[label, MLR]`` target columns and delegate to ``custom_loss``.

    Carrying MLR inside the target keeps every MLR value attached to its own
    sample through shuffling and batching. A loss that closes over the whole
    MLR_train array would compare each prediction with every training MLR.
    """
    return custom_loss(y_packed[:, 0:1], y_pred, y_packed[:, 1:2])


def accuracy(y_packed, y_pred):
    """Binary accuracy on the label column of the packed ``[label, MLR]`` target."""
    return tf.keras.metrics.binary_accuracy(y_packed[:, 0:1], y_pred)


def get_metrics(y_true, y_pred, y_prob):
    """Collect the evaluation metrics reported for one split of one fold.

    Args:
        y_true: Ground-truth binary labels (0 = normal, 1 = lost).
        y_pred: Hard 0/1 predictions obtained by thresholding ``y_prob``.
        y_prob: Predicted probability of class 1; any shape that flattens
            to one value per sample.

    Returns:
        dict mapping metric name to value: accuracy, class-1 precision,
        recall and F1, MAE of the hard predictions, log-loss, average
        precision and ROC AUC.
    """
    # model.predict returns an (n, 1) column; hand the scorers a 1-D vector.
    y_prob = np.ravel(y_prob)
    report = classification_report(y_true, y_pred, output_dict=True, labels=[0, 1])
    positive = report['1']
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return {
        'Accuracy': report['accuracy'],
        'Precision': positive['precision'],
        'Recall': positive['recall'],
        'F1-score': positive['f1-score'],
        'MAE': mean_absolute_error(y_true, y_pred),
        # labels keeps log_loss defined even if a split holds a single class.
        'Log-Loss': log_loss(y_true, y_prob, labels=[0, 1]),
        'AP': average_precision_score(y_true, y_prob),
        'AUC': auc(fpr, tpr)
    }


def plot_confusion_matrix(cm, title, ax):
    """Draw a 2x2 confusion matrix on ``ax`` as a heat map with cell counts."""
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    ax.set_xticks(np.arange(2))
    ax.set_yticks(np.arange(2))
    ax.set_xticklabels(['Normal', 'Lost'])
    ax.set_yticklabels(['Normal', 'Lost'])
    # Cells above half the maximum count get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")


def plot_curve(y_true, y_prob, curve_type, ax, label, color):
    """Add a ROC (``'roc'``) or precision-recall (``'pr'``) curve to ``ax``.

    A dashed reference line is drawn as well: the diagonal for ROC, the
    positive-class ratio for PR. Any other ``curve_type`` draws nothing.
    """
    if curve_type == 'roc':
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        ax.plot(fpr, tpr, color=color, lw=2, label=f'{label} (AUC = {auc(fpr, tpr):.4f})')
        ax.plot([0, 1], [0, 1], linestyle='--', color='red', lw=2)
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])

    elif curve_type == 'pr':
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        ap = average_precision_score(y_true, y_prob)
        ax.plot(recall, precision, color=color, lw=2, label=f'{label} (AP = {ap:.4f})')
        positive_ratio = np.sum(y_true) / len(y_true)
        ax.plot([0, 1], [positive_ratio, positive_ratio], linestyle='--', color='gray', lw=2)
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])


# 5-fold cross-validation: a fresh Bi-LSTM is trained per fold. The held-out
# fold is labelled 'Test' in the metrics table and in the figures.
for train_index, val_index in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    MLR_train, MLR_val = MLR[train_index], MLR[val_index]

    # Each row becomes a sequence with a single timestep: (samples, 1, features).
    X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_val_lstm = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

    # Training target with two columns: [label, MLR]. Only the loss and the
    # accuracy metric see it; the sklearn evaluation below uses y_train.
    y_train_packed = np.column_stack([y_train.to_numpy(), MLR_train]).astype('float32')

    input_layer = Input(shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]))
    lstm_out = Bidirectional(LSTM(64, activation='relu'))(input_layer)
    output_layer = Dense(1, activation='sigmoid')(lstm_out)
    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(optimizer='adam', loss=mlr_constrained_loss, metrics=[accuracy])

    model.fit(X_train_lstm, y_train_packed, epochs=100, batch_size=64, verbose=1)

    y_train_prob = model.predict(X_train_lstm)
    y_val_prob = model.predict(X_val_lstm)
    y_train_pred = (y_train_prob > 0.5).astype(int).flatten()
    y_val_pred = (y_val_prob > 0.5).astype(int).flatten()

    # Fix the label set so the matrix is always 2x2, matching the tick labels.
    cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
    cm_val = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

    train_metrics = get_metrics(y_train, y_train_pred, y_train_prob)
    val_metrics = get_metrics(y_val, y_val_pred, y_val_prob)

    performance_metrics.append({'Fold': fold, 'Dataset': 'Train', **train_metrics})
    performance_metrics.append({'Fold': fold, 'Dataset': 'Test', **val_metrics})

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_confusion_matrix(cm_train, f'Confusion Matrix - Train (Fold {fold})', axs[0])
    plot_confusion_matrix(cm_val, f'Confusion Matrix - Test (Fold {fold})', axs[1])
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_loss-bi-lstm/confusion_matrix_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)  # release the figure; otherwise figures pile up across folds

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    plot_curve(y_train, y_train_prob, 'roc', axs[0], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'roc', axs[0], 'Test', 'green')
    plot_curve(y_train, y_train_prob, 'pr', axs[1], 'Train', 'blue')
    plot_curve(y_val, y_val_prob, 'pr', axs[1], 'Test', 'green')
    axs[0].set_title(f'ROC Curve (Fold {fold})')
    axs[1].set_title(f'PR Curve (Fold {fold})')
    axs[0].legend(loc='lower right')
    axs[1].legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_loss-bi-lstm/roc_pr_curves_fold_{fold}.png',dpi=300)
    plt.show()
    plt.close(fig)

    fold += 1

# One row per (fold, split) with all metrics.
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_loss-bi-lstm/cross_validation_performance_cgan.csv', index=False)
# Average every metric over the folds, separately for Train and Test.
# 'Fold' is an identifier, not a metric, so it is dropped before averaging
# instead of being reported as a meaningless mean.
mean_metrics = performance_df.drop(columns='Fold').groupby('Dataset').mean().reset_index()
mean_metrics.to_csv('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/data/mlr_loss-bi-lstm/mean_model_performance_cgan.csv', index=False)
print(mean_metrics)

def plot_metric(metric_name):
    """Plot one metric per fold as grouped Train/Test bars and save it as PNG.

    Reads the module-level ``performance_df`` (columns 'Fold', 'Dataset' and
    the metric columns), shows the figure and writes
    ``<metric_name>.png`` into the mlr_loss-bi-lstm picture folder.

    Args:
        metric_name: Name of a metric column in ``performance_df``.
    """
    folds = performance_df['Fold'].unique()
    bar_width = 0.35
    index = np.arange(len(folds))

    fig = plt.figure(figsize=(10, 6))
    # Train bars sit on the fold position, Test bars one bar width to the right.
    for dataset, color, offset in (('Train', '#2c6b2f', 0), ('Test', '#ffcc00', bar_width)):
        subset = performance_df[performance_df['Dataset'] == dataset]
        plt.bar(index + offset,
                subset[metric_name],
                bar_width,
                label=f'{dataset} {metric_name}',
                color=color)
        # Print each value just above its bar.
        for idx, value in enumerate(subset[metric_name]):
            plt.text(idx + offset,
                     value + 0.0001,
                     f'{value:.4f}',
                     fontsize=9,
                     ha='center')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} of MLR-loss_CGAN_Bi-LSTM per fold')
    plt.xticks(index + bar_width / 2, folds)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(f'E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/picture/cgan/mlr_loss-bi-lstm/{metric_name}.png', dpi=300)
    plt.show()
    plt.close(fig)  # free the figure once it has been saved and shown
# Draw and save one per-fold bar chart for every metric column.
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'MAE', 'Log-Loss', 'AP', 'AUC']
for metric in metrics_to_plot:
    plot_metric(metric)

# NOTE(review): `model` is whatever the cross-validation loop bound last, i.e.
# the network trained on the final fold's training split only. It is compiled
# with a custom loss built on custom_loss, so reloading the file presumably
# needs compile=False or `custom_objects` - confirm.
model.save('E:/jupyter/lost_circulation/records/paper-bhyt/Diagnosis/model/mlr_loss-bi-lstm_cgan.h5')
