# 01 Data summary

In [None]:
import pandas as pd
import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Read the raw spreadsheet; the trailing expression shows its column names
# as the cell output.
data = pd.read_excel("../data/raw_data.xlsx")
data.columns.tolist()

# 02 Processing missing values

In [None]:
# Load the pre-processed base table. This rebinds `data`, replacing the frame
# read from the Excel file in the first cell.
data = pd.read_csv("../result/00pre-processing/01base_data.csv")

# Column names grouped by role. The strings are used as DataFrame column keys,
# so they must match the CSV header exactly.
# NOTE(review): the grouping semantics (clinical / treatment / assay-specific)
# are inferred from the variable names only - confirm against the data dictionary.
clinical_features = [
    "Gender",
    "Age",
    "ALT",
    "AST",
    "TProt",
    "Albumin",
    "Globulin",
    "ALP",
    "GGT",
    "DBIL",
    "IBIL",
    "AFP",
    "DNA load",
    "HBsAg",
    "HBsAb",
    "HBeAg_COI",
    "HBeAb_COI",
    "HBcAb_COI",
    "Subtype",
]
# Treatment-related columns.
treat_features = [
    "ThSched",
    "ADV",
    "ETV",
    "PEG-IFN",
    "TAF",
    "TDF",
    "TFV",
    "TMF",
    "UnusedD",
]
# Columns with the "_T" suffix plus "HBV-T".
specific_features = [
    "HBV-T",
    "HBsAg1_T",
    "HBsAg2_T",
    "HBpol1_T",
    "HBpol2_T",
    "HBx1_T",
    "HBx2_T",
    "HBeAg1_T",
    "HBeAg2_T",
]

In [None]:
# Fraction of missing values in each column of the base table.
missing_ratio = data.isna().mean()

# Keep only the columns with less than 40% missing values.
selected_features = missing_ratio[missing_ratio < 0.4].index

# Build the filtered table in one non-mutating expression. Selecting columns
# and then calling drop(..., inplace=True) mutates a frame derived from `data`,
# which can raise pandas' SettingWithCopyWarning (hidden here only because
# warnings are silenced in the first cell). A missing "name" column still
# raises KeyError, exactly as before.
final_data = data.loc[:, selected_features].drop(columns=["name"])

print("Selected features:", selected_features)
print("Filtered data shape:", final_data.shape)

## Comparison of imputation methods

In [None]:
import matplotlib.pyplot as plt
import random

# Work on a private copy so `final_data` itself is never modified.
df = final_data.copy()

# Font configuration for the figures drawn later in the notebook.
plt.rcParams.update(
    {
        "font.sans-serif": ["SimHei"],
        "axes.unicode_minus": False,
    }
)

# Complete cases only: this is the ground truth the imputers are scored against.
df_no_miss = df.dropna()
print(df_no_miss["label"].value_counts())
df_no_miss_no_label = df_no_miss.drop(columns=["label"])

# Per-feature missing ratio of the full table (label excluded).
df.drop(columns=["label"], inplace=True)
missing_ratio = df.isna().mean()

# Number of values to blank out per feature so that the complete-case table
# reproduces each feature's observed missing ratio.
df_miss = df_no_miss_no_label.copy(deep=True)
row_num, index_num = df_no_miss_no_label.shape
miss_num = (row_num * missing_ratio).round(0)

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.impute import IterativeImputer
from missingpy import MissForest
import torch
import torch.nn as nn
import torch.optim as optim
from tools.common import setup_seed
from torch.utils.data import DataLoader, TensorDataset


# log_mse
def log_mse(original, imputed, epsilon=1e-8):
    """Return the mean squared difference of the log-transformed inputs.

    ``epsilon`` is added to both inputs before ``np.log`` is applied, and the
    squared differences are averaged along axis 0.
    """
    log_gap = np.log(original + epsilon) - np.log(imputed + epsilon)
    squared_gap = log_gap ** 2
    return squared_gap.mean(axis=0)


class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 16-unit bottleneck.

    The encoder maps ``input_dim -> 64 -> 16`` and the decoder maps
    ``16 -> 64 -> input_dim``. A ReLU follows every linear layer except the
    final decoder layer.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim, code_dim = 64, 16
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, code_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(code_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))


def autoencoder_impute(df, num_epochs=300, batch_size=64):
    """Impute the missing cells of ``df`` with an autoencoder reconstruction.

    The frame is first mean-filled with ``SimpleImputer``. An ``Autoencoder``
    is then trained to reconstruct that mean-filled matrix (MSE loss, Adam
    with lr=0.001), and its output replaces only the cells that were NaN in
    ``df``.

    Args:
        df: DataFrame to impute; every column is fed to the network.
        num_epochs: Number of passes over the data during training.
        batch_size: Mini-batch size handed to the ``DataLoader``.

    Returns:
        A copy of ``df`` in which the originally missing cells hold the
        network output and the observed cells are left unchanged.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The mean-filled matrix is both the network input and the training target.
    mean_filled = SimpleImputer(strategy="mean").fit_transform(df)
    features = torch.tensor(mean_filled, dtype=torch.float32).to(device)
    loader = DataLoader(features, batch_size=batch_size, shuffle=True)

    net = Autoencoder(df.shape[1]).to(device)
    adam = optim.Adam(net.parameters(), lr=0.001)
    mse = torch.nn.MSELoss()

    net.train()
    for _epoch in range(num_epochs):
        for minibatch in loader:
            net_input = minibatch.clone()
            adam.zero_grad()
            reconstruction_loss = mse(net(net_input), minibatch)
            reconstruction_loss.backward()
            adam.step()

    # Reconstruct the whole table in one pass, without tracking gradients.
    net.eval()
    with torch.no_grad():
        reconstructed = net(features).cpu().numpy()
    reconstructed = pd.DataFrame(reconstructed, columns=df.columns, index=df.index)

    # Copy the reconstruction into the originally missing cells only.
    was_missing = df.isna()
    df_imputed = df.copy()
    df_imputed[was_missing] = reconstructed[was_missing]
    return df_imputed


# Imputation strategies under comparison, keyed by the short label that is
# later used for the result dictionaries and the plot. Every entry except "AE"
# is an estimator object driven through `fit_transform`; "AE" is the
# `autoencoder_impute` function itself and is called directly by the
# experiment loop.
strategies = {
    "Mean": SimpleImputer(strategy="mean"),
    "Zero": SimpleImputer(strategy="constant", fill_value=0),
    "Median": SimpleImputer(strategy="median"),
    "Mode": SimpleImputer(strategy="most_frequent"),
    "KNN": KNNImputer(n_neighbors=5),
    "Iter": IterativeImputer(random_state=42),
    "RF": MissForest(criterion="squared_error", max_features="sqrt", max_iter=100),
    "AE": autoencoder_impute,  # plain function, not an estimator
}

setup_seed(42)
n_experiments = 5

# One list of per-run scores for every strategy, per metric.
results_modified_msre = {label: [] for label in strategies}
results_log_mse = {label: [] for label in strategies}

# Rows eligible for blanking; the index is the same in every run.
candidate_rows = list(df_no_miss_no_label.index)

# NOTE(review): `modified_msre` is neither defined nor imported in the visible
# cells; it must already exist in the session for this loop to run.
for _ in range(n_experiments):
    # Start every run from the complete table and blank out, per feature, the
    # number of values prescribed by `miss_num`.
    df_miss = df_no_miss_no_label.copy(deep=True)
    for feature, missing_count in miss_num.items():
        n_to_blank = int(missing_count)
        if n_to_blank == 0:
            continue
        missing_indices = random.sample(candidate_rows, n_to_blank)
        df_miss.loc[missing_indices, feature] = np.nan

    for name, imputer in strategies.items():
        if name == "AE":
            # The autoencoder entry is a function that returns a DataFrame.
            df_imputed = imputer(df_miss)
        else:
            filled_values = imputer.fit_transform(df_miss)
            df_imputed = pd.DataFrame(
                filled_values, columns=df_miss.columns, index=df_miss.index
            )

        # Score against the complete table, averaged over the features.
        modified_msre_score = modified_msre(df_no_miss_no_label, df_imputed).mean()
        log_mse_score = log_mse(df_no_miss_no_label, df_imputed).mean()
        results_modified_msre[name].append(modified_msre_score)
        results_log_mse[name].append(log_mse_score)

# Mean and standard deviation across runs for each strategy.
msre_means = {key: np.mean(runs) for key, runs in results_modified_msre.items()}
msre_stds = {key: np.std(runs) for key, runs in results_modified_msre.items()}
log_mse_means = {key: np.mean(runs) for key, runs in results_log_mse.items()}
log_mse_stds = {key: np.std(runs) for key, runs in results_log_mse.items()}

In [None]:
import seaborn as sns

plt.figure(figsize=(12, 8))
plt.rcParams["font.family"] = "Arial"

# Collect the per-strategy statistics and order them from the highest to the
# lowest mean Log MSE.
methods = np.array(list(log_mse_means))
means = np.array(list(log_mse_means.values()))
stds = np.array(list(log_mse_stds.values()))

sorted_indices = np.argsort(means)[::-1]
methods, means, stds = (
    methods[sorted_indices],
    means[sorted_indices],
    stds[sorted_indices],
)
ax = sns.barplot(x=means, y=methods, palette="viridis", orient="h")

# One error bar and one "mean±std" annotation per bar.
for i, (mean, std) in enumerate(zip(means, stds)):
    ax.errorbar(mean, i, xerr=std, fmt="none", c="black", capsize=5, elinewidth=1.5)
    ax.text(mean + std + 0.05, i, f"{mean:.2f}±{std:.2f}", va="center", c="black")

for label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    label.set_fontweight("bold")

plt.xlabel("Log MSE", fontsize=14, fontweight="bold", c="k")
plt.ylabel("Imputation Method", fontsize=14, fontweight="bold", c="k")
plt.tick_params(axis="both", colors="k", labelcolor="k")
ax.set_xlim(0, 14)

# Axes cosmetics: hide the top/right frame, thicken the remaining spines and ticks.
ax = plt.gca()
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
for side in ("bottom", "left"):
    ax.spines[side].set_linewidth(2)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_tick_params(width=2)

plt.tight_layout()

# 03 Feature selection

In [9]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, chi2


In [None]:
plt.rcParams["font.family"] = "Arial"
method = ["pearson", "spearman"]

# Pearson correlation between all columns of the current `df`.
correlation_matrix = df.corr(method="pearson")

plt.figure(figsize=(15, 10))
ax = sns.heatmap(
    correlation_matrix,
    cmap="bwr",
    annot=False,
    annot_kws={"size": 12},
    linewidths=1,
    cbar_kws={"shrink": 0.5},
)

# Replace the numeric colour-bar ticks with two qualitative labels.
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([-0.8, 0.95])
colorbar.set_ticklabels(["Low", "High"])
colorbar.ax.tick_params(labelsize=12)
colorbar.ax.set_aspect(15)
for tick_label in colorbar.ax.get_yticklabels():
    tick_label.set_fontname("Arial")
    tick_label.set_fontweight("bold")

plt.xticks(fontsize=12, fontweight="bold")
plt.yticks(fontsize=12, fontweight="bold")
plt.tick_params(axis="both", colors="k", labelcolor="k")
plt.tight_layout()

# Export the figure as a PDF (600 dpi) and as a TIFF (300 dpi).
for target, extra_options in (
    ("../result/00pre-processing/03Correlation Heatmap.pdf", {"format": "pdf", "dpi": 600}),
    ("../result/00pre-processing/03Correlation Heatmap.tif", {"dpi": 300}),
):
    plt.savefig(target, bbox_inches="tight", **extra_options)
plt.show()

In [None]:
from collections import Counter

# Rank the clinical features by three criteria (chi-squared, Pearson
# correlation with the label, mutual information), take each criterion's top
# 10, and keep the 10 features that appear most often across the three lists.
# NOTE(review): this cell reads a 'Label' column, while the imputation cells
# use 'label' and drop it from `df`; confirm which frame `df` refers to here.
x = df[clinical_features]
y = df['Label']

# 1) Chi-squared statistic between each feature and the label.
chi2_stat, p_values = chi2(x, y)
chi2_df = pd.DataFrame({'Feature': x.columns, 'Chi2_Stat': chi2_stat, 'P_Value': p_values})
chi2_df['Chi2_Stat'] = chi2_df['Chi2_Stat'].round(4)
top10_chi2 = chi2_df.sort_values(by='Chi2_Stat', ascending=False).head(10)

# 2) Pearson correlation with the label. Rank by magnitude: a strongly
#    negative correlation is as informative as a strongly positive one, and a
#    signed descending sort would push such features to the bottom of the list.
correlation = df[clinical_features + ['Label']].corr()['Label'].drop('Label')
corr_df = pd.DataFrame({'Feature': correlation.index, 'Correlation': correlation.values})
corr_df['Correlation'] = corr_df['Correlation'].round(4)
abs_corr_order = corr_df['Correlation'].abs().sort_values(ascending=False).index
top10_corr = corr_df.loc[abs_corr_order].head(10)

# 3) Mutual information (seeded so the estimate is reproducible).
mutual_info = mutual_info_classif(x, y, random_state=42)
mi_df = pd.DataFrame({'Feature': x.columns, 'Mutual_Info': mutual_info})
mi_df['Mutual_Info'] = mi_df['Mutual_Info'].round(4)
top10_mi = mi_df.sort_values(by='Mutual_Info', ascending=False).head(10)

# Vote: count how many of the three top-10 lists each feature appears in.
all_top_features = list(top10_chi2['Feature']) + list(top10_mi['Feature']) + list(top10_corr['Feature'])
feature_counts = Counter(all_top_features)

final_top10_features = [feature for feature, _ in feature_counts.most_common(10)]
print(final_top10_features)
# Clinical features that did not make the final list; dropped in the next cell.
drop_clinical_features = [item for item in clinical_features if item not in final_top10_features]
feature_counts

In [None]:
# Remove, in place, the clinical features listed in `drop_clinical_features`
# (those not in `final_top10_features`).
df.drop(columns=drop_clinical_features, inplace=True)
# NOTE(review): no path is passed, so this call does not name an output file;
# per the pandas docs `to_csv()` then returns the CSV text instead of writing
# to disk. Confirm the intended destination.
df.to_csv()

# 04 Data normalization

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# NOTE(review): the path is an empty string, so this call cannot locate a file
# as written; fill in the CSV produced by the feature-selection step.
df = pd.read_csv("")

# Column groups used for normalization. These lists are hard-coded and differ
# from the groups declared in section 02: `clinical_features` has 10 entries
# (presumably the feature-selection result - verify), `specific_features`
# starts with 'SFU' instead of "HBV-T", and `treat_features` uses 'PEG_IFN'
# instead of "PEG-IFN". Confirm they match the header of the file loaded above.
clinical_features = ['Gender', 'ALT', 'AST', 'Globulin', 'DBIL', 'IBIL', 'AFP', 'DNA load', 'HBsAg', 'HBeAg_COI']
specific_features = ['SFU', 'HBsAg1_T', 'HBsAg2_T', 'HBpol1_T', 'HBpol2_T', 'HBx1_T', 'HBx2_T', 'HBeAg1_T', 'HBeAg2_T']
treat_features = ['ThSched', 'ADV', 'ETV', 'PEG_IFN', 'TAF', 'TDF', 'TFV', 'TMF', 'UnusedD']

In [None]:
# Restrict the table to the three feature groups followed by the target column.
df = df[[*clinical_features, *specific_features, *treat_features, "Label"]]

In [None]:
# Min-max scale every feature column; the target column is not scaled.
df_scaler = df.drop(columns=["Label"])

minmax_transfer = MinMaxScaler()
df_scaler_minmax = minmax_transfer.fit_transform(df_scaler)
# Persist the fitted scaler (relative to the current working directory) so the
# same transformation can be applied to new data.
joblib.dump(minmax_transfer, "scaler.joblib")

# Rebuild the frame on the ORIGINAL index. Without `index=` the new frame gets
# a fresh 0..n-1 index, and the label assignment below - which aligns on index -
# would produce NaN or mismatched labels whenever `df` is not already indexed
# 0..n-1 (e.g. after row filtering).
df_scaler = pd.DataFrame(
    df_scaler_minmax, columns=df_scaler.columns, index=df_scaler.index
)
df_scaler["Label"] = df["Label"]

# NOTE(review): the output path is an empty string; supply the destination file.
df_scaler.to_csv("")