# Import

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import polars as pl
import tsfel
import json
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Ignore every RuntimeWarning for the rest of the session.
# NOTE(review): presumably meant for numerical warnings raised during feature extraction,
# but the filter is global and also hides RuntimeWarnings from every other library — confirm.
warnings.filterwarnings("ignore", category=RuntimeWarning) 

TSFEL getting-started guide: https://tsfel.readthedocs.io/en/latest/descriptions/get_started.html

In [None]:
# Read the run configuration stored one directory above the notebook.
with open('../params.json', 'r') as params_file:
    params = json.load(params_file)

# Both values are looked up before either name is bound, as with a tuple assignment.
DATASET, VERSION = (params[key] for key in ('dataset', 'version'))

In [None]:
# Parquet table "first_48_imputed_saits" for the configured version and dataset.
imputed_table_path = f'/data2/poette.m/dypo/{VERSION}/3.analysis/imputation_48/{DATASET}/tables/first_48_imputed_saits.parquet'
df = pd.read_parquet(imputed_table_path)

# Split train/test

In [None]:
# Distinct encounter ids; the train/test split is made on these, not on individual rows.
encounters_list = df.loc[:, 'encounterId'].unique()

In [None]:
# Hold out 30% of the encounter ids for testing; the fixed seed keeps the partition repeatable.
train_encounters, test_encounters = train_test_split(
    encounters_list,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Number of encounter ids on each side of the split (train first, then test).
for encounter_subset in (train_encounters, test_encounters):
    print(encounter_subset.shape)

In [None]:
# Route every row to the side of the split that its encounter id was assigned to.
in_train = df['encounterId'].isin(train_encounters)
in_test = df['encounterId'].isin(test_encounters)
train_df = df[in_train]
test_df = df[in_test]

In [None]:
# Row/column counts of the two partitions (train first, then test).
for partition in (train_df, test_df):
    print(partition.shape)

In [None]:
# Display the training partition for a visual check.
train_df

# TSFEL feature extraction and selection

In [None]:
cfg = tsfel.get_features_by_domain()  # Extract all features for every domain (statistical, temporal, spectral)

In [None]:
# Pass everything from the third column onwards to TSFEL with a window size of 48 rows.
# NOTE(review): presumably the first two columns are identifiers and every encounter
# contributes exactly 48 contiguous rows, so that one window equals one encounter — confirm.
train_signals = train_df.iloc[:, 2:]
train_features = tsfel.time_series_features_extractor(
    config=cfg,
    timeseries=train_signals,
    window_size=48,
)


In [None]:
# Shape of the extracted training feature table.
train_features.shape

In [None]:
# Same extraction as for the training partition: third column onwards, window size of 48 rows.
# NOTE(review): presumably relies on 48 contiguous rows per encounter — confirm.
test_signals = test_df.iloc[:, 2:]
test_features = tsfel.time_series_features_extractor(
    config=cfg,
    timeseries=test_signals,
    window_size=48,
)
test_features.shape

In [None]:
# Display the extracted test feature table.
test_features

In [None]:
# TSFEL correlation filter on the training features. With drop_correlated=True the call is
# unpacked into two values, named here as the correlated features and the reduced table.
# NOTE(review): confirm the return order against the installed TSFEL version.
corr_features, Dataset_trainv1 = tsfel.correlated_features(train_features, drop_correlated=True)

In [None]:
# Column names that remain after the correlation filter.
cols = Dataset_trainv1.columns

In [None]:
# Variance-based feature selector, constructed with its default threshold.
selector = VarianceThreshold()

In [None]:
# Fit the selector on the training feature table only.
selector.fit(Dataset_trainv1)

In [None]:
# Names of the columns the fitted selector keeps.
colsV2=selector.get_feature_names_out(cols)

In [None]:
# Discard the skewness and kurtosis features, recognised by their column-name suffix.
excluded_suffixes = ('_Skewness', '_Kurtosis')
colsV3 = [name for name in colsV2 if not name.endswith(excluded_suffixes)]

In [None]:
# Restrict both feature tables to the columns selected on the training data.
Dataset_trainv2 = Dataset_trainv1.loc[:, colsV3]
Dataset_testv2 = test_features.loc[:, colsV3]

In [None]:
# Standardise the features: the scaler is fitted on the training table only and then
# applied unchanged to both tables.
scaler = preprocessing.StandardScaler().fit(Dataset_trainv2)
X_train = scaler.transform(Dataset_trainv2)
X_test = scaler.transform(Dataset_testv2)

# Classification

In [None]:
# Parquet table "clean_static_encounters" for the configured version and dataset.
static_table_path = f'/data2/poette.m/dypo/{VERSION}/2.clean_data/{DATASET}/static/clean_static_encounters.parquet'
demo_df = pd.read_parquet(static_table_path)

In [None]:
# One outcome per encounter, indexed by encounter id. verify_integrity=True raises if an
# encounter id appears more than once in demo_df instead of silently duplicating labels.
deceased_by_encounter = demo_df.set_index('encounterId', verify_integrity=True)['isDeceased']

# Take the labels in the order in which encounters first appear in train_df / test_df.
# The feature rows are built from those frames, whereas filtering demo_df with isin()
# keeps demo_df's own row order and can pair a feature row with another encounter's outcome.
# .loc raises KeyError if an encounter has no entry in demo_df.
# NOTE(review): assumes each encounter occupies one contiguous 48-row window in
# train_df / test_df, so that window order equals first-appearance order — confirm.
y_train = deceased_by_encounter.loc[train_df['encounterId'].unique()].to_numpy()
y_test = deceased_by_encounter.loc[test_df['encounterId'].unique()].to_numpy()

In [None]:
# Every training feature row needs exactly one label. Assert rather than display the
# comparison, so that a mismatch stops a full run instead of showing an easily missed False.
assert X_train.shape[0] == y_train.shape[0], (
    f"X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]} labels"
)

In [None]:
# Every test feature row needs exactly one label. Assert rather than display the
# comparison, so that a mismatch stops a full run instead of showing an easily missed False.
assert X_test.shape[0] == y_test.shape[0], (
    f"X_test has {X_test.shape[0]} rows but y_test has {y_test.shape[0]} labels"
)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training set only (seeded); the test set is left as it is.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

In [None]:
# Display names for the two outcome classes (French: 'survie' = survival, 'deces' = death).
# NOTE(review): presumably ordered to match isDeceased = 0 then 1 — confirm.
labels = ['survie', 'deces']

In [None]:
# Share of positive labels in the original training set (sum of labels over their count).
y_train.sum()/len(y_train)

In [None]:
# Share of positive labels after resampling, for comparison with the original share.
y_train_resampled.sum()/len(y_train_resampled)

In [None]:
# Seed the forest so that the fitted model, and every metric derived from it, is the
# same on each run; an unseeded forest gives different scores each time.
classif = RandomForestClassifier(random_state=42)
# Train the classifier on the resampled training set
classif.fit(X_train_resampled, y_train_resampled)

# Predict the test set, which was not resampled
y_test_predict = classif.predict(X_test)

# Per-class report plus overall accuracy as a percentage
accuracy = accuracy_score(y_test, y_test_predict) * 100
print(classification_report(y_test, y_test_predict, target_names=labels))
print("Accuracy: " + str(accuracy) + "%")

In [None]:
# @title Confusion Matrix
# Rows are the true classes and columns the predicted classes, both named with `labels`.
cm = confusion_matrix(y_test, y_test_predict)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)
plt.figure()
ax = sns.heatmap(df_cm, cbar=False, cmap="BuGn", annot=True, fmt="d")
plt.setp(ax.get_xticklabels(), rotation=45)

plt.ylabel("True label", fontweight="bold", fontsize=18)
plt.xlabel("Predicted label", fontweight="bold", fontsize=18)
plt.show()

In [None]:
import shap

# Compute Shapley values for the fitted random forest on the test set.
# The model variable is `classif`; the previous name `classifier` was never defined.
explainer = shap.TreeExplainer(classif)
shap_values = explainer.shap_values(X_test)

# Depending on the SHAP version, a classifier yields either a list with one
# (n_samples, n_features) array per class or a single 3-D array with the class on the
# last axis. Normalise to the per-class list so later cells can iterate over classes.
if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
    shap_values = [shap_values[:, :, class_idx] for class_idx in range(shap_values.shape[2])]

In [None]:
# Global SHAP summary. `features` must be the matrix the SHAP values were computed on
# (X_test): passing X_train pairs each SHAP row with the feature values of an unrelated
# sample, which gives wrong colouring without raising any error.
plt_shap = shap.summary_plot(shap_values,  # SHAP values computed on the test set
                             features=X_test,  # same rows as the SHAP values
                             feature_names=colsV3,  # selected feature column names
                             show=False,  # keep the figure open so it can be saved
                             plot_size=(30,15))  # figure size
# Save the figure to the working directory
plt.savefig("global_shapHAR.png")

In [None]:

# Feature-contribution plots: one SHAP summary per class.
# SHAP returns either a list of per-class arrays or, in newer releases, one 3-D array
# with the class on the last axis. Enumerating that 3-D array directly would iterate
# over samples, so split it by class first; a 2-D array is treated as a single output.
if isinstance(shap_values, list):
    shap_values_per_class = shap_values
elif np.ndim(shap_values) == 3:
    shap_values_per_class = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
else:
    shap_values_per_class = [shap_values]

for i, shap_val in enumerate(shap_values_per_class):
    print(f"Résumé pour la classe {i}:")
    shap.summary_plot(shap_val, X_test, feature_names=colsV3)

# T-SNE

In [None]:
from openTSNE import TSNE


In [None]:
# openTSNE estimator: Euclidean metric, perplexity 500, 8 worker jobs, seeded, verbose logging.
tsne = TSNE(perplexity=500, metric="euclidean", n_jobs=8, random_state=42, verbose=True)

In [None]:
# Fit the embedding on the scaled training features; %time reports how long the fit takes.
%time embedding_train = tsne.fit(X_train)

In [None]:
# Shape of the fitted embedding.
embedding_train.shape

In [None]:
# Shape of the training labels, to compare with the embedding's row count.
y_train.shape

In [None]:
# Plot the two embedding components against each other, coloured by training label.
fig_2d, ax_2d = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=embedding_train[:, 0],
    y=embedding_train[:, 1],
    hue=y_train,
    edgecolor='k',
    alpha=0.7,
    ax=ax_2d,
)
ax_2d.set_title("t-SNE Visualization")
ax_2d.set_xlabel("t-SNE Component 1")
ax_2d.set_ylabel("t-SNE Component 2")
plt.show()


In [None]:
# Import under an alias and use a separate variable name: reusing `TSNE` / `tsne` would
# overwrite the openTSNE class and estimator defined earlier, so re-running those
# earlier cells afterwards would use scikit-learn's class without warning.
from sklearn.manifold import TSNE as SklearnTSNE
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older Matplotlib

# The iteration count is left at the library default (1000). The explicit `n_iter`
# argument was renamed `max_iter` in recent scikit-learn releases, so omitting it keeps
# the same behaviour on old and new versions.
tsne_3d = SklearnTSNE(n_components=3, random_state=42, perplexity=500)
X_tsne_3d = tsne_3d.fit_transform(X_train)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(
    X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2],
    c='blue', edgecolor='k', alpha=0.7
)

# Add the title and axis labels
ax.set_title("Visualisation t-SNE en 3D")
ax.set_xlabel("Composante t-SNE 1")
ax.set_ylabel("Composante t-SNE 2")
ax.set_zlabel("Composante t-SNE 3")

# Show the figure
plt.show()
