In [None]:
import pandas as pd
from clustering.objects.finder import Explorer, RequestReport
from clustering.objects.structures import Bundle

# Clustering dependency

This notebook demonstrates some of the main features of the clustering dependency. It gives a new user a first look at the tools the dependency provides.

Check the code on GitHub, or use Python's built-in functions `dir` and `help` to discover and read about the other methods not listed here.

In [None]:
# Notebook-wide configuration.
# HD5_PATH is machine-specific: change it to the directory that holds your HD5 files.
HD5_PATH = "/media/mad3/hd5"  # path where the hd5 files are stored
# Widen the column display so long cell contents are not truncated.
# The fully qualified option name is used because pandas resolves abbreviated
# names by pattern matching, which stops working if another option with a
# similar name is ever added.
pd.set_option("display.max_colwidth", 3000)

In [None]:
# Build the Explorer on the configured HD5 directory; the statistics, search,
# cohort and extraction calls in the rest of this notebook all go through it.
explorer = Explorer(HD5_PATH)

### 1. Getting statistics about all the HD5 files

Create a table with statistics about the HD5 files stored in a path. 
You can group them by the source they contain, see how many patients have been in BLK08, or see how many files are not readable (corrupted).

In [None]:
# Gather the statistics about the HD5 files; the next cells display
# different summaries taken from this object.
quality = explorer.get_quality()

In [None]:
# Summary by source — presumably the files grouped by the source they
# contain, as the section intro describes; confirm against the API.
quality.by_sources()

In [None]:
# Summary by BLK08 — presumably how many patients have been in BLK08,
# as the section intro describes; confirm against the API.
quality.by_blk08()

### 2. Finding signals by keyword

If you have the name of a medicine and want to get a list of all the signals containing that name, just run:

In [None]:
# Search for signal names containing the partial keyword "norepine".
# NOTE(review): the second argument ("med") presumably restricts the search
# to medicine signals — confirm against guess_signal_name's documentation.
norepine = explorer.guess_signal_name("norepine", "med")

In [None]:
# Tabulate the matches, one row per signal name with its number of occurrences.
# Assumes most_common() returns (name, count) pairs ordered by count, like
# collections.Counter.most_common() — TODO confirm the returned type.
pd.DataFrame(norepine.most_common(), columns=["medicine", "occurrences"])

### 3. Selecting a cohort

Get statistics for each file about the desired signals and select a cohort of patients.

In [None]:
# Signals every file is checked for.
signals = ["hr", "art1m", "pa2m"]
# Build the per-file report for those signals.
# NOTE(review): the meaning of cell="max_unfilled" and the unit of
# stay_length=12 are not visible from this notebook — check Explorer.find.
report = explorer.find(signals, cell="max_unfilled", stay_length=12)

In [None]:
# Preview the first 10 rows of the report table.
report.df.head(10)  # Output from this cell has been removed to keep MRNs private

You can now get a cohort of patients based on some conditions:

In [None]:
# Cohort selection: keep only the report rows that satisfy every condition
# below. Thresholds are in the units named in each column header.
df = report.df
alt_df = df[
    (df["Overlap"])                              # the row is flagged as having an overlap
    & (df["Overlap length (h)"] > 10)            # overlap longer than 10
    & (df["Max unfilled time (s)"] < 3600)       # longest unfilled stretch under 3600
    & (df["Max non-monotonicities (#)"] < 4)     # fewer than 4 non-monotonicities
]
# Display (rows, columns) of the selected cohort.
alt_df.shape

In [None]:
# Replace the report's table with the filtered cohort. This overwrites
# report.df, so the unfiltered table is no longer reachable through `report`
# after this cell runs.
report.df = alt_df
# Persist the cohort; the extraction section reloads it from this file.
report.to_csv("cohort_stats.csv")

### 4. Extracting signals

Extract all the signals from a cohort created as in section 3 and store them in a file.

In [None]:
# Reload the cohort report written by the cohort-selection step.
report = RequestReport.from_csv("cohort_stats.csv")
# Extract the data for the cohort described by the report.
bundle = explorer.extract_data(report)
# Save the bundle to disk; the curation section loads it back from this file.
bundle.store("dummy_cohort.bundle")

### 5. Curating data

Apply data curation to the cohort.

In [None]:
# Load the bundle stored by the extraction step.
# NOTE(review): from_pickle presumably unpickles the file — only load bundle
# files from a trusted source, since unpickling can execute arbitrary code.
bundle = Bundle.from_pickle("dummy_cohort.bundle")

In [None]:
# Ask remove_outliers to list its available methods.
bundle.remove_outliers(list_methods=True)   # The same can be done with pad, downsample and normalize to list the available methods

In [None]:
# Curation pipeline, applied in this order: outlier removal, padding,
# downsampling, normalization. The return values are not used, so the
# bundle is presumably modified in place — re-running this cell would then
# curate already-curated data; reload the bundle first if you need to re-run.

# 1. Outlier removal.
bundle.remove_outliers(
    method="remove_isolated",
    find_method="zscore",
    z_threshold=3,
    filter_method="global_predefined_limits",
    jump_pc=0.01,
    outliers_pc_th=20,
)

# 2. Padding, using the "mean" filling.
bundle.pad(filling="mean")

# 3. Downsampling by linear interpolation (unit of new_rate: see Bundle.downsample).
bundle.downsample(
    method="linear_interpolation",
    new_rate=600,
)

# 4. Normalization with the "min_max_values" method.
bundle.normalize(method="min_max_values")

In [None]:
# Save the curated bundle; the clustering section reloads it from this file.
bundle.store("dummy_cohort_curated.bundle")

You can also plot signals from all patients, time-aligned to their entry into BLK08:

In [None]:
# Plot the "hr" signal; with no patient list given, all patients are included.
bundle.plot_signal("hr")

Or from a subset of patients:

In [None]:
# Fetch the bundle's patient list.
patients = bundle.patient_list()
# Plot "hr" for the first three entries of that list only.
bundle.plot_signal("hr", patients=patients[:3])

### 6. Clustering

Cluster the cohort and get statistics from the clusters.

In [None]:
# Reload the curated bundle saved at the end of the curation section, so this
# section can be run without repeating the curation.
# NOTE(review): from_pickle presumably unpickles the file — only load bundle
# files from a trusted source.
bundle = Bundle.from_pickle("dummy_cohort_curated.bundle")

In [None]:
# Build the feature matrix from the bundle.
features = bundle.feature_matrix(
    method="concatenation",
    order="xxyy",
)

# Cluster the cohort into 3 clusters with k-means on those features.
# NOTE(review): "euclidian" is passed exactly as written; confirm this is the
# spelling the API expects.
# NOTE(review): no random seed is passed here, so if this k-means is
# randomly initialised the cluster assignment may differ between runs.
cluster_results = bundle.cluster(
    method="kmeans",
    distances=features,
    distance_algo="euclidian",
    cluster_algo="full",
    n_clusters=3,
)

In [None]:
# Compute statistics for the clustering result obtained above.
cluster_stats = bundle.cluster_stats(cluster_results)
# Last expression of the cell: the summary is shown as the cell output.
cluster_stats.get_summary()

In [None]:
# Plot the distribution of "BLK08 stay (d)" from the cluster statistics
# (presumably one distribution per cluster — confirm against the API).
cluster_stats.plot_distribution("BLK08 stay (d)")