# Tutorial for *ATOL: Automatic Topologically-Oriented Learning*

__Author:__ Martin Royer

## Outline:
In this notebook, we:
- select a graph dataset that exists in the /perslay/ submodule
- generate the associated persistence diagrams
- show an example of centers and ATOL-features
- run a ten-fold classification experiment solely based on the resulting ATOL graph features

### Select problem and budget

In [1]:
import os
import numpy as np
from itertools import product

# Name of the graph dataset to process; it has to exist in the perslay data folder.
graph_problem = "BZR"
# Dataset directory inside an unmodified perslay checkout. The trailing slash matters:
# later cells build file paths by plain string concatenation on this value.
graph_folder = "".join(["../../perslay/tutorial/data/", graph_problem, "/"])

# ATOL budget: the filtrations to use and the number of centers computed for each of them.
atol_params = dict(
    filtrations=["0.1-hks", "10.0-hks"],
    n_centers=10,
)

### Compute HKS-extended persistence diagrams for this problem

In [2]:
from atol import compute_tda_for_graphs
# Announce the stage, then let atol compute the persistence diagrams for the
# selected dataset (presumably written under graph_folder, where later cells read them).
print("- [%s] TDA computation" % (graph_problem,))
compute_tda_for_graphs(atol_params, graph_folder)

- [BZR] TDA computation
Pad size for eigenvalues in this dataset is: 57


### Compute centers and features, plot

In [7]:
from atol.atol import _graph_dtypes, _csv_toarray

In [None]:
from atol import centers_and_inertias, atol_feats_graphs
import pandas as pd
import seaborn as sns
# Apply seaborn's default theme. The call parentheses are required: a bare `sns.set`
# merely references the function and leaves the plotting style untouched.
sns.set()

# The number of entries in the dataset's mat/ folder is taken as the number of graphs.
num_elements = len(os.listdir(graph_folder + "mat/"))

# Load every diagram into memory once, keyed by (diagram type, filtration, graph id).
all_diags = {}
for dtype, gid, filt in product(_graph_dtypes, np.arange(num_elements), atol_params["filtrations"]):
    all_diags[(dtype, filt, gid)] = _csv_toarray(graph_folder + "diagrams/%s/graph_%06i_filt_%s.csv" % (dtype, gid, filt))

# For each (diagram type, filtration) pair, pool the diagrams of all graphs and
# compute the requested number of centers on the pooled points.
centers = {
    (dtype, filt): centers_and_inertias(
        diags=np.concatenate([all_diags[(dtype, filt, gid)] for gid in np.arange(num_elements)]),
        n_centers=atol_params["n_centers"],
    )
    for dtype, filt in product(_graph_dtypes, atol_params["filtrations"])
}

# Flatten the centers into one record per center so they can be plotted with seaborn.
# The records are collected in a plain list and turned into a DataFrame once.
center_records = []
for dtype, filt in product(_graph_dtypes, atol_params["filtrations"]):
    # The first element returned by centers_and_inertias exposes `cluster_centers_`;
    # its first two columns are used as the x / y coordinates of each center.
    clustercenters = centers[dtype, filt][0].cluster_centers_
    center_records.extend(
        {"center": center_id, "x": clustercenters[center_id, 0], "y": clustercenters[center_id, 1], "dtype": dtype, "filt": filt}
        for center_id in range(clustercenters.shape[0])
    )
centers_df = pd.DataFrame(center_records)

# One panel per (diagram type, filtration); points are coloured by center index.
g = sns.relplot(x="x", y="y", hue="center", row="dtype", col="filt", data=centers_df)

In [None]:
# Compute the ATOL features for the whole dataset and wrap them in a DataFrame
# with the five column names that the plotting code relies on.
feats = pd.DataFrame(
    data=atol_feats_graphs(graph_folder, all_diags, atol_params, centers),
    columns=["index", "type", "center", "value", "label"],
)

import seaborn as sns
sns.set()  # apply seaborn's default theme before drawing the feature plot

from sklearn.preprocessing import MinMaxScaler
def renormalize(df):
    """Return a copy of ``df`` whose "value" column is min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric "value" column (in this notebook: one
        ``groupby`` group of ``feats``).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` except for the "value" column, which is
        rescaled with a default ``MinMaxScaler``.

    Notes
    -----
    The input frame is deliberately left untouched: pandas documents
    functions that mutate the object handed over by ``groupby(...).apply``
    as unsupported, so the scaled column is attached to a copy instead.
    """
    # MinMaxScaler expects a 2-D (n_samples, 1) array; flatten the result back
    # to one value per row before storing it as a column.
    scaled = MinMaxScaler().fit_transform(df["value"].values.reshape(-1, 1))
    return df.assign(value=scaled.ravel())

# Line plot of the feature values against the graph index: one panel per feature
# type (two panels per row), one line per center. Values are rescaled separately
# within each type by applying renormalize to every "type" group.
sns.relplot(
    data=feats.groupby(["type"]).apply(renormalize),
    x="index",
    y="value",
    hue="center",
    col="type",
    col_wrap=2,
    kind="line",
    legend="full",
)

### Ten-fold classification evaluation

In [16]:
from atol import graph_tenfold
# Run the cross-validated classification on the ATOL graph features and report:
#   - the mean and standard deviation of the per-fold scores,
#   - the mean featurisation time, multiplied by 1000 for display in ms
#     (so feature_times is presumably expressed in seconds).
print("- [%s] RF classification" % graph_problem)
vfold_scores, feature_times = graph_tenfold(graph_folder, atol_params)
print("- Crossval ended with avg %.4f, sd %.4f" % (np.mean(vfold_scores), np.std(vfold_scores)))
print("- Featurisation took %.3f ms" % (np.mean(feature_times) * 1000.0))
print("- [%s] end\n" % graph_problem)

- [BZR] RF classification
-- Fold 1




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.96
  (Test) balanced_accuracy_score: 0.77
-- Fold 2




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.95
  (Test) balanced_accuracy_score: 0.71
-- Fold 3




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.97
  (Test) balanced_accuracy_score: 0.77
-- Fold 4




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.97
  (Test) balanced_accuracy_score: 0.83
-- Fold 5




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.95
  (Test) balanced_accuracy_score: 0.66
-- Fold 6




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.93
  (Test) balanced_accuracy_score: 0.68
-- Fold 7




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.97
  (Test) balanced_accuracy_score: 0.65
-- Fold 8




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.96
  (Test) balanced_accuracy_score: 0.54
-- Fold 9




 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.94
  (Test) balanced_accuracy_score: 0.76
-- Fold 10
 Descriptors have size: (array([80]), array([365], dtype=int64))
  (train) balanced_accuracy_score: 0.96
  (Test) balanced_accuracy_score: 1.00
- Crossval ended with avg 0.7351, sd 0.1171
- Featurisation took 2752.100 ms
- [BZR] end



