In [1]:
import pygraspi
import numpy as np
from pygraspi.combined_descriptors import make_descriptors, make_graphdescriptors
import pandas
import networkx as nx
import sknw
import glob
import os
import zipfile

import dask.array as da
from dask.distributed import Client
from dask_ml.preprocessing import MinMaxScaler
from dask_ml.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from toolz.curried import pipe, curry

from pymks import (
    solve_cahn_hilliard,
    plot_microstructures,
    graph_descriptors,
    GenericTransformer,
    GraphDescriptors
)

In [2]:
# Open the Cahn-Hilliard archive read-only. The handle is kept open at
# notebook scope because read_data() pulls individual members out of it.
zip_stream = zipfile.ZipFile('data/cahn-hilliard.zip', mode='r')

def read_data(file_name):
    """Load one space-delimited text member of ``zip_stream`` as a 2D array.

    Parameters
    ----------
    file_name : str
        Name of the archive member to read (as listed by
        ``zip_stream.namelist()``).

    Returns
    -------
    numpy.ndarray
        The table stored in the member, transposed so that the file's
        columns become the first axis of the returned array.
    """
    # Close the member handle deterministically instead of leaving it to the
    # garbage collector; read_csv consumes the whole stream before we leave
    # the ``with`` block.
    with zip_stream.open(file_name, 'r') as member:
        frame = pandas.read_csv(member, delimiter=' ', header=None)
    # ``DataFrame.swapaxes`` is deprecated in recent pandas; for a 2D frame
    # swapping axes 0 and 1 is exactly the transpose.
    return np.array(frame.T)

# Stack the archive members at positions 1 through 5 of the listing into a
# single array (one leading-axis entry per member). Position 0 is skipped --
# presumably a directory entry; TODO confirm against the archive contents.
data = np.array([read_data(member) for member in zip_stream.namelist()[1:6]])

In [3]:
# Put the two pygraspi descriptor tables side by side (column-wise),
# keeping only the row labels that appear in both.
result = pandas.concat(
    [
        make_descriptors(data),
        make_graphdescriptors(data),
    ],
    join='inner',
    axis=1,
)

In [4]:
# Display the combined descriptor table (one row per loaded microstructure).
result

Unnamed: 0,branch_length_a,branch_length_b,dist_to_interface_avg_a,dist_to_interface_avg_b,dist_to_interface_max_a,dist_to_interface_max_b,dist_to_interface_min_a,dist_to_interface_min_b,f_skeletal_pixels_a,f_skeletal_pixels_b,...,distance_to_interface,distance_to_interface_0,distance_to_interface_1,interfacial_area,phase_0_cc,phase_0_count,phase_0_interface,phase_1_cc,phase_1_count,phase_1_interface
0,46.08,33.81,9.63,10.36,16.124515,20.808652,1.0,1.0,0.03,0.02,...,4.760697,4.790135,4.73379,5082,16,19341,2518,5,21160,2564
1,42.78,32.97,10.72,11.79,17.691806,21.587033,4.0,1.0,0.03,0.02,...,5.261401,5.29517,5.230863,4508,15,19233,2230,4,21268,2278
2,41.83,34.24,11.29,11.9,18.867962,20.880613,6.0,1.0,0.02,0.02,...,5.42794,5.461208,5.397553,4340,14,19334,2150,5,21167,2190
3,48.43,37.25,11.4,12.66,18.973666,21.0,1.0,1.0,0.03,0.01,...,5.624182,5.667133,5.585109,4166,14,19293,2062,4,21208,2104
4,54.41,55.27,12.43,13.88,23.430749,22.0,1.0,8.0,0.02,0.01,...,6.05632,6.093471,6.022356,3836,12,19343,1901,4,21158,1935


In [5]:
def generate_data(n_category, n_chunks, n_domain, seed=99):
    """Build a two-class set of binary microstructures and their labels.

    Uniform random fields are rescaled from [0, 1) to [-1, 1), evolved with
    ``solve_cahn_hilliard`` (``delta_t=1.0``, ``delta_x=0.5``) and then
    thresholded at zero. The first ``n_category`` samples are evolved for
    10 steps (label 0) and the remaining ``n_category`` for 100 steps
    (label 1).

    Parameters
    ----------
    n_category : int
        Number of samples generated for each of the two classes.
    n_chunks : int
        Chunk length along the sample axis of the dask arrays. Despite the
        name, this is the size of each chunk, not the number of chunks.
    n_domain : int
        Side length of each square sample.
    seed : int, optional
        Seed passed to ``da.random.seed`` before drawing the random fields.

    Returns
    -------
    tuple
        ``(x_data, y_data)``: a persisted dask array of 0/1 values with
        shape ``(2 * n_category, n_domain, n_domain)`` and a dask array of
        integer labels with shape ``(2 * n_category,)``.
    """
    da.random.seed(seed)
    evolve = curry(solve_cahn_hilliard)(delta_t=1.0, delta_x=0.5)

    # Random initial condition in [-1, 1), chunked along the sample axis.
    noise = da.random.random(
        (n_category * 2, n_domain, n_domain),
        chunks=(n_chunks, n_domain, n_domain),
    )
    initial = 2 * noise - 1

    # Short evolution for the first class, long evolution for the second.
    evolved = da.concatenate([
        evolve(initial[:n_category], n_steps=10),
        evolve(initial[n_category:], n_steps=100),
    ])

    # Binarize by sign and materialize the result on the workers.
    x_data = da.where(evolved > 0, 1, 0).persist()

    labels = np.concatenate(
        [np.zeros(n_category), np.ones(n_category)]
    ).astype(int)
    y_data = da.from_array(labels, chunks=(n_chunks,))
    return x_data, y_data

In [6]:
# Dataset size: 96 samples per class (192 in total), 24 samples per dask
# chunk, each sample a 101 x 101 grid.
n_category, n_chunks, n_domain = 96, 24, 101

x_data, y_data = generate_data(
    n_category=n_category,
    n_chunks=n_chunks,
    n_domain=n_domain,
)

In [7]:
# Display the pygraspi descriptors computed for the generated samples.
# NOTE(review): the result is only shown here, not stored or used by the
# classifier cells below.
make_descriptors(x_data)

Unnamed: 0,branch_length_a,branch_length_b,dist_to_interface_avg_a,dist_to_interface_avg_b,dist_to_interface_max_a,dist_to_interface_max_b,dist_to_interface_min_a,dist_to_interface_min_b,f_skeletal_pixels_a,f_skeletal_pixels_b,number_of_branches_a,number_of_branches_b,number_of_ends_a,number_of_ends_b,number_of_intersections_a,number_of_intersections_b
0,12.81,16.22,4.09,4.14,8.485281,9.433981,1.0,1.0,0.07,0.06,62.0,40.0,34,34,26,14
1,15.53,18.43,4.17,4.68,8.000000,17.888544,1.0,1.0,0.06,0.07,41.0,42.0,33,27,13,17
2,15.63,17.95,4.16,4.48,7.810250,11.661904,1.0,1.0,0.06,0.07,44.0,43.0,33,30,17,18
3,13.42,16.09,4.08,4.22,11.313708,9.000000,1.0,1.0,0.08,0.05,65.0,40.0,35,40,27,10
4,13.00,13.48,4.10,4.35,8.944272,9.433981,1.0,1.0,0.06,0.06,56.0,55.0,39,39,22,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,24.75,26.36,6.87,6.15,11.180340,13.341664,1.0,1.0,0.03,0.05,14.0,22.0,19,14,3,10
188,54.66,36.32,6.65,6.62,13.000000,15.000000,1.0,1.0,0.04,0.05,8.0,14.0,10,13,2,5
189,28.36,24.15,6.32,6.93,12.041595,15.524175,1.0,1.0,0.04,0.04,15.0,13.0,11,19,5,1
190,26.23,25.41,6.57,6.61,10.770330,14.142136,1.0,1.0,0.04,0.04,17.0,14.0,16,15,6,3


In [8]:
# Split into 80% train / 20% test with a fixed seed.
# NOTE: the features are the raw pixel values (each sample flattened to one
# row), not the concatenated make_descriptors / make_graphdescriptors table.
x_train, x_test, y_train, y_test = train_test_split(
    x_data.reshape((x_data.shape[0], -1)),
    y_data,
    random_state=99,
    test_size=0.2,
)

In [9]:
# Learn the per-feature min/max on the training split only, then rescale
# the training features with it.
scaler = MinMaxScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [10]:
# NBVAL_IGNORE_OUTPUT
# Fit a dask-ml logistic regression (default settings) on the scaled
# training features.
# NOTE(review): the recorded output of this cell ends with
# `return np.exp(A)`, which looks like the tail of a runtime warning emitted
# during fitting -- confirm and investigate.
clf = LogisticRegression().fit(x_train_scaled, y_train)

  return np.exp(A)


In [11]:
# Rescale the held-out features with the scaler fitted on the training
# split (no refit), then predict their class labels.
x_test_scaled = scaler.transform(x_test)
y_predict = clf.predict(x_test_scaled)

In [12]:
# Materialize the dask label arrays and tabulate true vs. predicted labels.
# NOTE(review): the recorded output has 10 + 9 = 19 on the diagonal out of
# 40 test samples, i.e. roughly chance level for two balanced classes --
# the raw-pixel features used for this model deserve a second look.
confusion_matrix(y_test.compute(), y_predict.compute())

array([[10, 10],
       [11,  9]])