# Norms

In [1]:
import numpy as np


def no_norm(matrix):
    """Identity normalization.

    Returns the very same object that was passed in; no copy is made and
    nothing is rescaled.
    """
    return matrix

def max_norm(matrix):
    """Scale ``matrix`` by its largest entry.

    Every element is divided by ``np.max(matrix)``. The division builds a
    new array, so the input is left untouched. No guard against a zero
    maximum is applied.
    """
    largest = np.max(matrix)
    return matrix / largest

def binar_norm(matrix):
    """Binarize a copy of ``matrix``.

    Every strictly positive entry of the copy is set to 1; zero and negative
    entries are kept as they are. The input itself is not modified.
    """
    result = matrix.copy()
    positive = result > 0
    result[positive] = 1
    return result

def mean_norm(matrix):
    """Scale ``matrix`` by the mean of all its entries.

    Every element is divided by ``np.mean(matrix)``. The division builds a
    new array, so the input is left untouched. No guard against a zero mean
    is applied.
    """
    average = np.mean(matrix)
    return matrix / average

def double_norm(function, matrix1, matrix2):
    """Apply ``function`` to two matrices and return both results.

    ``function`` is called on ``matrix1`` first, then on ``matrix2``; the
    results come back as a 2-tuple in that order.
    """
    first = function(matrix1)
    second = function(matrix2)
    return first, second

# Transformers

In [2]:
import numpy as np
import os
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from scipy.spatial.distance import *



#     y = DataTransformer( target ).fit_transform( pairs_data )['pairs_data'].are_same
#     X = DataTransformer( target ).fit_transform( pairs_data )

def generate_even_sample(data, n = 1000, seed = 0):
    """Draw a class-balanced sample of ``2 * n`` rows from ``data``.

    ``n`` rows are sampled from the rows with ``are_same == 1`` and ``n``
    from the rows with ``are_same == 0``, both with ``random_state=seed``.
    The two samples are stacked row-wise, positives first.
    """
    positives = data[data.are_same == 1]
    negatives = data[data.are_same == 0]
    halves = [
        subset.sample(n=n, random_state=seed)
        for subset in (positives, negatives)
    ]
    return pd.concat(halves, axis=0)

def load_data(target, data_dir='data/adni/matrices/', pairs_data_dir='data/adni_pairs_data_with_dx_group_without_isolated_nodes.csv'):
    """Load the pairs table and the per-subject matrices for one target group.

    Parameters
    ----------
    target : str
        Value matched against both ``subject1_target`` and
        ``subject2_target``; the special value ``'All'`` disables filtering.
    data_dir : str
        Directory holding one sub-directory per subject id. A trailing slash
        is optional.
    pairs_data_dir : str
        Path of the CSV file with the pairs table (despite the name, this is
        a file path: it is passed straight to ``pd.read_csv``).

    Returns
    -------
    dict
        ``'pairs_data'``: class-balanced sample of the (filtered) pairs,
        ``'matrices'``: ``{subject_id: 2-D array}``,
        ``'y'``: the ``are_same`` column of the balanced sample.

    Notes
    -----
    For every subject, files whose name contains ``'NORM'`` are skipped. If
    several other files exist, the one listed last by ``os.listdir`` wins.
    Rows and columns 3 and 38 are dropped from each matrix and its diagonal
    is zeroed.
    """
    pairs_data = pd.read_csv(pairs_data_dir, index_col = None)
    if target != 'All':
        same_target = (pairs_data.subject1_target == target) & (pairs_data.subject2_target == target)
        pairs_data = pairs_data[same_target]

    matrices = {}
    file_ids = np.unique(pairs_data[['subject1_id', 'subject2_id']])
    for file_id in file_ids:
        # os.path.join keeps the path valid whether or not data_dir ends
        # with a path separator.
        subject_dir = os.path.join(data_dir, file_id)
        for file in os.listdir(subject_dir):
            if 'NORM' in file:
                continue
            matrix = np.loadtxt(os.path.join(subject_dir, file))
            # NOTE(review): indices 3 and 38 are hard-coded; presumably they
            # are nodes excluded from the analysis -- confirm.
            matrix = np.delete(matrix, [3,38], axis = 1)
            matrix = np.delete(matrix, [3,38], axis = 0)
            np.fill_diagonal(matrix, 0)
            matrices[file_id] = matrix

    # Balance the classes. Cap the per-class size by the smaller class so the
    # sampling without replacement cannot ask for more rows than exist.
    n_same = int(pairs_data.are_same.sum())
    n_different = int((pairs_data.are_same == 0).sum())
    pairs_data = generate_even_sample(pairs_data, n = min(n_same, n_different))
    return {
        'pairs_data': pairs_data,
        'matrices': matrices,
        'y': pairs_data.are_same
    }

class MatrixNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer applying one normalization to every matrix.

    Parameters
    ----------
    norm : callable
        Called with a single matrix; its return value replaces that matrix.
    copy : bool, default True
        When true, ``transform`` returns a shallow copy of the input dict and
        leaves the caller's dict untouched. When false, the input dict is
        updated in place and returned.
    """

    def __init__(self, norm, copy=True):
        self.norm    = norm
        self.copy    = copy

    def fit(self, data, y=None, **fit_params):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, data):
        """Return ``data`` with ``data['matrices']`` normalized.

        ``data`` is a dict whose ``'matrices'`` entry maps keys to matrices.
        All other entries are passed through unchanged.
        """
        matrices_transformed = {
            key: self.norm(matrix)
            for key, matrix in data['matrices'].items()
        }

        # Honor the copy flag: a shallow copy is enough because only the
        # 'matrices' entry is rebound, never modified in place.
        result = dict(data) if self.copy else data
        result['matrices'] = matrices_transformed

        return result

class MatrixFeaturizer(BaseEstimator, TransformerMixin):
    """Stateless transformer turning every matrix into one feature vector.

    Parameters
    ----------
    features : sequence of callables
        Each is called with a single matrix. With one callable its result is
        stored as is; with several, the results are flattened and
        concatenated in order (``np.append`` semantics).
    copy : bool, default True
        When true, ``transform`` returns a shallow copy of the input dict and
        leaves the caller's dict untouched. When false, the input dict is
        updated in place and returned.
    """

    def __init__(self, features, copy=True):
        self.features = features
        self.copy = copy

    def fit(self, data, y=None, **fit_params):
        """Nothing to learn; returns ``self``."""
        return self

    def _featurize(self, matrix):
        """Build the feature vector of a single matrix."""
        vector = self.features[0](matrix)
        for feature_func in self.features[1:]:
            vector = np.append(vector, feature_func(matrix))
        return vector

    def transform(self, data):
        """Return ``data`` with a ``'features'`` entry added.

        ``data['features']`` maps every key of ``data['matrices']`` to the
        feature vector of the corresponding matrix.
        """
        cur_features = {
            key: self._featurize(matrix)
            for key, matrix in data['matrices'].items()
        }

        # Honor the copy flag: a shallow copy is enough because only the
        # 'features' entry is (re)bound.
        result = dict(data) if self.copy else data
        result['features'] = cur_features

        return result

def gen_dist(p):
    """Return a two-argument Minkowski distance of order ``p``.

    The returned callable flattens both arguments with ``reshape(-1)``
    before handing them to ``minkowski``.
    """
    return lambda x, y: minkowski(x.reshape(-1), y.reshape(-1), p)


# Pairwise distances, in order: Chebyshev, Minkowski p=1, Minkowski p=2.
func_list = [chebyshev, gen_dist(1), gen_dist(2)]

class VectorFeaturizer(BaseEstimator, TransformerMixin):
    """Stateless transformer computing pairwise features for subject pairs.

    Parameters
    ----------
    func_list : sequence of callables
        Each is called as ``func(vector1, vector2)`` and contributes one
        column to the output.
    """

    def __init__(self, func_list):
        self.func_list = func_list

    def fit(self, data, y=None, **fit_params):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, data):
        """Return ``(X, y)`` for the pairs listed in ``data['pairs_data']``.

        For every row of ``data['pairs_data']`` the feature vectors of
        ``subject1_id`` and ``subject2_id`` are looked up in
        ``data['features']`` and passed to each function of ``func_list``.
        ``X`` has one row per pair and one column per function; ``y`` is
        ``data['y']`` unchanged.
        """
        subject_features = data['features']
        pairs = data['pairs_data']

        # Walk the two id columns positionally. Looking rows up by index
        # label would yield a Series instead of a single vector whenever the
        # pairs table carries duplicate index labels.
        features = [
            [
                function(subject_features[id1], subject_features[id2])
                for function in self.func_list
            ]
            for id1, id2 in zip(pairs.subject1_id, pairs.subject2_id)
        ]

        return np.array(features), data['y']

# Featurizers

In [3]:
import numpy as np
import os
import sys
import pandas as pd
from scipy.spatial.distance import *
from scipy.sparse.csgraph import dijkstra, shortest_path, connected_components, laplacian

from sklearn.base import  BaseEstimator, TransformerMixin
from copy import deepcopy

from sklearn.model_selection import StratifiedKFold, cross_val_score, StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import networkx as nx
import igraph as ig
import scipy
import time


def bag_of_edges(X, SPL=None, symmetric = True, return_df = False, offset = 1):
    """Flatten adjacency matrices into vectors of edge weights.

    Parameters
    ----------
    X : array
        A single matrix of shape ``(size, size)`` or a stack of shape
        ``(number_of_matrices, size, size)``.
    SPL : unused
        Accepted for signature compatibility only; never read.
    symmetric : bool
        When true, only the upper triangle starting at diagonal ``offset``
        is kept. When false, every entry is kept in row-major order.
    return_df : bool
        When true, the result is a ``pandas.DataFrame`` with one row per
        matrix and columns named ``edge_<i>_<j>``.
    offset : int
        Diagonal offset handed to ``np.triu_indices`` (1 skips the main
        diagonal).

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        1-D for a single matrix, 2-D for a stack (always 2-D as DataFrame).

    Raises
    ------
    ValueError
        If ``X`` is neither 2-D nor 3-D.
    """
    n_dims = len(X.shape)
    # Validate the rank first so a malformed input produces the intended
    # message rather than an IndexError from the shape lookup below.
    if n_dims not in (2, 3):
        raise ValueError('Provide array of valid shape: (number_of_matrices, size, size).')

    size = X.shape[1]
    if symmetric:
        rows, cols = np.triu_indices(size, k = offset)
    else:
        # Use the trailing two axes so the grid is 2-D for a single matrix
        # as well as for a stack.
        grid = np.indices(X.shape[-2:])
        rows, cols = grid[0].reshape(-1), grid[1].reshape(-1)

    if n_dims == 3:
        featurized_X = X[:, rows, cols]
    else:
        featurized_X = X[rows, cols]

    if return_df:
        col_names = ['edge_' + str(i) + '_' + str(j) for i,j in zip(rows, cols)]
        values = featurized_X
        if n_dims == 2:
            # A single matrix becomes one DataFrame row.
            values = featurized_X.reshape(1, -1)
        featurized_X = pd.DataFrame(values, columns=col_names)
    return featurized_X

def degrees(X, return_df = False):
    """Sum each matrix along axis 1.

    ``X`` is either a single 2-D matrix or a 3-D stack of matrices. The
    result is ``np.sum(X, axis=1)``; with ``return_df`` it is wrapped in a
    ``pandas.DataFrame`` with one row per matrix and columns named
    ``degree_<i>``. Any other rank raises ``ValueError``.
    """
    n_dims = len(X.shape)
    if n_dims not in (2, 3):
        raise ValueError('Provide array of valid shape: (number_of_matrices, size, size). ')

    featurized_X = np.sum(X, axis=1)
    if not return_df:
        return featurized_X

    # A single matrix is presented as one DataFrame row.
    n_rows = X.shape[0] if n_dims == 3 else 1
    col_names = ['degree_' + str(i) for i in range(X.shape[1])]
    return pd.DataFrame(featurized_X.reshape((n_rows, X.shape[1])), columns=col_names)

def closeness_centrality(X):
    """Closeness centrality of every node of a weighted graph.

    Edge lengths are the element-wise reciprocals of ``X``. Undirected
    shortest-path lengths are computed with Dijkstra's algorithm, and each
    node's score is ``(n_nodes - 1)`` divided by the sum of its row of
    shortest-path lengths.
    """
    inverse_weights = 1./X
    distances = scipy.sparse.csgraph.dijkstra(
        inverse_weights, directed=False, unweighted=False)
    others = float(X.shape[0] - 1)
    return others/np.sum(distances, 1)

def betweenness_centrality(X):
    """Scaled weighted betweenness of every node.

    An undirected igraph graph is built from the element-wise reciprocals of
    ``X`` (stored as the ``weight`` edge attribute, loops disabled). The
    betweenness values are multiplied by 2 and divided by
    ``(n_nodes - 1) * (n_nodes - 2)``.
    """
    node_count = X.shape[0]
    inverse_weights = 1./X
    graph = ig.Graph.Weighted_Adjacency(
        list(inverse_weights), mode="UNDIRECTED", attr="weight", loops=False)
    raw_scores = np.array(graph.betweenness(weights='weight', directed=False))
    pair_count = (node_count-1)*(node_count-2)
    return raw_scores*2./pair_count

def eigenvector_centrality(X):
    """Weighted eigenvector centrality of every node, as a numpy array.

    ``X`` is loaded into an undirected igraph graph with its entries stored
    as the ``weight`` edge attribute (loops disabled).
    """
    graph = ig.Graph.Weighted_Adjacency(
        list(X), mode="UNDIRECTED", attr="weight", loops=False)
    scores = graph.eigenvector_centrality(weights='weight', directed=False)
    return np.array(scores)

def pagerank(X):
    """Weighted PageRank of every node, as a numpy array.

    ``X`` is loaded into a directed igraph graph with its entries stored as
    the ``weight`` edge attribute (loops disabled).
    """
    directed_graph = ig.Graph.Weighted_Adjacency(
        list(X), mode="DIRECTED", attr="weight", loops=False)
    scores = directed_graph.pagerank(weights="weight")
    return np.array(scores)

def efficiency(X):
    """Per-node mean of inverse shortest-path lengths.

    Edge lengths are the element-wise reciprocals of ``X``; undirected
    shortest-path lengths are computed with Dijkstra's algorithm. Infinite
    inverse lengths (zero-length paths, e.g. a node to itself) are excluded
    from the row mean.
    """
    distances = scipy.sparse.csgraph.dijkstra(
        1./X, directed=False, unweighted=False)
    inverse_distances = 1./distances
    # Mask infinities as NaN so that nanmean skips them.
    finite_only = np.where(np.isinf(inverse_distances), np.nan, inverse_distances)
    return np.nanmean(finite_only, 1)

def clustering_coefficient(X):
    """Weighted clustering coefficient of every node, as a numpy array.

    ``X`` is loaded into a networkx graph and ``nx.clustering`` is evaluated
    with the ``weight`` edge attribute. Values come back in the iteration
    order of the mapping returned by networkx.
    """
    # from_numpy_matrix was removed in networkx 3.0; prefer its replacement
    # and fall back only on installations that predate from_numpy_array.
    build_graph = getattr(nx, 'from_numpy_array', None)
    if build_graph is None:
        build_graph = nx.from_numpy_matrix
    graph = build_graph(X)
    coefficients = nx.clustering(graph, weight='weight')
    return np.array(list(coefficients.values()))

def triangles(X):
    """Per-node value ``clustering * degree * (degree - 1) / 2``.

    ``clustering`` comes from ``clustering_coefficient(X)``; ``degree`` is
    the igraph node degree of the undirected graph built from ``X`` (edge
    weights are not used for the degree).
    """
    clustering = np.array(clustering_coefficient(X))

    graph = ig.Graph.Weighted_Adjacency(
        list(X), mode="UNDIRECTED", attr="weight", loops=False)
    node_degrees = np.array(graph.degree())
    neighbour_pairs = np.multiply(node_degrees, node_degrees - 1)
    return np.multiply(clustering, np.array(neighbour_pairs, dtype = float))/2.




# Pipeliner's settings

In [4]:
from reskit.core import Pipeliner
from reskit.core import Transformer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit


# Cross-validation used for the hyper-parameter grid search.
grid_cv = StratifiedKFold(n_splits=10, shuffle=True,  random_state=0)

# Cross-validation used for the final evaluation of each pipeline.
eval_cv = StratifiedShuffleSplit(
            n_splits = 100,
            test_size = 0.2,
            random_state = 0 )

# One dataset per target group; 'All' disables the target filter in load_data.
datasets = [
    ('Normal', Transformer(load_data, params=dict(target='Normal'))),
    ('EMCI',   Transformer(load_data, params=dict(target='EMCI'))),
    ('LMCI',   Transformer(load_data, params=dict(target='LMCI'))),
    ('AD',     Transformer(load_data, params=dict(target='AD'))),
    ('All',    Transformer(load_data, params=dict(target='All')))
]

# Matrix-level normalizations.
normalizers = [
    ('no_norm', MatrixNormalizer(no_norm)),
    ('binar',   MatrixNormalizer(binar_norm)),
    ('max',     MatrixNormalizer(max_norm)),
    ('mean',    MatrixNormalizer(mean_norm)),
]

# One graph feature per pipeline variant.
featurizers = [
    ('bag_of_edges',          MatrixFeaturizer([bag_of_edges])),
    ('degrees',               MatrixFeaturizer([degrees])),
    ('closeness_centrality',  MatrixFeaturizer([closeness_centrality])),
    ('betweenness_centrality',MatrixFeaturizer([betweenness_centrality])),
    ('eigenvector_centrality',MatrixFeaturizer([eigenvector_centrality])),
    ('pagerank',              MatrixFeaturizer([pagerank])),
    ('efficiency',            MatrixFeaturizer([efficiency])),
    ('clustering_coefficient',MatrixFeaturizer([clustering_coefficient])),
    ('triangles',             MatrixFeaturizer([triangles]))
]

# Distances between the feature vectors of the two subjects of a pair.
pairwise_features = [
    ('l1_l2_linf', VectorFeaturizer(func_list=func_list))
]

scalers = [
    ('standard', StandardScaler())
]

# The grid below searches over both 'l1' and 'l2' penalties, so pin a solver
# that supports both instead of relying on the library default.
classifiers = [
    ('LR', LogisticRegression(solver='liblinear'))
]

# Pipeline stages, in execution order.
steps = [
    ('Dataset', datasets),
    ('Normalizer', normalizers),
    ('Featurizer', featurizers),
    ('Pairwise_features', pairwise_features),
    ('Scaler', scalers),
    ('Classifier', classifiers)
]

# Hyper-parameter grid, keyed by classifier name.
param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    )
)



pipe = Pipeliner(steps, grid_cv=grid_cv, eval_cv=eval_cv, param_grid=param_grid)
pipe.plan_table

Unnamed: 0,Dataset,Normalizer,Featurizer,Pairwise_features,Scaler,Classifier
0,Normal,no_norm,bag_of_edges,l1_l2_linf,standard,LR
1,Normal,no_norm,degrees,l1_l2_linf,standard,LR
2,Normal,no_norm,closeness_centrality,l1_l2_linf,standard,LR
3,Normal,no_norm,betweenness_centrality,l1_l2_linf,standard,LR
4,Normal,no_norm,eigenvector_centrality,l1_l2_linf,standard,LR
5,Normal,no_norm,pagerank,l1_l2_linf,standard,LR
6,Normal,no_norm,efficiency,l1_l2_linf,standard,LR
7,Normal,no_norm,clustering_coefficient,l1_l2_linf,standard,LR
8,Normal,no_norm,triangles,l1_l2_linf,standard,LR
9,Normal,binar,bag_of_edges,l1_l2_linf,standard,LR


# Getting Results

In [5]:
# Evaluate the planned pipelines with four scoring metrics.
# data=None: presumably the 'Dataset' step (load_data) supplies the data
# itself -- confirm against reskit's Pipeliner.get_results.
# NOTE(review): caching_steps presumably lets pipelines that share these
# leading steps reuse their outputs -- confirm.
results = pipe.get_results(
    data=None, 
    caching_steps=['Dataset', 'Normalizer', 'Featurizer', 'Pairwise_features'], 
    scoring=['accuracy', 'roc_auc', 'precision', 'recall'])

Removed previous results file -- results.csv.
Line: 1/180
Line: 2/180
Line: 3/180


Process ForkPoolWorker-67:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


KeyboardInterrupt: 