In [90]:
import numpy as np
import pandas as pd
import scipy.io as si
import utils as us
import Cluster_Ensembles as ce

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelBinarizer
from coclust.clustering.spherical_kmeans import SphericalKmeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from coclust.coclustering import CoclustMod,CoclustSpecMod, CoclustInfo
from coclust.clustering.spherical_kmeans import SphericalKmeans
from coclust.visualization import (plot_reorganized_matrix,
                                  plot_cluster_top_terms,
                                  plot_max_modularities)

In [2]:
# Names of the datasets to load; us.load_data is pointed at the '../cluto' directory.
# Later cells index the result as data[<name>][0], so `data` is presumably a
# mapping from dataset name to a tuple whose first item is the data matrix — TODO confirm.
db_names = ['tr31', 'tr41', 're0', 're1' ]
data = us.load_data(db_names,dir_path='../cluto')

### Table des données

In [3]:
# Build the per-dataset summary table from the loaded data and display it
# (the bare name on the last line triggers the rich notebook display).
data_table = us.create_data_table(data)
data_table

Unnamed: 0,dataset,# document,# mots,# class,sparsité,pt class/ gr class
0,tr31,927,10128,7,0.973,0.006
1,tr41,878,7454,10,0.974,0.037
2,re0,1504,2886,13,0.982,0.018
3,re1,1657,3758,25,0.986,0.027


In [132]:
def coc_best_partitions(data, n_row, n_col, n=5, runs=20):
    """Fit ``CoclustInfo`` several times and keep the best-scoring runs.

    Parameters
    ----------
    data : matrix
        Passed unchanged to ``CoclustInfo.fit``.
    n_row, n_col : int
        Number of row / column clusters requested from every model.
    n : int, default 5
        Number of fitted models to keep. Values below 1 keep nothing,
        values above ``runs`` keep every run.
    runs : int, default 20
        Number of independent fits.

    Returns
    -------
    best_part : numpy.ndarray (dtype=object)
        The selected fitted models, ordered by increasing ``criterion``
        (best model last) — the same ordering ``sk_best_partitions`` uses.
    row_lab : numpy.ndarray
        ``row_labels_`` of each selected model, one row per model.
    """
    models = []
    for _ in range(runs):
        coc = CoclustInfo(n_row_clusters=n_row, n_col_clusters=n_col)
        coc.fit(data)
        models.append(coc)

    scores = np.array([model.criterion for model in models], dtype=float)

    # CoclustInfo maximises its criterion (see the coclust documentation), so
    # the best runs sit at the END of an ascending sort.  The previous
    # ``[:n]`` slice kept the n lowest-scoring runs instead.
    order = scores.argsort()
    # Clamp the start so that n == 0 keeps nothing (a plain ``[-0:]`` would
    # keep everything) and n > runs keeps every run.
    keep = order[max(len(order) - n, 0):]

    # Fill an object array explicitly so NumPy never tries to interpret the
    # model instances as nested sequences.
    best_part = np.empty(len(keep), dtype=object)
    for pos, idx in enumerate(keep):
        best_part[pos] = models[idx]

    row_lab = np.array([model.row_labels_ for model in best_part])

    return best_part, row_lab


def sk_best_partitions(data, k, n=5, runs=20):
    """Fit ``SphericalKmeans`` several times and keep the best-scoring runs.

    Parameters
    ----------
    data : matrix
        Passed unchanged to ``SphericalKmeans.fit``.
    k : int
        Number of clusters requested from every model.
    n : int, default 5
        Number of fitted models to keep. Values below 1 keep nothing,
        values above ``runs`` keep every run.
    runs : int, default 20
        Number of independent fits.

    Returns
    -------
    best_part : numpy.ndarray (dtype=object)
        The selected fitted models, ordered by increasing ``criterion``
        (highest criterion last).
    row_lab : numpy.ndarray
        ``labels_`` of each selected model, one row per model.
    """
    models = []
    for _ in range(runs):
        sk = SphericalKmeans(k)
        sk.fit(data)
        models.append(sk)

    scores = np.array([model.criterion for model in models], dtype=float)

    # Highest criterion wins, so take the tail of the ascending order.
    order = scores.argsort()
    # A plain ``[-n:]`` returns EVERY run when n == 0 and an arbitrary slice
    # for negative n; clamping the start index makes both cases keep nothing
    # and lets n > runs keep every run.
    keep = order[max(len(order) - n, 0):]

    # Fill an object array explicitly so NumPy never tries to interpret the
    # model instances as nested sequences.
    best_part = np.empty(len(keep), dtype=object)
    for pos, idx in enumerate(keep):
        best_part[pos] = models[idx]

    row_lab = np.array([model.labels_ for model in best_part])

    return best_part, row_lab

### 1- CoclustInfo best partitions

In [133]:
# tr31: select n=5 partitions out of 20 CoclustInfo runs (7 row x 7 column clusters).
cocs_tr31, tr31_part = coc_best_partitions(
    data['tr31'][0], n_row=7, n_col=7, n=5, runs=20
)

In [134]:
# tr41: select n=5 partitions out of 20 CoclustInfo runs (10 row x 10 column clusters).
cocs_tr41, tr41_part = coc_best_partitions(
    data['tr41'][0], n_row=10, n_col=10, n=5, runs=20
)

In [135]:
# re0: select n=5 partitions out of 20 CoclustInfo runs (13 row x 13 column clusters).
cocs_re0, re0_part = coc_best_partitions(
    data['re0'][0], n_row=13, n_col=13, n=5, runs=20
)

In [136]:
# re1: select n=5 partitions out of 20 CoclustInfo runs (25 row x 25 column clusters).
cocs_re1, re1_part = coc_best_partitions(
    data['re1'][0], n_row=25, n_col=25, n=5, runs=20
)

### 2- Cluster Ensembles avec CSPA, HGPA et MCLA

In [158]:
# Toy example: compare every label with every other label by broadcasting a
# row view against a column view; `* 1` turns the boolean result into 0/1.
b = np.array([1, 1, 3, 4])
(b[np.newaxis, :] == b[:, np.newaxis]) * 1

array([[ True,  True, False, False],
       [ True,  True, False, False],
       [False, False,  True, False],
       [False, False, False,  True]])

### 3- Matrice de co-association

In [150]:
def co_association(label):
    """One-hot encode a vector of integer cluster labels.

    NOTE(review): despite its name, this returns the cluster-membership
    indicator matrix ``H`` (one row per sample, one column per label value),
    not a sample-by-sample co-association matrix.  The pairwise matrix would
    be ``H @ H.T``; the return shape is kept as is so existing callers keep
    working — confirm which one downstream code expects.

    Parameters
    ----------
    label : array-like of non-negative int
        Cluster label of each sample.

    Returns
    -------
    numpy.ndarray of float
        Array of shape ``(len(label), max(label) + 1)`` holding a single 1.0
        per row, in the column given by that sample's label.  An empty input
        yields an empty ``(0, 0)`` array.

    Raises
    ------
    ValueError
        If any label is negative (negative values would silently index the
        identity matrix from the end).
    """
    label = np.asarray(label)
    if label.size == 0:
        return np.zeros((0, 0))
    if np.min(label) < 0:
        raise ValueError("labels must be non-negative integers")

    n_clusters = int(np.max(label)) + 1
    # The identity only needs one row per label VALUE.  Sizing it from the
    # number of samples (as before) raised IndexError whenever a label value
    # was >= len(label) + 1 and allocated a needlessly large matrix.
    return np.eye(n_clusters)[label, :]

def total_association(labels):
    """Add up the ``co_association`` matrices of a collection of labelings.

    Parameters
    ----------
    labels : iterable of label vectors
        Each item is forwarded, in order, to ``co_association``.

    Returns
    -------
    The element-wise sum of the per-labeling matrices, or ``None`` when
    ``labels`` yields no item.
    """
    nothing = object()
    pending = iter(labels)

    head = next(pending, nothing)
    if head is nothing:
        return None

    # Seed the accumulator with the first matrix, then fold in the rest.
    running = co_association(head)
    for current in pending:
        running = running + co_association(current)

    return running

In [153]:
# Sum the per-partition matrices returned by co_association over the tr31 partitions.
# NOTE(review): the execution counts suggest this cell (In [153]) was run after the
# SphericalKmeans cell (In [147]) that rebinds tr31_part, even though it appears
# above that cell. On a top-to-bottom run tr31_part holds the CoclustInfo
# partitions here — confirm which ensemble is intended.
tr31_tot_ass = total_association(tr31_part)
tr31_tot_ass

array([[1., 1., 0., ..., 0., 1., 1.],
       [0., 0., 1., ..., 2., 1., 1.],
       [1., 2., 1., ..., 0., 0., 1.],
       ...,
       [1., 0., 1., ..., 2., 1., 0.],
       [2., 2., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 2., 0.]])

### 4- Ajout du Spherical Kmeans 

In [147]:
# tr31: select n=5 partitions out of 20 SphericalKmeans runs (k=7).
# Note: this rebinds tr31_part, which an earlier cell set to the CoclustInfo partitions.
sk_tr31, tr31_part = sk_best_partitions(
    data['tr31'][0], k=7, n=5, runs=20
)

In [159]:
# Sanity check: one row per kept partition, one column per tr31 document.
tr31_part.shape

(5, 927)

In [146]:
# tr41: select n=5 partitions out of 20 SphericalKmeans runs (k=10).
# Note: this rebinds tr41_part, which an earlier cell set to the CoclustInfo partitions.
sk_tr41, tr41_part = sk_best_partitions(
    data['tr41'][0], k=10, n=5, runs=20
)

In [145]:
# re0: select n=5 partitions out of 20 SphericalKmeans runs (k=13).
# Note: this rebinds re0_part, which an earlier cell set to the CoclustInfo partitions.
sk_re0, re0_part = sk_best_partitions(
    data['re0'][0], k=13, n=5, runs=20
)

In [144]:
# re1: select n=5 partitions out of 20 SphericalKmeans runs (k=25).
# Note: this rebinds re1_part, which an earlier cell set to the CoclustInfo partitions.
sk_re1, re1_part = sk_best_partitions(
    data['re1'][0], k=25, n=5, runs=20
)