In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import joblib
from coclust.coclustering import CoclustInfo, CoclustMod
from ampute import ampute_mar, ampute_mcar
from utils import (show_matrix, _impute_block_representative, RMSE)
from colust_info_impute import CoclustInfoImpute

from sklearn.metrics.cluster import normalized_mutual_info_score as NMI

import warnings
warnings.filterwarnings("ignore")

# CoclustInfoImpute
La classe `CoclustInfoImpute` est une extension de `CoclustInfo` du package `Coclust` permettant d'exécuter `CoclustInfo` sur des jeux de données avec des valeurs manquantes. Elle s'utilise comme `CoclustInfo`, avec les paramètres optionnels suivants dans sa méthode `fit` :

- `impute_func` : la fonction d'imputation utilisée pour mettre à jour les valeurs manquantes lorsque Z (classes en ligne) ou W (classes en colonne) changent. La fonction passée en paramètre doit accepter, dans cet ordre, les paramètres positionnels suivants :
    1. `X` la matrice originale
    2. `Z` la matrice binaire des classes en ligne
    3. `W` la matrice binaire des classes en colonne
    4. `z` classe en ligne
    5. `w` classe en colonne
    6. `r_nan` indices des valeurs manquantes en ligne
    7. `c_nan` indices des valeurs manquantes en colonne
- `na_rows` et `na_cols`, par défaut à `None`, permettent de passer les indices en ligne et en colonne des valeurs manquantes.

Si la matrice contient des valeurs manquantes, l'algorithme les remplacera par zéro et affectera leurs indices à `na_rows` et `na_cols`.

### Load required data

In [2]:
# Load the cstr dataset: 'fea' is kept as the data matrix and the
# flattened 'gnd' array as the label vector.
temp = sio.loadmat('./data/complete_data/cstr.mat')
cstr_data = temp['fea']
cstr_lab = temp['gnd'].reshape(-1)

In [3]:
# Load the webAce dataset: 'fea' is kept as the data matrix and the
# flattened 'gnd' array as the label vector.
temp = sio.loadmat('./data/complete_data/webAce.mat')
webace_data = temp['fea']
webace_lab = temp['gnd'].reshape(-1)

In [5]:
# Load the classic3 dataset: 'A' is densified with toarray() and the
# flattened 'labels' array is kept as the label vector.
temp = sio.loadmat('./data/complete_data/classic3.mat')
classic3_data = temp['A'].toarray()
classic3_lab = temp['labels'].reshape(-1)

In [6]:
# Load the classic4 dataset: 'mat' is densified with toarray() and the
# first row of 'labels' is kept as the label vector.
temp = sio.loadmat('./data/complete_data/classic4.mat')
classic4_data = temp['mat'].toarray()
classic4_lab = temp['labels'][0]

In [22]:
def set_nan(data, r_nan, c_nan):
    """Return a float copy of ``data`` with the selected cells set to NaN.

    ``r_nan`` and ``c_nan`` are used together as one index pair
    (``X[r_nan, c_nan]``). The input ``data`` itself is not modified.
    """
    masked = data.copy().astype(float)
    masked[r_nan, c_nan] = np.nan
    return masked

def load_index(name, prob, method='mar'):
    """Load the stored row and column indices of the NaN cells.

    Reads ``./mar_missing_data/nan_idx-{name}-{method}-{prob}.joblib`` and
    returns its ``'r_nan'`` and ``'c_nan'`` entries as a ``(rows, cols)``
    tuple.
    """
    path = f'./mar_missing_data/nan_idx-{name}-{method}-{prob}.joblib'
    stored = joblib.load(path)
    rows, cols = stored['r_nan'], stored['c_nan']
    return rows, cols
    

def execute_coclust_info_impute(datasets,method='mar',probs=[0.1, 0.2, 0.5,0.7],impute_fn=None):
    """Fit CoclustInfoImpute on every dataset for every missing-data rate.

    For each ``prob`` in ``probs`` and each ``name -> [X, labels]`` entry of
    ``datasets``:

    * when ``method == 'mcar'`` the incomplete matrix and the missing-cell
      indices come from ``ampute_mcar(X, prop=prob)``;
    * otherwise the indices are read with ``load_index`` and the cells are
      blanked with ``set_nan``.

    The number of row and column clusters is the number of distinct labels,
    and ``impute_fn`` is forwarded to ``fit`` as ``impute_func``.

    For every run, the imputed and real values of the missing cells, their
    indices and the row/column labels are dumped to
    ``./output/{name}-{method}-{prob}-zero.joblib``. Returns ``None``.
    """
    for prob in probs:
        for name, entry in datasets.items():
            X, labels = entry[0], entry[1]

            if method != 'mcar':
                r_nan, c_nan = load_index(name, prob, method=method)
                X_nan = set_nan(X, r_nan, c_nan)
            else:
                X_nan, r_nan, c_nan = ampute_mcar(X, prop=prob)

            # One row cluster and one column cluster per distinct label.
            k = len(np.unique(labels))
            model = CoclustInfoImpute(n_row_clusters=k, n_col_clusters=k, n_init=10)
            model.fit(X_nan, impute_func=impute_fn, na_rows=r_nan, na_cols=c_nan)

            result = {
                'imputed': model.X_[r_nan, c_nan],
                'real': X[r_nan, c_nan],
                'r_nan': r_nan,
                'c_nan': c_nan,
                'z': model.row_labels_,
                'w': model.column_labels_,
            }
            joblib.dump(result, f'./output/{name}-{method}-{prob}-zero.joblib', compress=3)

def replace_imputed_ca(data,name,prob,method='mar'):
    """Return a float copy of ``data`` whose missing cells hold the ImputeCA predictions.

    The predictions and the cell indices are read from
    ``./data/impute_data/{method}/{name}_{prob}__{method}.joblib``, using the
    keys ``'imputed'``, ``'r_nan'`` and ``'c_nan'``. The same path pattern is
    used for every ``method``.

    The copy is cast to float (as ``set_nan`` does) so that fractional
    predictions are not truncated when ``data`` has an integer dtype.

    Parameters
    ----------
    data : array supporting ``astype`` and ``X[rows, cols] = values``
        The complete data matrix; it is not modified.
    name : dataset name used in the file name.
    prob : missing-data rate as it appears in the file name.
    method : amputation method used in the directory and file name.

    Returns
    -------
    (X, r_nan, c_nan) : the filled float copy and the loaded indices.
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted files.
    saved = joblib.load(f'./data/impute_data/{method}/{name}_{prob}__{method}.joblib')
    r_nan, c_nan = saved['r_nan'], saved['c_nan']

    # astype returns a new array by default, so the caller's data is untouched.
    X = data.astype(float)
    X[r_nan, c_nan] = saved['imputed']
    return X, r_nan, c_nan
    

def execute_coclust_info_impute_ca(datasets,method='mar',probs=[0.1, 0.2, 0.5,0.7]):
    """Fit plain CoclustInfo on the datasets completed with ImputeCA.

    For each ``prob`` in ``probs`` and each ``name -> [X, labels]`` entry of
    ``datasets``, the missing cells of ``X`` are replaced by the stored
    ImputeCA predictions (``replace_imputed_ca``, with the rate expressed as
    an integer percentage) and ``CoclustInfo`` is fitted with as many row and
    column clusters as there are distinct labels.

    For every run, the real values of the missing cells, their indices and
    the row/column labels are dumped to
    ``./output/{name}-{method}-{prob}-NORMALCA.joblib``; the ``'imputed'``
    entry is stored as ``None``. Returns ``None``.
    """
    for prob in probs:
        # Round before converting: 100 * 0.29 evaluates to 28.999999999999996,
        # which int() alone would truncate to 28 and select the wrong file.
        percent = int(round(100 * prob))
        for key, value in datasets.items():
            X = value[0]
            lab = value[1]
            X_filled, r_nan, c_nan = replace_imputed_ca(X, key, percent, method=method)

            # One row cluster and one column cluster per distinct label.
            n_clusters = len(np.unique(lab))
            model = CoclustInfo(n_row_clusters=n_clusters, n_col_clusters=n_clusters, n_init=10)
            model.fit(X_filled)

            joblib.dump({
                'imputed': None,
                'real': X[r_nan, c_nan],
                'r_nan': r_nan,
                'c_nan': c_nan,
                'z': model.row_labels_,
                'w': model.column_labels_,
            }, f'./output/{key}-{method}-{prob}-NORMALCA.joblib', compress=3)


def execute_coclust_info_impute_impute_ca(datasets,impute_fn=None, method='mar',probs=[0.1, 0.2, 0.5,0.7]):
    """Fit CoclustInfoImpute on the datasets initialised with ImputeCA.

    For each ``prob`` in ``probs`` and each ``name -> [X, labels]`` entry of
    ``datasets``, the missing cells of ``X`` are replaced by the stored
    ImputeCA predictions (``replace_imputed_ca``, with the rate expressed as
    an integer percentage). ``CoclustInfoImpute`` is then fitted with as many
    row and column clusters as there are distinct labels; ``impute_fn`` is
    forwarded to ``fit`` as ``impute_func`` together with the missing-cell
    indices.

    For every run, the imputed and real values of the missing cells, their
    indices and the row/column labels are dumped to
    ``./output/{name}-{method}-{prob}-NEWCA.joblib``. Returns ``None``.
    """
    for prob in probs:
        # Round before converting: 100 * 0.29 evaluates to 28.999999999999996,
        # which int() alone would truncate to 28 and select the wrong file.
        percent = int(round(100 * prob))
        for key, value in datasets.items():
            X = value[0]
            lab = value[1]
            X_filled, r_nan, c_nan = replace_imputed_ca(X, key, percent, method=method)

            # One row cluster and one column cluster per distinct label.
            n_clusters = len(np.unique(lab))
            model = CoclustInfoImpute(n_row_clusters=n_clusters, n_col_clusters=n_clusters, n_init=5)
            model.fit(X_filled, impute_func=impute_fn, na_rows=r_nan, na_cols=c_nan)

            joblib.dump({
                'imputed': model.X_[r_nan, c_nan],
                'real': X[r_nan, c_nan],
                'r_nan': r_nan,
                'c_nan': c_nan,
                'z': model.row_labels_,
                'w': model.column_labels_,
            }, f'./output/{key}-{method}-{prob}-NEWCA.joblib', compress=3)

## Coclust Info Impute With zero as initial missing value

In [8]:
# Build the dataset registry: name -> [data matrix, label vector].
datasets = {
    'cstr': [cstr_data, cstr_lab],
    'webace': [webace_data, webace_lab],
    'classic3': [classic3_data, classic3_lab],
    'classic4': [classic4_data, classic4_lab],
}

#### Execution on mar method

In [9]:
# MAR run ('mar' is the function's default method), spelled out explicitly.
execute_coclust_info_impute(
    datasets,
    method='mar',
    impute_fn=_impute_block_representative,
)

#### Execution on mcar method

In [10]:
# MCAR run: the missing cells are produced by ampute_mcar inside the function.
execute_coclust_info_impute(
    datasets,
    method='mcar',
    impute_fn=_impute_block_representative,
)

## Coclust Info with ImputeCA

#### Execution on mar method

In [13]:
# MAR run ('mar' is the function's default method), spelled out explicitly.
execute_coclust_info_impute_ca(datasets, method='mar')

#### Execution on mcar method

In [15]:
# MCAR run of plain CoclustInfo on the ImputeCA-completed data.
execute_coclust_info_impute_ca(
    datasets,
    method='mcar',
)

## Coclust Info Impute with ImputeCA

### Execution on mar method

In [20]:
# MAR run ('mar' is the function's default method), spelled out explicitly.
execute_coclust_info_impute_impute_ca(
    datasets,
    impute_fn=_impute_block_representative,
    method='mar',
)

#### Execution on mcar method

In [21]:
# MCAR run of CoclustInfoImpute starting from the ImputeCA-completed data.
execute_coclust_info_impute_impute_ca(
    datasets,
    impute_fn=_impute_block_representative,
    method='mcar',
)

## CoclustInfoImpute with ImputeCA and New Imputation Function

In [None]:
# Restrict the registry to cstr and webace to limit the run time.
datasets = {
    'cstr': [cstr_data, cstr_lab],
    'webace': [webace_data, webace_lab],
}

### Execution on mar method

In [None]:
# TODO: assign the new imputation function here. It is not defined anywhere
# in this notebook yet, so the run is skipped while the placeholder is None.
# NOTE(review): this run writes to the same ./output/*-mar-*-NEWCA.joblib
# files as the earlier MAR run for the 'cstr' and 'webace' datasets.
new_impute_fn = None
if new_impute_fn is None:
    print("Skipped MAR run: no new imputation function has been provided.")
else:
    execute_coclust_info_impute_impute_ca(datasets, impute_fn=new_impute_fn)

### Execution on mcar method

In [None]:
# TODO: assign the new imputation function here. It is not defined anywhere
# in this notebook yet, so the run is skipped while the placeholder is None.
# NOTE(review): this run writes to the same ./output/*-mcar-*-NEWCA.joblib
# files as the earlier MCAR run for the 'cstr' and 'webace' datasets.
new_impute_fn = None
if new_impute_fn is None:
    print("Skipped MCAR run: no new imputation function has been provided.")
else:
    execute_coclust_info_impute_impute_ca(datasets, method='mcar', impute_fn=new_impute_fn)