In [1]:
import os
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA
import numpy as np

sys.path.append(os.path.abspath(".."))
from models import PCCA
from utils import plot_CCA_iris, introduce_missing_values

Let's first load the Iris dataset.

In [2]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data 
y = iris.target
target_names = iris.target_names

# colors to visualize the species clusters
colors = ['mediumturquoise', 'navy', 'orange']

# Print the features to know which column corresponds to which feature.
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


We perform CCA on $X_A$ containing width features (indices 0 and 2) and $X_B$ containing length features (indices 1 and 3). We run the experiment with the following ratios for missing data: 0%, 15% and 30%.

In [3]:
indices_A = [0, 2]
indices_B = [1, 3]

missing_ratios = [0, 0.15, 0.3]

In [4]:
for missing_ratio in missing_ratios:

    # Introduce missing values
    X_missing = introduce_missing_values(X, missing_ratio)
    scaler = StandardScaler()
    X_missing_scaled = scaler.fit_transform(X_missing)
    
    # PROBABILISTIC CCA -----------------------------------------------------------------------------------------------

    # Split the dataset
    XA, XB  = X_missing_scaled[:,indices_A], X_missing_scaled[:,indices_B]

    # Run the EM
    pcca = PCCA(n_components=1, max_iter=1000, tol=1e-10)
    pcca.fit(XA, XB)

    # Get the projections
    XA_c, XB_c = pcca.transform(XA, XB)

    # Results
    print(f'Correlation with Probabilistic CCA and {100*missing_ratio}% missing values: {np.corrcoef(XA_c.flatten(), XB_c.flatten())[0,1]:.3f}')
    plot_CCA_iris(XA_c, XB_c, colors, y, target_names, save=False, save_path=f"PCCA{100*missing_ratio:.0f}IRIS.pdf")
    
    
    # STANDARD CCA -----------------------------------------------------------------------------------------------
    
    # Fill missing values with the mean
    X_missing_filled = X_missing.copy()
    X_missing_filled[np.isnan(X_missing_filled)] = np.mean(X_missing_filled[~np.isnan(X_missing_filled)], axis=0)
    scaler = StandardScaler()
    X_missing_filled = scaler.fit_transform(X_missing_filled)
    
    # Split the dataset
    XA, XB = X_missing_filled[:,indices_A], X_missing_filled[:,indices_B]

    # Perform CCA
    cca = CCA(n_components=1) 
    cca.fit(XA, XB)

    # Get the projections
    XA_c, XB_c = cca.transform(XA, XB)

    # Results
    print(f'Correlation with Standard CCA and {100*missing_ratio}% missing values: {np.corrcoef(XA_c.flatten(), XB_c.flatten())[0,1]:.3f}')
    plot_CCA_iris(XA_c, XB_c, colors, y, target_names, save=False, save_path=f"CCA{100*missing_ratio:.0f}IRIS.pdf")

 64%|██████▎   | 635/1000 [00:01<00:00, 378.32it/s]


Correlation with Probabilistic CCA and 0% missing values: 0.972


Correlation with Standard CCA and 0% missing values: 0.972


 12%|█▏        | 122/1000 [00:00<00:02, 297.05it/s]

Correlation with Probabilistic CCA and 15.0% missing values: 0.816





Correlation with Standard CCA and 15.0% missing values: 0.570


  7%|▋         | 69/1000 [00:00<00:03, 265.88it/s]


Correlation with Probabilistic CCA and 30.0% missing values: 0.699


Correlation with Standard CCA and 30.0% missing values: 0.428
