# Google Drive

In [None]:
from google.colab import drive
import os

# drive.mount() expects the directory where the whole Drive gets attached.
# Colab rejects mount points that contain spaces or whose parent directory does
# not exist yet, so mount the Drive root and address the synced folder below it.
DRIVE_MOUNT_POINT = '/content/drive'
DRIVE_DATASETS_DIR = os.path.join(DRIVE_MOUNT_POINT, 'Othercomputers/Il mio computer/Tesi/Computing/Datasets')

drive.mount(DRIVE_MOUNT_POINT)

# Initialization

In [None]:
!pip install -r requirements.txt

In [1]:
from ipywidgets import widgets
from IPython.display import display
from halo import HaloNotebook as Halo
from tqdm.notebook import tqdm
from libraries.umap_wrapper import umap
from libraries.plots_wrapper import get_iplot, save_fig, plot_title
from libraries.wrappers import smote, load_PCAs, get_patients, get_silhouette, load_labels, result_filename, concatenate, results_subfolder, pipeline_str, clusters_metrics, log_state
from libraries.parameters_widgets import *
from libraries.datasets_subsets import get_subsets
from libraries.random_forest import RandomForest, Samplers
from libraries.iNMF_wrapper import inmf
from itertools import product
import libraries.pca as pca
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Paths 

In [2]:
# All locations are relative to the notebook's working directory.

# Folder and file name handed to load_PCAs.
pcas_root = '../../Datasets/'
pcas_filename = '10_PCA.pickle'

# Folder handed to load_labels; it lives under the same datasets root.
labels_path = pcas_root + '6_downcasted'

# Folder under which every result sub-folder is created.
results_root = '../Results'

## Datasets loading

In [3]:
# Load the PCA results. Below, `pcas` is used as a mapping keyed by PCA ratio
# whose values map dataset names to datasets.
pcas = load_PCAs(pcas_root, pcas_filename, log=True)

Output()

Loaded.


### Labels loading

In [4]:
# Load the labels; run_pipeline reads the 'PFI' entry as the target for the
# cluster metrics and the random forest.
labels = load_labels(labels_path, 'tcga_cdr_brca_labels.xz', log=True)

Output()

Loaded


## PCA ratio

In [5]:
# Ratio of the first PCA: one of the ratios available in the loaded results.
ratio = widgets.Dropdown(
    description='First ratio',
    options=pcas.keys(),
    disabled=False,
)

# Ratio handed to the second PCA of the pipeline.
second_ratio = widgets.BoundedFloatText(
    description='Second ratio',
    value=0.9,
    min=0,
    max=1,
    continuous_update=False,
)

display(ratio, second_ratio)

Dropdown(description='First ratio', options=(0.75, 0.9), value=0.75)

BoundedFloatText(value=0.9, description='Second ratio', max=1.0)

## Datasets selection

In [6]:
# Dataset names available for the currently selected first ratio.
datasets_names = [name for name in pcas[ratio.value]]

# Multi-selection of the datasets that the following cells integrate.
datasets_selection = widgets.SelectMultiple(
    description='Datasets:',
    options=datasets_names,
    disabled=False,
)
display(datasets_selection)

SelectMultiple(description='Datasets:', options=('clinical_data', 'cnv.score', 'met_Mval', 'miRNA_mor', 'miRNA…

## Hardware parameters

In [7]:
# Show the hardware controls (GPU toggle and CPU thread count).
display_hw_parameters()

Checkbox(value=False, description='Use GPU')

IntSlider(value=-1, continuous_update=False, description='CPU threads:', max=64, min=-1)

## iNMF parameters

In [8]:
# Show the iNMF controls (tolerance, regularization, max iterations per batch,
# number of factorization components, algorithm).
display_inmf_parameters()

BoundedFloatText(value=0.0001, description='Convergence tolerance:', max=9.223372036854776e+18)

BoundedFloatText(value=5.0, description='Regularization:', max=9.223372036854776e+18)

IntSlider(value=200, description='Max iteration/batch:, continuous_update=False', max=1000)

BoundedIntText(value=3, description='Factorization components:', max=9223372036854775807, min=1)

Dropdown(description='Algorithm:', index=1, options=('mu', 'halsvar', 'bpp'), value='halsvar')

## PCA -> iNMF -> PCA -> UMAP

In [16]:
# Module-level cache of the iNMF factor used by run_pipeline(cached_inmf=True).
W = None
# Fingerprint of the inputs that produced the cached ``W`` plus strong references
# to those datasets, so the ids stored in the fingerprint cannot be recycled.
_W_key = None
_W_sources = None


def run_pipeline(datasets, n_components, n_forests, pipeline_desc, second_ratio, bar=None, sampling_pipeline: list = (), balance: bool = False, cached_inmf: bool = False):
    """Run iNMF integration -> second PCA -> UMAP -> cluster metrics -> random forest.

    The iNMF settings other than ``n_components`` (threads, GPU, tolerance,
    regularization, algorithm, max iterations per batch) are read from the
    notebook-level widgets.

    Args:
        datasets: mapping of dataset name to dataset; passed to ``inmf`` and
            ``get_patients``.
        n_components: number of iNMF components passed to ``inmf``.
        n_forests: forwarded to ``RandomForest`` as ``n_forests``.
        pipeline_desc: pipeline description used to build the plot title.
        second_ratio: forwarded to ``pca.run`` as ``param`` for the second PCA.
        bar: optional progress bar handed to ``log_state``; when it is a ``tqdm``
            instance, ``show_bar=True`` is passed to ``pca.run`` and ``RandomForest``.
        sampling_pipeline: samplers handed to ``RandomForest.run``.
        balance: forwarded to ``RandomForest`` as ``balance``.
        cached_inmf: keep the iNMF factor in the module-level cache after the
            call, so that a following call with the same datasets and iNMF
            settings skips the factorization.

    Returns:
        ``(fig, result, rf)`` where ``result`` is the tuple
        ``(silhouette_pre, silhouette, homogeneity_pre, homogeneity, purities_pre,
        purities, *random-forest report values, dataset names)``.
        ``(None, None, None)`` when the second PCA raises.
    """
    global W, _W_key, _W_sources
    log_state('Computing iNMF', bar)

    inmf_key = (
        tuple(datasets.keys()),
        tuple(id(dataset) for dataset in datasets.values()),
        n_components, tol.value, lam.value, algo.value, batch_max_iter.value,
    )
    # Reuse the cached factor only if it was computed from these very inputs;
    # otherwise a factor left behind by a previous call would be silently reused.
    if W is None or _W_key != inmf_key:
        _, W, _, _ = inmf(datasets, n_threads.value, use_gpu.value, n_components, tol.value, lam.value, algo.value, batch_max_iter.value)
        _W_key, _W_sources = inmf_key, tuple(datasets.values())
    integration = pd.DataFrame(W.transpose(), index=get_patients(datasets))
    integration.columns = [str(col) for col in integration.columns]

    if not cached_inmf:
        W = _W_key = _W_sources = None
    labels_ = labels['PFI']

    log_state('Computing second PCA', bar)

    try:
        reduced = pca.run(integration, n_components=None, param=second_ratio, threshold=100, show_bar=isinstance(bar, tqdm))
    except Exception:
        # Best effort: report the failure and let the caller skip this run.
        # `Exception` (not a bare except) keeps kernel interrupts working.
        log_state(f'PCA does not converge on {list(datasets.keys())}', bar)
        return None, None, None

    log_state('Projecting with UMAP', bar)
    projected = umap(reduced, n_components=3)

    log_state('Calculating metrics', bar)
    # Metrics before the UMAP projection ...
    silhouette_pre, clusters_pre = get_silhouette(reduced)
    homogeneity_pre, purities_pre = clusters_metrics(reduced, labels_, clusters_pre)

    # ... and after it; the projected frame also carries them for the hover data.
    silhouette, clusters = get_silhouette(projected)
    homogeneity, purities = clusters_metrics(projected, labels_, clusters)
    projected['Cluster'] = clusters
    projected['Purity'] = projected['Cluster'].apply(lambda x: purities[x])

    log_state('Training random forest', bar)
    # The forest is trained on the second-PCA output, not on the UMAP projection.
    rf = RandomForest(reduced, labels_, n_forests=n_forests, random_state=0, n_folds=7, show_bar=isinstance(bar, tqdm), balance=balance)
    rf.run(sampling_pipeline)

    log_state('Generating plot', bar)
    report = rf.report()
    title = plot_title(pipeline_desc, silhouette, homogeneity, report, datasets)
    fig = get_iplot(projected, labels_, title=title, hover_data=['Cluster', 'Purity'])

    result = (silhouette_pre, silhouette, homogeneity_pre, homogeneity, purities_pre, purities, *report.values(), list(datasets.keys()))

    return fig, result, rf

In [16]:
# Keep only the datasets picked in the selection widget, for the chosen first ratio.
selected_datasets = {name: dataset for name, dataset in pcas[ratio.value].items() if name in datasets_selection.value}

pipeline = [('PCA', ratio.value), ('iNMF', n_components.value), ('PCA', second_ratio.value), ('UMAP', ''), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)

fig, result, rf = run_pipeline(selected_datasets, n_components.value, 100, pipeline_desc, second_ratio=second_ratio.value)
# run_pipeline returns (None, None, None) when the second PCA fails: in that case
# there is no figure to show.
if fig is not None:
    fig.show()

Computing iNMF
Computing second PCA
PCA does not converge on ['cnv.score', 'met_Mval', 'miRNA_mor', 'miRNA_vst', 'mRNA_mor', 'mRNA_vst']


ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
datasets_string = ','.join(selected_datasets.keys())
# `fig` is None when the pipeline of the previous cell failed: nothing to save.
if fig is not None:
    # Use the configured results folder rather than repeating the literal path.
    save_fig(fig, root=results_root, info=f'{method_param}_[{datasets_string}]')

## Dataset sub-sets test

In [10]:
# Show again the two ratio selectors, followed by the iNMF controls.
display(ratio, second_ratio)
display_inmf_parameters()

Dropdown(description='First ratio', index=1, options=(0.75, 0.9), value=0.9)

BoundedFloatText(value=0.75, description='Second ratio', max=1.0)

BoundedFloatText(value=1.0, description='Convergence tolerance:', max=9.223372036854776e+18)

BoundedFloatText(value=1.0, description='Regularization:', max=9.223372036854776e+18)

IntSlider(value=50, description='Max iteration/batch:, continuous_update=False', max=1000)

BoundedIntText(value=200, description='Factorization components:', max=9223372036854775807, min=1)

Dropdown(description='Algorithm:', index=1, options=('mu', 'halsvar', 'bpp'), value='halsvar')

In [11]:
results = []
# Random-forest report keys, captured from the first successful run. Reading
# them from the last `rf` after the loop fails when the last sub-set failed.
report_columns = None

datasets = {key: dataset for key, dataset in pcas[ratio.value].items() if key in datasets_selection.value}
patients = get_patients(datasets)

pipeline = [('PCA', ratio.value), ('iNMF', n_components.value), ('PCA', second_ratio.value), ('UMAP', ''), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)

sub_folder = results_subfolder(method, method_param)
output_dir = os.path.join(results_root, sub_folder)
# makedirs also creates missing parents and is a no-op when the folder exists.
os.makedirs(output_dir, exist_ok=True)

for sub_set in (bar := tqdm(get_subsets(datasets))):
    keys = sub_set.keys()
    bar.set_postfix_str(f'Testing {list(keys)}')

    fig, result, rf = run_pipeline(sub_set, n_components.value, 100, pipeline_desc, second_ratio=second_ratio.value, bar=bar)

    # (None, None, None) means the second PCA failed on this sub-set: skip it.
    if fig is None:
        continue

    bar.set_description('Saving scatter')
    filename = result_filename(method_param, sub_set).replace('.', '_')
    save_fig(fig, root=output_dir, info=filename, subfolder = False)

    results.append(result)
    if report_columns is None:
        report_columns = list(rf.report().keys())

    # result[0] is the silhouette computed before the UMAP projection.
    bar.set_description(f'Max silhoutte: {max([result[0] for result in results])}')

if results:
    results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *report_columns, 'Datasets'])
    # The summary files are named after the part of the last saved file name
    # that precedes the dataset list.
    summary_stem = filename.split("_[")[0]
    results_df.to_csv(os.path.join(output_dir, f'{summary_stem}.csv'), index=False, sep=';')
    results_df.to_excel(os.path.join(output_dir, f'{summary_stem}.xlsx'))
else:
    print('No sub-set completed the pipeline: nothing to save.')

  0%|          | 0/21 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

## Dataset sub-sets test with different PCA ratios

In [None]:
# Show the hardware, oversampling and iNMF controls used by the run below.
# NOTE(review): `oversample` is not defined in this notebook; presumably it
# comes from the star import of libraries.parameters_widgets — confirm.
display_hw_parameters()
display(oversample)
display_inmf_parameters()

In [None]:
# NOTE(review): run_pipeline has no `oversample` parameter; resampling is passed
# through `sampling_pipeline`. Presumably `oversample` is a boolean toggle and a
# full SMOTE oversampling is the intended equivalent — confirm.
sampling_pipeline = [Samplers.smote_oversampler(1)] if oversample.value else []

# The loop variable is deliberately not called `ratio`: that name is the Dropdown
# widget other cells read through `ratio.value`, and must not be overwritten.
for pca_ratio in (main_bar := tqdm(list(pcas.keys()))):
    datasets = {key: dataset for key, dataset in pcas[pca_ratio].items()}
    patients = get_patients(datasets)

    # run_pipeline always applies the second PCA, so it is part of the description.
    pipeline = [('PCA', pca_ratio), ('iNMF', n_components.value), ('PCA', second_ratio.value), ('UMAP', ''), ('RF', '')]
    method, method_param, pipeline_desc = pipeline_str(pipeline)

    sub_folder = results_subfolder(method, method_param)
    output_dir = os.path.join(results_root, sub_folder)
    # Unlike a bare os.mkdir, this does not fail when the cell is run again.
    os.makedirs(output_dir, exist_ok=True)

    main_bar.set_description(f'Ratio: {pca_ratio}')
    results = []
    report_columns = None  # random-forest report keys of the first successful run

    for sub_set in (bar := tqdm(get_subsets(datasets), leave=False)):
        keys = sub_set.keys()

        bar.set_postfix_str(f'Testing {list(keys)}')

        fig, result, rf = run_pipeline(sub_set, n_components.value, 100, pipeline_desc, second_ratio=second_ratio.value, bar=bar, sampling_pipeline=sampling_pipeline)

        # (None, None, None) means the second PCA failed on this sub-set: skip it.
        if fig is None:
            continue

        bar.set_description('Saving scatter')
        filename = result_filename(method_param.replace('>', ''), sub_set).replace('.', '_')
        save_fig(fig, root=output_dir, info=filename, subfolder = False)

        results.append(result)
        if report_columns is None:
            report_columns = list(rf.report().keys())

    if results:
        # Report columns follow the keys of the random-forest report, so header
        # and values cannot get out of sync.
        results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *report_columns, 'Datasets'])
        results_df.to_csv(os.path.join(output_dir, f'{filename.split("_[")[0]}.csv'), index=False, sep=';')

    main_bar.set_description('Done')

In [None]:
# Show the hardware, oversampling and iNMF controls used by the run below.
# NOTE(review): `oversample` is not defined in this notebook; presumably it
# comes from the star import of libraries.parameters_widgets — confirm.
display_hw_parameters()
display(oversample)
display_inmf_parameters()

In [None]:
# (PCA ratio, dataset combinations to test for that ratio).
# NOTE(review): ['cnv.score', 'miRNA_mor'] is listed twice for ratio .9 —
# possibly a copy/paste slip; confirm the intended third combination.
tests = ((.75, (['clinical_data', 'met_Mval'], ['cnv.score', 'miRNA_mor'], ['cnv.score', 'met_Mval'], ['cnv.score', 'met_Mval', 'miRNA_mor'])),
         (.9, (['clinical_data', 'met_Mval'], ['cnv.score', 'miRNA_mor'], ['cnv.score', 'miRNA_mor'], ['miRNA_mor', 'mRNA_mor'])))

# NOTE(review): run_pipeline has no `oversample` parameter; resampling is passed
# through `sampling_pipeline`. Presumably `oversample` is a boolean toggle and a
# full SMOTE oversampling is the intended equivalent — confirm.
sampling_pipeline = [Samplers.smote_oversampler(1)] if oversample.value else []

# The loop variable is deliberately not called `ratio`: that name is the Dropdown
# widget other cells read through `ratio.value`, and must not be overwritten.
for pca_ratio, keys_subsets in (main_bar := tqdm(tests)):
    # One source for the number of iNMF components, so that the description
    # (and the result folder derived from it) matches what is actually computed.
    inmf_components = n_components.value
    # run_pipeline always applies the second PCA, so it is part of the description.
    pipeline = [('PCA', pca_ratio), ('iNMF', inmf_components), ('PCA', second_ratio.value), ('UMAP', ''), ('RF', '')]
    method, method_param, pipeline_desc = pipeline_str(pipeline)

    sub_folder = results_subfolder(method, method_param)
    output_dir = os.path.join(results_root, sub_folder)
    os.makedirs(output_dir, exist_ok=True)

    main_bar.set_description(f'Ratio: {pca_ratio}')
    results = []
    report_columns = None  # random-forest report keys of the first successful run

    for keys in (bar := tqdm(keys_subsets,  leave=False)):
        datasets = {key: pcas[pca_ratio][key] for key in keys}
        patients = get_patients(datasets)
        bar.set_postfix_str(f'Testing {list(keys)}')

        fig, result, rf = run_pipeline(datasets, inmf_components, 100, pipeline_desc, second_ratio=second_ratio.value, bar=bar, sampling_pipeline=sampling_pipeline)

        # (None, None, None) means the second PCA failed on this combination: skip it.
        if fig is None:
            continue

        bar.set_description('Saving scatter')
        filename = result_filename(method_param.replace('>', ''), datasets).replace('.', '_')
        save_fig(fig, root=output_dir, info=filename, subfolder = False)

        results.append(result)
        if report_columns is None:
            report_columns = list(rf.report().keys())

    if results:
        # Report columns follow the keys of the random-forest report, so header
        # and values cannot get out of sync.
        results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *report_columns, 'Datasets'])
        results_df.to_csv(os.path.join(output_dir, f'{filename.split("_[")[0]}.csv'), index=False, sep=';')

## Oversampling and balancing

In [12]:
# Show again the two ratio selectors, followed by the iNMF controls.
display(ratio, second_ratio)
display_inmf_parameters()

Dropdown(description='First ratio', index=1, options=(0.75, 0.9), value=0.9)

BoundedFloatText(value=0.9, description='Second ratio', max=1.0)

BoundedFloatText(value=1.0, description='Convergence tolerance:', max=9.223372036854776e+18)

BoundedFloatText(value=1.0, description='Regularization:', max=9.223372036854776e+18)

IntSlider(value=50, description='Max iteration/batch:, continuous_update=False', max=1000)

BoundedIntText(value=200, description='Factorization components:', max=9223372036854775807, min=1)

Dropdown(description='Algorithm:', index=1, options=('mu', 'halsvar', 'bpp'), value='halsvar')

In [15]:
# Resampling pipelines to compare; the empty list is the run without resampling.
tests = ([], [Samplers.smote_oversampler(.5)], [Samplers.smote_oversampler(.25), Samplers.random_undersampler(.5)], [Samplers.random_undersampler(.25), Samplers.smote_oversampler(.5)],
         [Samplers.random_oversampler(.25), Samplers.random_undersampler(.5)], [Samplers.random_undersampler(.25), Samplers.random_oversampler(.5)], [Samplers.random_oversampler(.5)],
         [Samplers.random_undersampler(.5)], [Samplers.smote_oversampler(.75)], [Samplers.random_oversampler(.75)], [Samplers.random_undersampler(.75)], [Samplers.smote_oversampler(1)],
         [Samplers.random_oversampler(1)], [Samplers.random_undersampler(1)])

results = []
report_columns = None  # random-forest report keys of the first successful run
pipeline = [('PCA', ratio.value), ('iNMF', 200), ('PCA', second_ratio.value), ('UMAP', ''), ('Resampling', ''), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)
datasets = {key: dataset for key, dataset in pcas[ratio.value].items() if key in ['met_Mval', 'miRNA_vst', 'mRNA_vst']}
patients = get_patients(datasets)

# The output folder does not depend on the loop variables: create it once.
sub_folder = results_subfolder(method, method_param)
output_dir = os.path.join(results_root, sub_folder)
os.makedirs(output_dir, exist_ok=True)

# run_pipeline(cached_inmf=True) keeps the iNMF factor in the global `W`.
# Start from an empty cache so a factor left behind by another cell (computed on
# other datasets) is never reused here.
W = None

for balance_ in [True, False]:
    for test in (bar := tqdm(tests)):
        fig, result, rf = run_pipeline(datasets, 200, 100, pipeline_desc, sampling_pipeline=test, balance=balance_, bar=bar, second_ratio=second_ratio.value, cached_inmf=True)

        # (None, None, None) means the second PCA failed: nothing to record.
        if result is None:
            continue

        if report_columns is None:
            report_columns = list(rf.report().keys())
        results.append([*result, balance_, [sampler.__class__.__name__ for sampler in test]])

# Drop the cached factor so it cannot leak into later runs on other datasets.
W = None

filename = result_filename(method_param, datasets).replace('.', '_')
if results:
    results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *report_columns, 'Datasets', 'Balancing', 'Sampler'])
    results_df.to_csv(os.path.join(output_dir, f'{filename.split("_[")[0]}.csv'), index=False, sep=';')

    # One sheet for the balanced runs, one for the unbalanced runs.
    balanced = results_df['Balancing'].astype(bool)
    results_df[balanced].to_excel(os.path.join(output_dir, 'pos.xlsx'))
    results_df[~balanced].to_excel(os.path.join(output_dir, 'neg.xlsx'))
else:
    print('No run completed the pipeline: nothing to save.')

  0%|          | 0/14 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]