# Google Drive

In [None]:
from google.colab import drive
import os

# drive.mount() takes the mount point itself, and Colab rejects mount points
# that contain spaces. Mount the Drive root first; folders synced from other
# computers are then reachable underneath it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Folder holding the thesis datasets inside the mounted Drive.
drive_datasets_dir = os.path.join(drive_mount_point, 'Othercomputers', 'Il mio computer', 'Tesi', 'Computing', 'Datasets')

# Initialization

In [None]:
!pip install -r requirements.txt

In [3]:
from ipywidgets import widgets
from IPython.display import display
from halo import HaloNotebook as Halo
from tqdm.notebook import tqdm
from libraries.umap_wrapper import umap
from libraries.plots_wrapper import get_iplot, save_fig, plot_title
from libraries.wrappers import load_datasets, load_PCAs, get_patients, get_silhouette, load_labels, result_filename, concatenate, results_subfolder, pipeline_str, clusters_metrics, log_state, smote
from libraries.parameters_widgets import threaded
from libraries.datasets_subsets import get_subsets
from libraries.random_forest import RandomForest, Samplers
from itertools import product
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Paths 

In [4]:
# Input and output locations, all relative to the notebook's working directory.
results_root = '../Results'                    # root used when saving figures and result tables
labels_path = '../../Datasets/6_downcasted'    # folder handed to load_labels
datasets_root = '../../Datasets/9_Normalized'  # folder listed by the picker and handed to load_datasets

## Datasets selection

In [5]:
# os.listdir returns entries in arbitrary order; sort them so the picker shows
# the same options in the same order on every machine and every run.
datasets_selection = widgets.SelectMultiple(
    options=sorted(os.listdir(datasets_root)),
    description='Datasets:',
    disabled=False,
)

display(datasets_selection)

SelectMultiple(description='Datasets:', options=('clinical_data.xz', 'cnv.score.xz', 'met_Mval.xz', 'miRNA_mor…

## Datasets loading

In [6]:
# Hand the widget's current selection to load_datasets, reading from datasets_root.
datasets = load_datasets(
    datasets_root,
    datasets_selection.value,
    pd,
    show_bar=True,
)

  0%|          | 0/7 [00:00<?, ?it/s]

### Labels loading

In [None]:
# Load the label table; labels['PFI'] is what the later cells use as the target.
labels = load_labels(
    labels_path,
    'tcga_cdr_brca_labels.xz',
    log=True,
)

## Concatenation -> UMAP

In [2]:
def run_pipeline(datasets, n_forests, pipeline_desc, bar=None, sampling_pipeline: list = (), balance: bool = False, project: bool = False):
    """Concatenate the datasets, score their clustering and train a random forest.

    Parameters
    ----------
    datasets : mapping
        Its values are concatenated column-wise with ``pd.concat(..., axis=1)``;
        its keys are recorded as the last element of the result row.
    n_forests
        Forwarded to ``RandomForest(n_forests=...)``.
    pipeline_desc
        Pipeline description, used for the plot title when ``project`` is true.
    bar
        Forwarded to ``log_state``. The forest shows its own progress bar only
        when this is a ``tqdm`` instance.
    sampling_pipeline
        Forwarded to ``RandomForest.run``.
    balance
        Forwarded to ``RandomForest(balance=...)``.
    project
        When true, also project the integrated data with UMAP (3 components),
        compute the post-projection metrics and build the interactive plot.
        The default (False) skips these steps, so the figure and the
        post-projection metrics are ``None``.

    Returns
    -------
    tuple
        ``(fig, result, rf)`` where ``result`` is
        ``(silhouette_pre, silhouette, homogeneity_pre, homogeneity,
        purities_pre, purities, *rf.report().values(), list(datasets.keys()))``.

    Notes
    -----
    Reads the notebook-level ``labels`` object and uses ``labels['PFI']`` as
    the target.
    """
    log_state('Integrating', bar)
    integration = pd.concat(datasets.values(), axis=1)
    labels_ = labels['PFI']

    projected = None
    if project:
        log_state('Projecting with UMAP', bar)
        projected = umap(integration, n_components=3, low_memory=True)

    log_state('Calculating metrics', bar)
    silhouette_pre, clusters_pre = get_silhouette(integration)
    homogeneity_pre, purities_pre = clusters_metrics(integration, labels_, clusters_pre)

    # Post-projection metrics stay None unless the projection was requested.
    silhouette = homogeneity = purities = None
    if project:
        silhouette, clusters = get_silhouette(projected)
        homogeneity, purities = clusters_metrics(projected, labels_, clusters)
        # Extra columns shown as hover data in the plot.
        projected['Cluster'] = clusters
        projected['Purity'] = projected['Cluster'].apply(lambda x: purities[x])

    log_state('Training random forest', bar)
    rf = RandomForest(integration, labels_, n_forests=n_forests, random_state=0, n_folds=7, show_bar=isinstance(bar, tqdm), balance=balance)
    rf.run(sampling_pipeline)
    report = rf.report()

    fig = None
    if project:
        log_state('Generating plot', bar)
        title = plot_title(pipeline_desc, silhouette, homogeneity, report, datasets)
        fig = get_iplot(projected, labels_, title=title, hover_data=['Cluster', 'Purity'])

    result = (silhouette_pre, silhouette, homogeneity_pre, homogeneity, purities_pre, purities, *report.values(), list(datasets.keys()))

    return fig, result, rf

In [1]:
# Stage list for this run; pipeline_str turns it into the labels used for
# titles and file names.
pipeline = [
    ('Concat', ''),
    ('UMAP', 15),
    ('RF', ''),
]
method, method_param, pipeline_desc = pipeline_str(pipeline)

# One run over all loaded datasets.
fig, result, rf = run_pipeline(datasets, n_forests=100, pipeline_desc=pipeline_desc)

NameError: name 'pipeline_str' is not defined

In [None]:
# run_pipeline returns fig=None when no plot was generated, so only call
# .show() on an actual figure instead of failing with AttributeError.
if fig is not None:
    fig.show()
else:
    print('No figure to show: run_pipeline did not generate a plot.')

In [None]:
# The loaded datasets live in `datasets`; `selected_datasets` is never defined
# in this notebook and raised NameError.
datasets_string = ','.join(datasets.keys())

# run_pipeline may return fig=None; only save an actual figure.
if fig is not None:
    save_fig(fig, root=results_root, info=f'{method_param}_[{datasets_string}]')
else:
    print('No figure to save: run_pipeline did not generate a plot.')

## Dataset subsets test

In [None]:
# Run Concat -> UMAP -> RF on every dataset subset, saving one scatter per
# subset and a summary table at the end.
results = []

patients = get_patients(datasets)

pipeline = [('Concat', ''), ('UMAP', 15), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)

sub_folder = results_subfolder(method, method_param)
output_dir = os.path.join(results_root, sub_folder)
# exist_ok so that re-running the cell does not fail on an existing folder.
os.makedirs(output_dir, exist_ok=True)

target = labels['PFI']
filename = None
report = {}

for sub_set in (bar := tqdm(get_subsets(datasets))):
    keys = list(sub_set.keys())
    bar.set_postfix_str(f'Testing {keys}')

    bar.set_description('Integration')
    integrated = concatenate(sub_set, pd)

    # get_silhouette takes the data only and returns (score, clusters), as in
    # run_pipeline. The original referenced an undefined `integration`.
    silhouette_pre, _ = get_silhouette(integrated)

    bar.set_description('UMAP reduction')
    projected = umap(integrated, n_components=3)

    # The original referenced an undefined `projected_integration`.
    silhouette_post, clusters = get_silhouette(projected)
    homogeneity, _ = clusters_metrics(projected, target, clusters)

    bar.set_description('Training random forest')
    # Constructor and run() are called the same way as in run_pipeline: data and
    # target go to the constructor, run() receives the (empty) sampling pipeline.
    rf = RandomForest(integrated, target, n_forests=10, random_state=0, n_folds=7, show_bar=True)
    rf.run([])
    report = rf.report()

    # NOTE(review): plot_title / get_iplot arguments follow the commented-out
    # calls in run_pipeline -- confirm against libraries.plots_wrapper.
    title = plot_title(pipeline_desc, silhouette_post, homogeneity, report, sub_set)
    filename = result_filename(method_param, sub_set).replace('.', '_')

    bar.set_description('Saving scatter')
    fig = get_iplot(projected, target, title=title)
    save_fig(fig, root=output_dir, info=filename, subfolder = False)

    results.append((silhouette_pre, silhouette_post, *report.values(), keys))

    bar.set_description(f'Max silhoutte: {max([result[0] for result in results])}')

# Nothing to write (and no filename / report) when no subset was produced.
if results:
    results_df = pd.DataFrame(data=results, columns=['Silhouette pre', 'Silhouette post', *[key.capitalize() for key in report.keys()], 'Datasets'])
    results_df.to_csv(os.path.join(output_dir, f'{filename.split("_[")[0]}.csv'), index=False, sep=';')

In [None]:
# Run run_pipeline on every dataset subset and collect one result row per subset.
results = []

patients = get_patients(datasets)

pipeline = [('Concat', ''), ('UMAP', 15), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)

sub_folder = results_subfolder(method, method_param)
output_dir = os.path.join(results_root, sub_folder)
# exist_ok so that re-running the cell does not fail on an existing folder.
os.makedirs(output_dir, exist_ok=True)
rf = None
filename = None

for sub_set in (bar := tqdm(get_subsets(datasets))):
    keys = list(sub_set.keys())
    bar.set_postfix_str(f'Testing {keys}')

    # Evaluate the current subset (the original passed the full `datasets`
    # on every iteration, so all rows described the same data).
    fig, result, rf = run_pipeline(sub_set, 100, pipeline_desc)

    # Keep the metrics even when no figure was produced; only the figure
    # export depends on `fig`.
    results.append(result)

    filename = result_filename(method_param, sub_set).replace('.', '_')
    if fig is not None:
        save_fig(fig, root=output_dir, info=filename, subfolder = False)

    bar.set_description(f'Max silhoutte: {max([result[0] for result in results])}')

# Nothing to write (and no rf / filename) when no subset was produced.
if results:
    table_stem = filename.split("_[")[0]
    results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *list(rf.report().keys()), 'Datasets'])
    results_df.to_csv(os.path.join(output_dir, f'{table_stem}.csv'), index=False, sep=';')
    results_df.to_excel(os.path.join(output_dir, f'{table_stem}.xlsx'))

## Oversampling and balancing

In [10]:
# Sampling pipelines to compare; each entry is the list of samplers handed to
# run_pipeline as `sampling_pipeline` (an empty list means no resampling).
tests = ([], [Samplers.smote_oversampler(.5)], [Samplers.smote_oversampler(.25), Samplers.random_undersampler(.5)], [Samplers.random_undersampler(.25), Samplers.smote_oversampler(.5)],
         [Samplers.random_oversampler(.25), Samplers.random_undersampler(.5)], [Samplers.random_undersampler(.25), Samplers.random_oversampler(.5)], [Samplers.random_oversampler(.5)],
         [Samplers.random_undersampler(.5)], [Samplers.smote_oversampler(.75)], [Samplers.random_oversampler(.75)], [Samplers.random_undersampler(.75)], [Samplers.smote_oversampler(1)],
         [Samplers.random_oversampler(1)], [Samplers.random_undersampler(1)])

results = []
pipeline = [('Concat', ''), ('UMAP', ''), ('RF', '')]
method, method_param, pipeline_desc = pipeline_str(pipeline)

# Work on a filtered copy under its own name so the notebook-level `datasets`
# (used by the earlier cells) is not overwritten when this cell runs.
balancing_datasets = {key: dataset for key, dataset in datasets.items() if key in ('cnv.score', 'miRNA_mor', 'mRNA_mor')}
patients = get_patients(balancing_datasets)

# The output folder does not depend on the loop variables: resolve and create it once.
sub_folder = results_subfolder(method, method_param)
output_dir = os.path.join(results_root, sub_folder)
os.makedirs(output_dir, exist_ok=True)

for balance_ in [True, False]:
    for test in (bar := tqdm(tests)):
        fig, result, rf = run_pipeline(balancing_datasets, 100, pipeline_desc, sampling_pipeline=test, balance=balance_, bar=bar)
        # Extend the metrics row with the balancing flag and the sampler class names.
        results.append([*result, balance_, [sampler.__class__.__name__ for sampler in test]])

filename = result_filename(method_param, balancing_datasets).replace('.', '_')
results_df = pd.DataFrame(data=results, columns=['Silhouette_pre', 'Silhouette', 'Homogeneity_pre', 'Homogeneity', 'Purity_pre', 'Purity', *list(rf.report().keys()), 'Datasets', 'Balancing', 'Sampler'])
results_df.to_csv(os.path.join(output_dir, f'{filename.split("_[")[0]}.csv'), index=False, sep=';')

# Split the table by the boolean 'Balancing' column.
balanced_mask = results_df['Balancing'].astype(bool)
results_df[balanced_mask].to_excel(os.path.join(output_dir, 'pos.xlsx'))
results_df[~balanced_mask].to_excel(os.path.join(output_dir, 'neg.xlsx'))

  0%|          | 0/14 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/7 [00:00<?, ?it/s]