# Google Drive

In [1]:
from google.colab import drive
import os
drive.mount('/content/drive/Othercomputers/Il mio computer/Tesi/Computing/Datasets')

ModuleNotFoundError: No module named 'google'

# Initialization

In [None]:
!pip install -r requirements.txt

In [2]:
from ipywidgets import widgets
from IPython.display import display
from halo import HaloNotebook as Halo
from tqdm.notebook import tqdm
import numpy as np
from threading import Thread
import os

## Threaded execution
Using modin and Dask slightly increases performances, but requires more RAM.

In [3]:
threaded = widgets.Checkbox(value=True, description='Threaded pandas')
display(threaded)

Checkbox(value=True, description='Threaded pandas')

In [4]:
if threaded.value:
    import modin.pandas as pd
    from dask.distributed import Client

    client = Client(n_workers=4, threads_per_worker=2)  # More workers = more RAM needed

else:
    import pandas as pd

## Paths 

In [5]:
input_root = '../../Datasets/6_downcasted'
output_root = '../../Datasets/7_NaNs_dropped'

## Functions

In [6]:
def load_dataset(dataset_file, root, bar: tqdm = None) -> pd.DataFrame:
    path = os.path.join(root, dataset_file)
    if bar:
        bar.set_description(f'Loading "{dataset_file}" from "{path}"')
    dataset = pd.read_pickle(path)

    return dataset

In [7]:
def get_pairs(data: list) -> list:
    if len(data) % 2 == 1:
        data.append(data[-1])

    pairs = [(data[i], data[i+1]) for i in range(0, len(data), 2)]
    return pairs

In [8]:
def list_intersection(data):
    while len(data) > 2:
        pairs = get_pairs(data)
        data = [np.intersect1d(pair[0], pair[1]) for pair in pairs]

    return np.intersect1d(data[0], data[1])

In [9]:
def indexes_intersection(indexes):
    while len(indexes) > 2:
        pairs = get_pairs(indexes)
        indexes = [pair[0].intersection(pair[1]) for pair in pairs]

    return indexes[0].intersection(indexes[1])

In [10]:
def get_drop_rate(original, dropped):
    return (original - dropped)/original

In [11]:
def drop_rate_value(df: pd.DataFrame, axis=0):
    drop_rate = get_drop_rate(len(df), len(df.dropna(axis=axis)))

    return drop_rate

In [12]:
def drop_rate_rows(df):
    return drop_rate_value(df)

In [13]:
def drop_rate_columns(df):
    return drop_rate_value(df, axis=1)

In [14]:
def min_drop_rate_axis(df):
    with Halo(text='Testing rows dropping', spinner='dots'):
        rows = drop_rate_rows(df) / len(df)

    with Halo(text='Testing columns dropping', spinner='dots'):
        columns = drop_rate_columns(df) / len(df.columns)

    return int(rows > columns)

## Datasets loading

In [15]:
datasets = {}

datasets_files = os.listdir(input_root)

for filename in (bar := tqdm(datasets_files)):
    datasets[filename] = load_dataset(filename, input_root, bar)

    bar.set_description('Datasets loaded')

  0%|          | 0/8 [00:00<?, ?it/s]

To request implementation, send an email to feature_requests@modin.org.


## NaNs drop
Drop rates dropping rows and columns are calculated and used to reduce data loss; at the moment features have the priority, Clinical features are considered "key features" and in this way the probability that the rows will be dropped instead of columns is greater.
The only situation in which features will be droped is the case where each patient has at least one NaN feature.

In [16]:
for name, dataset in (bar := tqdm(datasets.items())):
    bar.set_description(f'Dropping NaNs in {name}')
    rows_drop_rate = drop_rate_rows(dataset)
    cols_drop_rate = drop_rate_columns(dataset)

    axis = 0 if rows_drop_rate < 1 else 1

    datasets[name] = dataset.dropna(axis=axis).apply(pd.to_numeric,  downcast='integer')
    bar.set_description(f'Dropping done.')

  0%|          | 0/8 [00:00<?, ?it/s]



### Datasets shape check

In [17]:
for name, dataset in (bar := tqdm(datasets.items())):
    print(f'{name:<30} {len(dataset)} {len(dataset.columns)}')

  0%|          | 0/8 [00:00<?, ?it/s]

clinical_data.xz               445 11
cnv.score.xz                   738 19729
met_Mval.xz                    738 365786
miRNA_mor.xz                   738 1881
miRNA_vst.xz                   738 1881
mRNA_mor.xz                    738 56602
mRNA_vst.xz                    738 56602
tcga_cdr_brca_labels.xz        642 2


### No NaNs check
False means that there are not NaNs in the respective dataset

In [18]:
for name, dataset in (bar := tqdm(datasets.items())):
    print(name, dataset.isna().any().any())

  0%|          | 0/8 [00:00<?, ?it/s]

clinical_data.xz False
cnv.score.xz False
met_Mval.xz False
miRNA_mor.xz False
miRNA_vst.xz False
mRNA_mor.xz False
mRNA_vst.xz False
tcga_cdr_brca_labels.xz False


## Patients intersection

In [20]:
dataset: pd.DataFrame
indexes = [dataset.index for dataset in datasets.values()]

with Halo(text=f'Intersecting {len(indexes)} indexes...', spinner='dots'):
    intersection = indexes_intersection(indexes)

Output()

## Datasets filtering by patients instersection

In [21]:
for name, dataset in (bar := tqdm(datasets.items())):
    datasets[name] = dataset.loc[intersection]

  0%|          | 0/8 [00:00<?, ?it/s]

## Output to pickle

In [22]:
for name, dataset in (bar := tqdm(datasets.items())):
    path = os.path.join(output_root, name)
    bar.set_description(f'Writing {path}')
    dataset.to_pickle(path)
    bar.set_description('Done')

  0%|          | 0/8 [00:00<?, ?it/s]

