# Descriptive Data Analysis (DDA)

This notebook performs a descriptive data analysis to give an overview of the datasets used in the following experiments. It computes the gender and race percentage distributions, the average age, and the number of molecular and clinical features.

In [1]:
import sys
sys.path.insert(0, '../../')

from data import load_data_gse135820 as gse135820, load_data_gse68465 as gse68465
from data import load_data_gse94873 as gse94873, load_data_gse96058 as gse96058
from data import load_data_gse136400 as gse136400

import pandas as pd

# Build one summary row per dataset: identifiers, mean age at diagnosis,
# fraction of missing values, sample/feature counts and gender counts.
# The key order below is the column order of the displayed DataFrame.
result = {c: [] for c in ['dataset_id', 'cancer_type', 'average_age', 'missing_values', 'samples',
                          'molecular_features', 'clinical_features', 'gender']}

# These two lists are indexed in parallel with the loader list in the loop
# below; keep all three in the same order when adding or removing a dataset.
cancer_type = ['Multiple Myeloma', 'Tubo-Ovarian', 'Lung Adenocarcinoma', 'Melanoma', 'Breast']
dataset_id = ['GSE136400', 'GSE135820', 'GSE68465', 'GSE94873', 'GSE96058']

for i, func in enumerate([gse136400, gse135820, gse68465, gse94873, gse96058]):

    assert callable(func), 'func is not a callable object'

    # c: table whose columns are counted as clinical features;
    # g: table whose columns are counted as molecular features;
    # o: third value returned by the loader, not used in this summary.
    c, g, o = func()

    result['dataset_id'].append(dataset_id[i])
    result['cancer_type'].append(cancer_type[i])
    result['average_age'].append(c['age_at_diagnosis'].mean())
    # Share of null cells over all cells (rows x columns) of c.
    result['missing_values'].append((c.isnull().sum().sum()) /
                                    (c.shape[0] * c.shape[1]))
    result['samples'].append(c.shape[0])
    result['clinical_features'].append(c.shape[1])
    result['molecular_features'].append(g.shape[1])
    try:
        result['gender'].append(c['gender'].value_counts().to_dict())
    except KeyError:
        # Only a missing 'gender' column triggers the fallback; any other
        # error now propagates instead of being silently replaced.
        # NOTE(review): the fallback assigns every sample to code 1, i.e. it
        # assumes a dataset without a 'gender' column is a single-gender
        # cohort -- confirm this holds for each such dataset.
        result['gender'].append({0: 0, 1: c.shape[0]})

# Last expression of the cell: rendered as the summary table.
pd.DataFrame(result)

Unnamed: 0,dataset_id,cancer_type,average_age,missing_values,samples,molecular_features,clinical_features,gender
0,GSE136400,Multiple Myeloma,58.263306,0.0,1240,17912,28,"{0.0: 769, 1.0: 471}"
1,GSE135820,Tubo-Ovarian,60.240916,0.0,3798,513,15,"{0: 0, 1: 3798}"
2,GSE68465,Lung Adenocarcinoma,64.38914,0.029638,442,22283,10,"{1.0: 223, 2.0: 219}"
3,GSE94873,Melanoma,54.975,0.0,720,169,4,"{0.0: 438, 1.0: 282}"
4,GSE96058,Breast,62.738926,0.042735,3409,30865,19,"{0: 0, 1: 3409}"
