In [17]:
from mit_d3m import load_dataset, download_dataset

In [18]:
# Smoke test: try to load a single dataset by id. The output recorded below
# shows this attempt failing with FileNotFoundError while downloading.
load_dataset('185_baseball')

Downloading dataset from s3://d3m-data-dai


FileNotFoundError: [Errno 2] No such file or directory: 'data/185_baseball/185_baseball.tar.gz.BeDb2aff'

In [1]:
from analysis import *

In [2]:
import funcy as fy
import os
import json
import numpy as np
import pandas as pd
import pathlib
from mit_d3m import load_dataset
from tqdm import tqdm

In [3]:
from piex.explorer import S3PipelineExplorer
# Explorer handle used later to list dataset ids and their task types.
# NOTE(review): the argument is presumably an S3 bucket name — confirm against piex.
ex = S3PipelineExplorer('ml-pipelines-2018')

In [14]:
# Every dataset id known to the explorer.
dataset_id_list = ex.get_datasets()['dataset'].tolist()
# Datasets excluded from the main loop and handled by the separate "big" path.
biglist = ['124_153_svhn_cropped', '31_urbansound', 'bone_image_classification', 'bone_image_collection']
# Toggle for that separate path; while False the big datasets are never downloaded.
process_big = False
dataset_id_list = [l for l in dataset_id_list if l not in biglist]
# NOTE(review): this hard-coded list replaces the filtered list built above, so
# only these three datasets are processed. It looks like a debugging override —
# confirm whether it should be removed before a full run.
dataset_id_list = ['124_120_mnist', '196_autoMpg', '185_baseball']

In [5]:
def get_record_path(dataset_id):
    """Return the path of the JSON record file for ``dataset_id``.

    Records live in the ``records`` sub-directory of ``DATA_DIR``. The
    directory is created on demand so callers can write to the returned
    path straight away.

    Parameters
    ----------
    dataset_id : str
        Identifier used as the file stem.

    Returns
    -------
    pathlib.Path
        ``DATA_DIR/records/<dataset_id>.json``; the file itself may not exist.
    """
    record_dir = pathlib.Path(DATA_DIR, 'records')
    # parents=True: also works when DATA_DIR itself has not been created yet.
    # exist_ok=True: avoids the check-then-create race of a separate exists() test.
    record_dir.mkdir(parents=True, exist_ok=True)

    return record_dir.joinpath(f'{dataset_id}.json')

In [6]:
def get_disk_usage_compressed(dataset_id):
    """Return the size in bytes of ``<dataset_id>.tar.gz`` located directly under ``DATA_DIR``."""
    archive_name = f'{dataset_id}.tar.gz'
    archive = os.path.join(DATA_DIR, archive_name)
    return os.stat(archive).st_size

In [7]:
def get_disk_usage_inflated(dataset_id):
    """Return the total size in bytes of all regular files under ``DATA_DIR/<dataset_id>``.

    The directory tree is walked recursively. Symbolic links are not counted.
    A directory that does not exist yields ``0``.
    """
    root = os.path.join(DATA_DIR, dataset_id)
    file_sizes = (
        os.path.getsize(candidate)
        for folder, _subdirs, names in os.walk(root)
        for candidate in (os.path.join(folder, name) for name in names)
        # symbolic links are skipped so their targets are not counted twice
        if not os.path.islink(candidate)
    )
    return sum(file_sizes)

### Prepare JSON file-based caching

In [8]:
def save_record(dataset_id, record):
    """Persist ``record`` as the cached JSON record for ``dataset_id``.

    The write is all-or-nothing. A record that cannot be serialized, or a
    write that fails part-way, leaves any previously saved record untouched
    and never leaves a truncated file at the cache path. A truncated file
    would be treated as a valid cache entry and then fail to load.

    Parameters
    ----------
    dataset_id : str
        Identifier whose record file is written.
    record : dict
        JSON-serializable record.

    Raises
    ------
    TypeError, ValueError
        If ``record`` is not JSON-serializable.
    OSError
        If the file cannot be written.
    """
    path = get_record_path(dataset_id)
    # Serialize up front so a bad record fails before any file is touched.
    payload = json.dumps(record)
    tmp_path = os.fspath(path) + '.tmp'
    try:
        with open(tmp_path, 'w') as f:
            f.write(payload)
        # Atomic rename: the cache path holds either the old or the complete new content.
        os.replace(tmp_path, path)
    finally:
        # The temp file is only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [9]:
def load_record(dataset_id):
    """Read the cached JSON record file for ``dataset_id`` and return the decoded object."""
    with open(get_record_path(dataset_id), 'r') as handle:
        contents = handle.read()
    return json.loads(contents)

In [10]:
def exists_record(dataset_id):
    """Tell whether a cached record file is already on disk for ``dataset_id``."""
    return os.path.exists(get_record_path(dataset_id))

In [11]:
@fy.decorator
def jsoncached(call):
    """Cache the decorated function's result in a per-dataset JSON record file.

    The cache key is the ``dataset_id`` argument of the wrapped call, read
    from the funcy call object. If a record file already exists it is loaded
    and returned without running the function. Otherwise the function runs
    and its result is saved before being returned.
    """
    key = call.dataset_id
    if not exists_record(key):
        # Cache miss: compute once, persist, and hand back the fresh result.
        fresh = call()
        save_record(key, fresh)
        return fresh
    return load_record(key)

In [12]:
@quiet
@jsoncached
def create_record(dataset_id):
    """Build the summary record for one dataset.

    The record is a dict with keys ``dataset_id``, ``size``, ``n``, ``m``,
    ``classes`` and ``resources``. When ``load_dataset`` returns ``None``
    every field except ``dataset_id`` is NaN. Otherwise:

    - ``size``: value returned by ``getsize(dataset)``
    - ``n``: ``len(dataset.y)``
    - ``m``: ``dataset.X.shape[1]``
    - ``classes``: number of distinct values in ``dataset.y``
    - ``resources``: number of keys in ``dataset.context``
    """
    # Start from the "could not load" record and fill it in on success.
    record = {
        'dataset_id': dataset_id,
        'size': np.nan,
        'n': np.nan,
        'm': np.nan,
        'classes': np.nan,
        'resources': np.nan,
    }
    dataset = load_dataset(dataset_id)
    if dataset is None:
        return record

    record.update(
        size=getsize(dataset),
        n=len(dataset.y),
        m=dataset.X.shape[1],
        classes=len(np.unique(dataset.y)),
        resources=len(dataset.context.keys()),
    )
    # Drop the local reference to the loaded dataset before returning.
    del dataset
    return record

### create records for most datasets

In [15]:
# Build one record per selected dataset; create_record is JSON-cached, so
# datasets that already have a record file are loaded from it.
records = []
for dataset_id in tqdm(dataset_id_list):
    records.append(create_record(dataset_id))
# Display the most recent record as a quick sanity check.
records[-1]

  0%|          | 0/3 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'data/196_autoMpg/196_autoMpg.tar.gz.A3FcB04d'

### process the biggest datasets separately

In [None]:
@quiet
def download_big(dataset_id):
    """Call ``download_dataset`` for ``dataset_id`` using ``BUCKET`` and ``DATA_PATH``.

    NOTE(review): this uses ``DATA_PATH`` while the other helpers in this
    notebook use ``DATA_DIR`` — confirm both names are defined (presumably by
    ``analysis``) and refer to the same location.
    """
    download_dataset(BUCKET, dataset_id, DATA_PATH)

In [None]:
# The big datasets are only downloaded, never loaded: their in-memory
# characteristics are recorded as NaN.
if process_big:
    _nan_fields = ('size', 'n', 'm', 'classes', 'resources')
    for dataset_id in tqdm(biglist):
        download_big(dataset_id)
        # A fresh dict per dataset, since records are mutated in place later on.
        record = {'dataset_id': dataset_id, **dict.fromkeys(_nan_fields, np.nan)}
        records.append(record)

### compute disk usage for all datasets at once

In [None]:
# Attach on-disk sizes (archive and extracted tree) to every record in place.
for record in tqdm(records):
    dataset_id = record['dataset_id']
    # Both sizes are computed before the record is touched.
    record.update(
        du_compressed=get_disk_usage_compressed(dataset_id),
        du_inflated=get_disk_usage_inflated(dataset_id),
    )

In [None]:
# Spot-check: the last record should now carry the du_compressed / du_inflated fields.
records[-1]

In [None]:
# Tabulate the records: fix the column order, then switch to display names.
df = (
    pd.DataFrame.from_records(records)
    .loc[:, ['dataset_id', 'n', 'm', 'classes', 'resources',
             'du_compressed', 'du_inflated', 'size']]
    .rename(columns={
        'n': 'Number of Examples',
        'm': 'Dimension of X',
        'classes': 'Number of classes',
        'resources': 'Number of resources',
        'du_compressed': 'Size (compressed)',
        'du_inflated': 'Size (inflated)',
        'size': 'Size (memory)',
    })
)

# A class count is only kept for classification tasks; blank it out elsewhere.
tmp = ex.get_datasets()
msk = tmp['task_type'].eq('classification')
cls_ids = tmp.loc[msk, 'dataset'].tolist()
df.loc[~df['dataset_id'].isin(cls_ids), 'Number of classes'] = np.nan

df.head()

In [None]:
# Summarise the distribution of each task characteristic (one row per
# characteristic, one column per quantile) and export it as CSV and LaTeX.
summary = df.describe()
# The table built above names the extracted-size column 'Size (inflated)'.
# It has to be renamed here, otherwise selecting 'Size (uncompressed)' below
# raises KeyError.
summary = summary.rename(columns={'Dimension of X': 'Columns of $X$',
                                  'Number of Examples': 'Number of examples',
                                  'Size (inflated)': 'Size (uncompressed)'})
summary = summary[['Number of examples',
                   'Number of classes',
                   'Columns of $X$',
                   'Number of resources',
                   'Size (compressed)',
                   'Size (uncompressed)']]
summary = summary.T
summary = summary[['min', '25%', '50%', '75%', 'max']]
summary = summary.rename(columns={'25%': 'p25', '50%': 'p50', '75%': 'p75'})
# The size rows are replaced by formatted strings. Casting to object first keeps
# that assignment valid on pandas versions that refuse to write strings into
# float columns.
size_rows = ['Size (compressed)', 'Size (uncompressed)']
summary = summary.astype(object)
summary.loc[size_rows] = summary.loc[size_rows].applymap(sizeof_fmt)
summary.to_csv('task_characteristics.csv')
# NOTE(review): 'task_characterstics.tex' is misspelled (missing 'i') compared
# with the CSV name. It is kept as-is because other files may reference it —
# confirm before renaming.
summary.to_latex('task_characterstics.tex', float_format="{:0.1f}".format)
summary