In [None]:
import pandas as pd
import numpy as np
# Notebook goal: create .npy data splits starting from the .xlsx file
# (after some preprocessing).
df = pd.read_excel(
    "/home/PERSONALE/nicolas.derus2/downloads/cluster_vaccinazione_v1.2.xlsx",
    engine='openpyxl',
    sheet_name="cluster_vaccinazione",
)
# Replace the values of these columns with integer codes from pd.factorize.
df = df.assign(
    **{
        col: pd.factorize(df[col])[0]
        for col in ('Sesso', 'cluster_farmaci', 'cluster_specialistica')
    }
)
# Cast these columns to int.
df = df.assign(
    **{
        col: df[col].astype(int)
        for col in ('Pronto Soccorso', 'Specialistiche', 'ADI', 'Farmaci')
    }
)

In [None]:
from sklearn.model_selection import train_test_split
from typing import Any, Optional, cast
import enum
import random
import json
from pathlib import Path

# Seed used for the global RNGs and as the base random_state of the splits.
SEED = 0
# Directory that holds the input spreadsheet; it is also what _start() returns
# as the dataset directory.
DATA_DIR = '/home/PERSONALE/nicolas.derus2/downloads/'
# Input file names registered for each dataset key.
EXPECTED_FILES = {'cluster_vaccinazione': ['cluster_vaccinazione_v1.2.xlsx']}
class TaskType(enum.Enum):
    """Kind of prediction task; the member's value is stored as 'task_type' in info.json."""
    # Targets of this task type are saved as float32; the other types use int64.
    REGRESSION = 'regression'
    BINCLASS = 'binclass'
    # Only for this task type is 'n_classes' added to the saved info.
    MULTICLASS = 'multiclass'
    
def _start(dirname: str):
    """Announce a dataset, reseed the RNGs and look up its input files.

    Args:
        dirname: dataset key; must be a key of EXPECTED_FILES.

    Returns:
        A ``(dataset_dir, expected_files)`` pair: DATA_DIR itself and the list
        of file names registered for ``dirname``.

    Raises:
        KeyError: if ``dirname`` is not registered in EXPECTED_FILES.
    """
    print(f'>>> {dirname}')
    _set_random_seeds()
    # All datasets share DATA_DIR (no per-dataset subfolder is appended), and
    # the checks that the folder exists and contains exactly the expected
    # files are disabled.
    return DATA_DIR, EXPECTED_FILES[dirname]


def _set_random_seeds():
    """Reseed Python's ``random`` module and NumPy's global generator with SEED."""
    for reseed in (random.seed, np.random.seed):
        reseed(SEED)

In [None]:
# Type alias used in annotations and typing.cast calls: a dict from a string
# key to a NumPy array. It must be a type, not a dict instance, for those
# annotations to mean anything. Built-in generics need Python 3.9+, which the
# dict `|` operator used in _save() already requires.
ArrayDict = dict[str, np.ndarray]


def _make_split(size: int, stratify: Optional[np.ndarray], n_parts: int) -> ArrayDict:
    """Split the indices ``0..size-1`` into named parts.

    With ``n_parts == 3`` the indices are first split 80/20 into a remainder
    and 'test', then the remainder is split 80/20 into 'train' and 'val'.
    With ``n_parts == 2`` a single 80/20 split yields 'train' and 'val'.

    Args:
        size: number of samples to index.
        stratify: labels to stratify on (indexable by an index array), or None.
        n_parts: 2 or 3.

    Returns:
        Dict mapping part name to an array of int64 indices.

    Raises:
        AssertionError: if ``n_parts`` is neither 2 nor 3.
    """
    assert n_parts in (2, 3)
    two_way = n_parts == 2
    indices = np.arange(size, dtype=np.int64)

    # The first split is seeded with SEED + 1 for a two-way split, SEED otherwise.
    first_seed = SEED + 1 if two_way else SEED
    major_idx, minor_idx = train_test_split(
        indices, test_size=0.2, stratify=stratify, random_state=first_seed
    )
    if two_way:
        return cast(ArrayDict, {'train': major_idx, 'val': minor_idx})

    # Three-way: carve the validation part out of the larger portion, stratifying
    # on the labels that belong to that portion only.
    major_stratify = stratify[major_idx] if stratify is not None else None
    train_idx, val_idx = train_test_split(
        major_idx, test_size=0.2, stratify=major_stratify, random_state=SEED + 1
    )
    return cast(ArrayDict, {'train': train_idx, 'val': val_idx, 'test': minor_idx})



In [None]:
def _save(
    dataset_dir,
    name,
    task_type,
    *,
    X_num,
    X_cat,
    y,
    idx,
    id_: Optional[str] = None,
    id_suffix,
):
    """Write a split dataset to ``dataset_dir`` as ``info.json`` plus ``.npy`` files.

    Args:
        dataset_dir: output directory (string or path-like, with or without a
            trailing separator). It must already exist.
        name: dataset name stored under 'name' in info.json.
        task_type: TaskType member; selects the target dtype and label checks.
        X_num: dict part -> numeric feature array (``shape[1]`` is read, so
            arrays need at least two dimensions), or None.
        X_cat: dict part -> categorical feature array (same shape requirement),
            or None.
        y: dict part -> target array. For non-regression tasks it must contain
            'train'.
        idx: dict part -> index array, or None.
        id_: explicit dataset id; when given, ``id_suffix`` must be '--default'.
        id_suffix: suffix appended to ``dataset_dir`` to form the id when
            ``id_`` is None.

    Raises:
        AssertionError: if both feature dicts are None, if ``id_`` is given with
            a non-default suffix, if the training labels are not exactly
            ``0..max``, or if 'val'/'test' contain labels unseen in 'train'.
    """
    if id_ is not None:
        assert id_suffix == '--default'

    assert (
        X_num is not None or X_cat is not None
    ), 'At least one type of features must be presented.'

    # Join file names with pathlib rather than string concatenation, so a
    # directory given without a trailing slash (or as a Path) still works.
    out_dir = Path(dataset_dir)

    if X_num is not None:
        X_num = {k: v.astype(np.float32) for k, v in X_num.items()}
    if X_cat is not None:
        X_cat = {k: v.astype(str) for k, v in X_cat.items()}
    if idx is not None:
        idx = {k: v.astype(np.int64) for k, v in idx.items()}

    y_dtype = np.float32 if task_type == TaskType.REGRESSION else np.int64
    y = {k: v.astype(y_dtype) for k, v in y.items()}

    if task_type != TaskType.REGRESSION:
        y_unique = {k: set(v.tolist()) for k, v in y.items()}
        train_labels = y_unique['train']
        # Training labels must be exactly 0, 1, ..., max.
        assert train_labels == set(range(max(train_labels) + 1))
        # Other parts may only contain labels seen in training. Parts that are
        # absent (e.g. no 'test' in a two-part split) are skipped.
        for part in ('val', 'test'):
            if part in y_unique:
                assert y_unique[part] <= train_labels

    info = {
        'name': name,
        # NOTE(review): this embeds the whole directory string in the id; a
        # shorter dataset identifier may have been intended - confirm.
        'id': (str(dataset_dir) + id_suffix) if id_ is None else id_,
        'task_type': task_type.value,
        'n_num_features': (0 if X_num is None else next(iter(X_num.values())).shape[1]),
        'n_cat_features': (0 if X_cat is None else next(iter(X_cat.values())).shape[1]),
    } | {f'{k}_size': len(v) for k, v in y.items()}

    if task_type == TaskType.MULTICLASS:
        info['n_classes'] = len(set(y['train'].tolist()))

    with open(out_dir / 'info.json', 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=4)

    # Explicit mapping instead of a locals() lookup: one '<kind>_<part>.npy'
    # file is written per array, and kinds that are None are skipped.
    arrays = {'X_num': X_num, 'X_cat': X_cat, 'y': y, 'idx': idx}
    for data_name, data in arrays.items():
        if data is not None:
            for k, v in data.items():
                np.save(out_dir / f'{data_name}_{k}.npy', v)
    # No 'READY' marker file is written; that step is disabled.

    print('Done\n')

In [None]:
def _apply_split(data: ArrayDict, split: ArrayDict):
    """Index every entry of ``data`` by every entry of ``split``.

    Returns:
        A nested dict ``{name: {part: data[name][split[part]]}}``.
    """
    partitioned = {}
    for name, values in data.items():
        partitioned[name] = {
            part: values[part_idx] for part, part_idx in split.items()
        }
    return partitioned


In [None]:
#define our dataset

def cluster_vaccinazione():
    """Build the 'cluster_vaccinazione' multiclass dataset and save its splits.

    Steps: read the registered spreadsheet, drop the 'Id' and 'FasciaEta'
    columns, integer-encode three label columns, cast four columns to int, use
    'cluster_doc' as the target, make a stratified three-part split, and pass
    the numeric features, categorical features, targets and indices to _save().

    Raises:
        AssertionError: if the remaining columns are not exactly the declared
            numeric and categorical columns (further checks happen in _save).
    """
    dataset_dir, files = _start('cluster_vaccinazione')
    # Name the sheet and engine explicitly, matching the load in the first cell
    # of the notebook, so both read the same table from the workbook.
    df = pd.read_excel(
        dataset_dir + files[0],
        engine='openpyxl',
        sheet_name='cluster_vaccinazione',
    )
    df = df.drop(columns=['Id', 'FasciaEta'])

    # Replace the values of these columns with integer codes.
    for col in ('Sesso', 'cluster_farmaci', 'cluster_specialistica'):
        df[col] = pd.factorize(df[col])[0]
    for col in ('Pronto Soccorso', 'Specialistiche', 'ADI', 'Farmaci'):
        df[col] = df[col].astype(int)

    # The target is removed from the frame; everything left must be a feature.
    y_all = df.pop('cluster_doc').values.astype(np.int64)
    num_columns = [
        'Eta',
        'Comorbilita',
        'Ricoveri',
        'Pronto Soccorso',
        'Specialistiche',
        'ADI',
        'Farmaci',
    ]
    cat_columns = [
        'Sesso', 'cluster_specialistica', 'cluster_farmaci',
        'malattie_cardiovascolari_croniche', 'malattie_cardiovascolari_acute',
        'malattie_respiratorie', 'malattie_respiratorie_acute',
        'malattie_metaboliche', 'deficit_immunitari', 'tumori', 'trapianti',
        'condizioni_neurologiche_disabilita', 'fibrosi_cistica',
        'malattia_renale', 'malattie_cerebrovascolari', 'malattie_epatiche',
        'obesita_grave', 'sindrome_down',
    ]
    declared = set(num_columns) | set(cat_columns)
    present = set(df.columns.tolist())
    assert declared == present, (
        f'Column mismatch: missing={sorted(declared - present)}, '
        f'unexpected={sorted(present - declared)}'
    )

    X_num_all = df[num_columns].astype(np.float32).values
    X_cat_all = df[cat_columns].astype(str).values
    # Three-part split (train / val / test), stratified on the target.
    idx = _make_split(len(df), y_all, 3)

    _save(
        dataset_dir,
        'cluster_vaccinazione',
        TaskType.MULTICLASS,
        **_apply_split(
            {'X_num': X_num_all, 'X_cat': X_cat_all, 'y': y_all},
            idx,
        ),
        idx=idx,
        id_suffix='--default',
    )

In [None]:
# Run the pipeline: reads the spreadsheet and writes info.json plus the .npy
# split files into the dataset directory.
cluster_vaccinazione()