# Uploading to Hugging Face datasets

This is the notebook used to upload the HAM10000 dataset to hugging face (HF).

Original dataset from kaggle was extracted to data folder.

More information on formats and how to's to upload to HF datasets can be found [here](https://huggingface.co/docs/datasets/image_dataset#loading-script).

### 1. Import Dependencies

In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import jsonlines
from datasets import load_dataset
from torchvision import transforms

  from .autonotebook import tqdm as notebook_tqdm


### 2. Read in files and join with Metadata

Download the HAM10000_metadata.csv file to categorize the datasets by their diagnosis type 

In [8]:
# moving the directory to the root folder, optional step since this notebook was in notebook folder
os.chdir('..')
os.getcwd()

'/home/oem/Documents/coding/personal/skin_cancer'

In [14]:
path = 'data'
fullpath = os.path.join(os.getcwd(), path)

# walking through the directory to get the path names
datapath = []
for root, _, files in os.walk(fullpath):
    for file in files:
        datapath.append(os.path.relpath(os.path.join(root, file)))

orig_df = pd.DataFrame(pd.Series(datapath))
orig_df = orig_df.rename(columns={0: 'file_name'})
orig_df['image_id'] = orig_df["file_name"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])


# add mapping the image names to metadata diagnosis
meta_df = pd.read_csv(os.path.join(os.getcwd(), 'notebooks/HAM10000_metadata.csv'))
lesion_type_dict = {
    'nv': 'melanocytic_Nevi',
    'mel': 'melanoma',
    'bkl': 'benign_keratosis-like_lesions',
    'bcc': 'basal_cell_carcinoma',
    'akiec': 'actinic_keratoses',
    'vasc': 'vascular_lesions',
    'df': 'dermatofibroma'
}
meta_df['dx'] = meta_df.dx.map(lesion_type_dict)

df = orig_df.merge(meta_df, how='inner', left_on='image_id', right_on='image_id')
print(df.head())
df.to_csv("dataset.csv", index=False)
print(df.shape)

                                      file_name      image_id    lesion_id  \
0  data/ham10000_images_part_2/ISIC_0031774.jpg  ISIC_0031774  HAM_0002275   
1  data/ham10000_images_part_2/ISIC_0030527.jpg  ISIC_0030527  HAM_0006713   
2  data/ham10000_images_part_2/ISIC_0033561.jpg  ISIC_0033561  HAM_0004708   
3  data/ham10000_images_part_2/ISIC_0034041.jpg  ISIC_0034041  HAM_0005496   
4  data/ham10000_images_part_2/ISIC_0031369.jpg  ISIC_0031369  HAM_0000531   

                 dx    dx_type   age     sex     localization  
0  melanocytic_Nevi  follow_up  45.0  female  lower extremity  
1  melanocytic_Nevi  follow_up  50.0  female            trunk  
2  melanocytic_Nevi      histo  45.0    male            trunk  
3  melanocytic_Nevi      histo  15.0  female  lower extremity  
4          melanoma      histo  85.0    male             face  
(10015, 8)


### 3. Split files by directory

Using SKlearn's train test split to split dataset into train test and validation sets

In [12]:
X = df
y = df['dx']

# initial split to train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, stratify=y)

# further split validation set into validation and test set
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, train_size=(2/3), stratify=y_valid)

### 4. Move files to correct folders

Creating the necessary metadata.jsonl file. 

In [13]:
datasets = {'train': X_train, 'valid': X_valid, 'test': X_test}

# creating the new destination path in the dataset
for k, v in datasets.items():
    v['base_path'] = v['file_name'].apply(lambda x: os.path.split(x)[1])
    v['move_path'] = 'data' + os.path.sep + k + os.path.sep + v['dx'] + os.path.sep + v['base_path']
    v = v.drop(columns=['base_path'])

# creating the necessary folders for train,test,split
for k in datasets.keys():
    parentfolderpath = os.path.join(os.getcwd(), 'data', k)
    if os.path.isdir(parentfolderpath)==False:
        os.mkdir(parentfolderpath)

# creating the necessary subfolder for each cancer type
for col in df['dx'].unique():
    for k in datasets.keys():
        folderpath = os.path.join(os.getcwd(), 'data', k, col)
        if os.path.isdir(folderpath)==False:
                os.mkdir(folderpath)

In [14]:
# moving the files to their correct destination
for k, v in datasets.items():
    for i, row in v.iterrows():
        filename = os.path.join(os.getcwd(), row['file_name'])
        movepath = os.path.join(os.getcwd(), row['move_path'])
        shutil.move(filename, movepath)

### 5. Create Jsonl files

Please note that this is not a standard json file as each line is a json dictionary. Use jsonlines to convert and dump the necessary files into metadata.jsonl 

In [15]:
# creating the jsonlines files
for k, v in datasets.items():
    # editing the dataset to get only the folder and filename in "file_name" column
    v['filepath'] = v['move_path'].copy()
    v['foldername'] = v['filepath'].apply(lambda x: x.split(os.path.sep)[-2])
    v['filename'] = v['move_path'].apply(lambda x: os.path.basename(x))
    v['file_name'] = v['foldername'] + os.path.sep + v['filename']
    v = v.drop(columns=['filepath', 'move_path', 'foldername', 'base_path', 'filename'])

    # creating the jsonlines file
    res = v.to_json(orient='records')
    jsonls = json.loads(res)
    jsonobj = json.dumps(jsonls)
    with jsonlines.open(os.path.join(os.getcwd(), 'data', k, 'metadata.jsonl'), 'w') as writer:
        writer.write_all(jsonls)

### 6. Upload to Hugging Face Hub

In order to upload to HF hub, you will first need to set up an account [here](https://huggingface.co/welcome)

You will also need to install git lfs by installing the latest version of git, or following the steps [here](https://stackoverflow.com/questions/48734119/git-lfs-is-not-a-git-command-unclear)

In [16]:
# load the transformer to change PIL to tensor
transform = {
    'train': transforms.Compose([
                transforms.Resize([224,224]),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5)),
            ]),
    'validation': transforms.Compose([
                transforms.Resize([224,224]),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5)),
            ]),
    'test': transforms.Compose([
                transforms.Resize([224,224]),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5)),
            ]),
}

# upload dataset to hugging face
dataset = load_dataset(os.path.join(os.getcwd(), "data"), revision="resize_244x244", transform=transform)
dataset.push_to_hub("marmal88/skin_cancer")

Resolving data files: 100%|██████████| 9603/9603 [00:01<00:00, 5980.73it/s] 
Resolving data files: 100%|██████████| 1297/1297 [00:00<00:00, 35350.46it/s]
Resolving data files: 100%|██████████| 2492/2492 [00:01<00:00, 2233.80it/s]


TypeError: __init__() got an unexpected keyword argument 'transform'