In [1]:
import os            # operaciones con carpetas
import shutil        # Utility functions for copying and archiving files and directory trees
import random        # Random variable generators
import pandas as pd  # table visualization
import cv2           # visual images


# Data Preprocessing

In [2]:
# Load the complete, unmodified metadata table and preview its first rows.
original_csv_path = "./data/metadata_original.csv"
original = pd.read_csv(original_csv_path)
original.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [22]:
# Read the metadata CSV that accompanies the 128x128 images and keep only
# the image identifier and its diagnosis label.
metadata_full = pd.read_csv("./data/128x128/metadata.csv")
metadata = metadata_full[['image_name', 'diagnosis']]

# Preview the reduced table.
metadata.head()

Unnamed: 0,image_name,diagnosis
0,ISIC_2637011,unknown
1,ISIC_0015719,unknown
2,ISIC_0052212,nevus
3,ISIC_0068279,unknown
4,ISIC_0074268,unknown


In [14]:
# Write the reduced metadata table back to disk; the DataFrame index is not
# stored, so the file contains only the selected columns.
metadata_csv_path = "./data/128x128/metadata.csv"
metadata.to_csv(metadata_csv_path, index=False)

In [3]:
# Fix the seed of Python's `random` module so every run draws the same
# sequence of random numbers.
SEED = 1
random.seed(SEED)

# Folder that holds the 128x128 source images.
img_dir_128 = "./data/128x128/images/"

In [7]:
# Downscale every image listed in the 64x64 metadata file from 128x128 to
# 64x64 pixels and store the result under `img_dir_64`.
#
# NOTE(review): `img_dir_64` was not assigned in any earlier cell, so this cell
# raised NameError on a fresh kernel. The path below mirrors `img_dir_128` --
# confirm that it is the intended output folder.
img_dir_64 = "./data/64x64/images/"
os.makedirs(img_dir_64, exist_ok=True)  # cv2.imwrite does not create missing folders

# `with` guarantees the metadata file is closed even if an image fails.
with open("data/64x64/metadata.csv") as metadata_file:
    next(metadata_file, None)  # skip the CSV header row
    for line in metadata_file:
        img_file = line.split(",")[0].strip()
        if not img_file:
            continue  # ignore blank lines

        img_from = img_dir_128+img_file+'.jpg'
        img = cv2.imread(img_from)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; fail here with the offending path.
            raise OSError("cv2.imread could not read image: " + img_from)

        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv2.resize is `dst`, not `interpolation`.
        img_resize = cv2.resize(img, (64, 64), interpolation=cv2.INTER_AREA)

        img_to = img_dir_64+img_file+'.jpg'
        cv2.imwrite(img_to, img_resize)

In [25]:
# Root folders of the training, test and validation splits.
train = "data/128x128/train/"
test = "data/128x128/test/"
validation = "data/128x128/validation/"

# Inside every split, create one sub-folder per class: `positive` (melanoma)
# and `negative` (every other diagnosis). os.makedirs also creates the missing
# parent folders, and exist_ok=True lets the cell be re-run without raising
# FileExistsError once the folders are already there.
for split_dir in (train, test, validation):
    for label in ("positive", "negative"):
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

In [None]:
# Distribute the images over the train / validation / test folders and, inside
# each of them, over the `positive` (melanoma) or `negative` (any other
# diagnosis) sub-folder.
#
# * The split is drawn per image with random.random(): < 0.8 -> train,
#   < 0.9 -> validation, otherwise test.
# * Every melanoma image is copied; a non-melanoma image is copied only when a
#   second random draw is < 0.3.
# * `lession_types` collects each distinct diagnosis in order of first
#   appearance, so the later `print(lession_types)` cell has data to show.
lession_types = []
with open("data/128x128/metadata.csv") as metadata_file:
    next(metadata_file, None)  # skip the CSV header row
    for line in metadata_file:
        split_line = line.rstrip('\n').split(",")
        if len(split_line) < 2:
            continue  # blank or malformed row: nothing to copy

        img_file = split_line[0]
        lession_type = split_line[1]
        if lession_type not in lession_types:
            lession_types.append(lession_type)

        # First draw: which split this image belongs to.
        random_num = random.random()
        if random_num < 0.8:
            location = train
        elif random_num < 0.9:
            location = validation
        else:
            location = test

        if lession_type == "melanoma":
            label_dir = "positive/"
        elif random.random() < 0.3:
            # Second draw (non-melanoma only): keep roughly 30% of the negatives.
            label_dir = "negative/"
        else:
            continue

        shutil.copy(
            img_dir_128+img_file+'.jpg',
            location+label_dir+img_file+'.jpg'
        )


In [24]:
# Print the contents of the `lession_types` list.
print(lession_types)

['unknown', 'nevus', 'melanoma', 'seborrheic keratosis', 'lentigo NOS', 'lichenoid keratosis', 'solar lentigo', 'cafe-au-lait macule', 'atypical melanocytic proliferation']
