In [25]:
import pandas as pd
import numpy as np
from PIL import Image
import os 
pathMasks = "C:\\Users\\lukas\\Documents\\Arquivos do PROJETO ITI\Fredrik Dataset\\mascaras"

1. Data frame cronológico, organizado de forma temporal, para que se saiba a qual ano pertence cada imagem do respectivo paciente.
    - Colunas: id_Paciente, nome_Imagem, ano_Imagem
    - Linhas: id_Paciente
2. Saber se, naquele respectivo ano, o paciente teve ou não câncer (1 ou 0).
3. Criar um CSV com todos os pacientes que **tiveram câncer** e não têm máscara. Incluir colunas como **ID** e **ANO** em que o paciente não teve máscara.

In [102]:
def verify_Masks(dataFrame=pd.DataFrame,listMask=list,filesPath=str):
    '''
    Function that checks, for every row, whether its mask file exists and
    whether that mask contains at least one non-zero pixel

    Parameters
    ----------

    dataFrame : pandas dataframe
            Dataframe for analysis. Columns are read by position:
            1 = image file name (its first five characters start the mask name),
            2 = follow-up number, 5 = laterality (only the first letter is used),
            7 = view position.
    listMask : list
            List of mask file names
    filesPath : str
            Path location of masks


    Returns
    ------
        Return two lists of equal length, one entry per processed row:\n
        1- colMask: 0 when the mask name is not in listMask, otherwise the mask file name\n
        2- colPixel: 0 = no mask or mask without non-zero pixel | 1 = mask has a non-zero pixel\n

    Notes
    -----
        If a mask listed in listMask cannot be opened under filesPath, the
        message "filesPath don't found" is printed and processing stops; both
        lists then hold only the rows handled before the failure.
    '''

    colMask = []
    colPixel = []
    knownMasks = set(listMask)  # constant-time membership test for every row

    for pac in np.array(dataFrame):
        # creating name of mask
        idPac = pac[1]
        laterality = pac[5]
        incidence = pac[7]
        yearFollowUp = str(pac[2])
        fileMask = idPac[:5] + "_20990909_" + laterality[0] + "_" + incidence + "_" + yearFollowUp + "_mask.png"

        # no mask file for this row
        if fileMask not in knownMasks:
            colMask.append(0)
            colPixel.append(0)
            continue

        try:
            mask = np.array(Image.open(os.path.join(filesPath, fileMask)), dtype=int)
        except FileNotFoundError:
            print("filesPath don't found")
            break

        # Append to both lists only after the file was read, so they stay aligned.
        colMask.append(fileMask)
        # np.where(...) returns a tuple with one array per dimension, so its
        # length never tells whether a pixel matched; np.any does.
        colPixel.append(1 if np.any(mask != 0) else 0)
    return colMask, colPixel

def create_Year_FollowUp(dataFrame=pd.DataFrame):
    '''
    Function that create a list of year followup of the pacient exams

    Parameters :
    ----------

    dataFrame : pandas DataFrame
        DataFrame for analysis. Columns are read by position: column 0 is
        compared as the patient id and column 2 as the image year. The counter
        only reacts to changes between consecutive rows, so rows are expected
        to be grouped by patient and ordered by year.

    Return : list
    ------
        Return a list of years followup for each image: 1 for the first year
        seen for a patient, increased by 1 each time the year changes for that
        same patient. An empty DataFrame gives an empty list.
    '''
    numberFollowUp = []
    idPac = None
    yearImg = None
    numberAux = 1
    for position, pac in enumerate(np.array(dataFrame)):
        if position == 0 or pac[0] != idPac:
            # first row, or we moved on to another patient: restart the count
            idPac = pac[0]
            yearImg = pac[2]
            numberAux = 1
        elif yearImg != pac[2]:
            # same patient, different year: advance the follow-up counter
            yearImg = pac[2]
            numberAux += 1
        # same patient and same year keep the current number
        numberFollowUp.append(numberAux)
    return numberFollowUp

In [103]:
# Read the dataset, then keep only the positive cases (x_case == 1)
dataFrame = pd.read_csv("CSVs\\anon_dataset_nonhidden_211125.csv")
cancerOnlyDataframe = dataFrame.loc[dataFrame['x_case'].eq(1)]
cancerOnlyDataframe

Unnamed: 0,anon_patientid,exam_year,anon_filename,x_age,x_case,x_cancer_laterality,x_type,x_lymphnode_met,rad_timing,rad_r1,rad_r2,rad_recall,rad_recall_type_right,rad_recall_type_left,imagelaterality,viewposition,libra_breastarea,libra_densearea,libra_percentdensity
0,2,2015,00002_20990909_L_CC_1.dcm,1,1,Left,3.0,0.0,2.0,0.0,0.0,0.0,,,Left,CC,127.258090,29.595217,23.256058
1,2,2015,00002_20990909_L_MLO_1.dcm,1,1,Left,3.0,0.0,2.0,0.0,0.0,0.0,,,Left,MLO,122.318120,39.298000,32.127705
2,2,2015,00002_20990909_R_CC_1.dcm,1,1,Left,3.0,0.0,2.0,0.0,0.0,0.0,,,Right,CC,114.570630,23.637600,20.631468
3,2,2015,00002_20990909_R_MLO_1.dcm,1,1,Left,3.0,0.0,2.0,0.0,0.0,0.0,,,Right,MLO,133.202380,36.162785,27.148750
84,12,2009,00012_20990909_L_CC_2.dcm,1,1,Right,2.0,0.0,3.0,0.0,0.0,0.0,,,Left,CC,87.053009,36.376034,41.786072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98699,11275,2014,11275_20990909_R_MLO_1.dcm,1,1,Right,2.0,0.0,1.0,1.0,1.0,1.0,1.0,,Right,MLO,121.447090,28.008400,23.062223
98784,11303,2014,11303_20990909_R_CC_1.dcm,2,1,Left,3.0,0.0,1.0,1.0,1.0,1.0,,2.0,Right,CC,196.125440,33.502670,17.082268
98785,11303,2014,11303_20990909_L_MLO_1.dcm,2,1,Left,3.0,0.0,1.0,1.0,1.0,1.0,,2.0,Left,MLO,257.890530,15.262912,5.918369
98786,11303,2014,11303_20990909_R_MLO_1.dcm,2,1,Left,3.0,0.0,1.0,1.0,1.0,1.0,,2.0,Right,MLO,232.718640,14.388752,6.182896


# IMPORTANTE: 
- Selecionei apenas as linhas em que a lateralidade do câncer é igual à lateralidade da imagem.
- `x_cancer_laterality` define em qual mama, esquerda ou direita, o câncer foi localizado.
- Por isso, são mantidas apenas as linhas em que `imagelaterality` tem o mesmo valor.

In [104]:
# Keep only the columns used by the analysis. Later cells read them by
# position through np.array(...), so the order listed here matters.
cancerOnlyDataframe = cancerOnlyDataframe.loc[:,['anon_patientid','anon_filename','exam_year','x_case','x_cancer_laterality','imagelaterality','viewposition']]
# Keep only images of the breast where the cancer was located. .copy() makes
# the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
cancerOnlyDataframe = cancerOnlyDataframe.loc[ (cancerOnlyDataframe['imagelaterality']==cancerOnlyDataframe['x_cancer_laterality']) ,:].copy()
cancerOnlyDataframe

Unnamed: 0,anon_patientid,anon_filename,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition
0,2,00002_20990909_L_CC_1.dcm,2015,1,Left,Left,CC
1,2,00002_20990909_L_MLO_1.dcm,2015,1,Left,Left,MLO
86,12,00012_20990909_R_CC_2.dcm,2009,1,Right,Right,CC
87,12,00012_20990909_R_MLO_2.dcm,2009,1,Right,Right,MLO
90,12,00012_20990909_R_CC_4.dcm,2010,1,Right,Right,CC
...,...,...,...,...,...,...,...
98683,11266,11266_20990909_R_CC_1.dcm,2009,1,Right,Right,CC
98698,11275,11275_20990909_R_CC_1.dcm,2014,1,Right,Right,CC
98699,11275,11275_20990909_R_MLO_1.dcm,2014,1,Right,Right,MLO
98785,11303,11303_20990909_L_MLO_1.dcm,2014,1,Left,Left,MLO


- Percorrer o dataframe por paciente e colocar um número que identifique o acompanhamento do paciente a partir do ano das imagens. 

In [105]:
# Number each patient's exam years (1 = first year seen, 2 = next distinct
# year, ...) with create_Year_FollowUp instead of repeating its loop inline.
numberFollowUp = create_Year_FollowUp(cancerOnlyDataframe)

# New follow-up column. assign() builds a new frame instead of writing into a
# filtered slice of the original dataset.
cancerOnlyDataframe = cancerOnlyDataframe.assign(yearFollowUp=numberFollowUp)

In [106]:
# Selecting columns. verify_Masks reads this frame by position (1 = file name,
# 2 = yearFollowUp, 5 = x_cancer_laterality, 7 = viewposition), so the order
# below must be kept. .copy() keeps later column additions off a slice.
cancerOnlyDataframe = cancerOnlyDataframe.loc[:,['anon_patientid','anon_filename','yearFollowUp','exam_year','x_case','x_cancer_laterality','imagelaterality','viewposition']].copy()
cancerOnlyDataframe

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition
0,2,00002_20990909_L_CC_1.dcm,1,2015,1,Left,Left,CC
1,2,00002_20990909_L_MLO_1.dcm,1,2015,1,Left,Left,MLO
86,12,00012_20990909_R_CC_2.dcm,1,2009,1,Right,Right,CC
87,12,00012_20990909_R_MLO_2.dcm,1,2009,1,Right,Right,MLO
90,12,00012_20990909_R_CC_4.dcm,2,2010,1,Right,Right,CC
...,...,...,...,...,...,...,...,...
98683,11266,11266_20990909_R_CC_1.dcm,1,2009,1,Right,Right,CC
98698,11275,11275_20990909_R_CC_1.dcm,1,2014,1,Right,Right,CC
98699,11275,11275_20990909_R_MLO_1.dcm,1,2014,1,Right,Right,MLO
98785,11303,11303_20990909_L_MLO_1.dcm,1,2014,1,Left,Left,MLO


In [107]:
# Names of every file in the masks folder. Uses the pathMasks constant rather
# than repeating the path literal.
listMask = os.listdir(pathMasks)

In [108]:
# Mask file name (or 0) and non-zero-pixel flag for every cancer row. The
# folder comes from pathMasks, as in the later call for non-cancer rows.
colMascs, colNodulo = verify_Masks(cancerOnlyDataframe,listMask,pathMasks)

In [109]:
# Attach the verify_Masks results as new columns. assign() returns a new frame,
# so nothing is written in place into a frame derived from a filtered slice.
cancerOnlyDataframe = cancerOnlyDataframe.assign(file_mask=colMascs, nodule=colNodulo)

### Criar um CSV com todas essas informações: **casos de câncer que não contêm máscara, casos de não-câncer que contêm máscara e casos de câncer que têm máscara, mas não têm um pixel branco**: 3 CSVs diferentes

- CSV com todas as imagens dos pacientes que **tiveram câncer e não têm máscara**.

In [110]:
# Cancer rows whose file_mask is 0, i.e. no mask file was found for the image
cancerNoMask = cancerOnlyDataframe.loc[cancerOnlyDataframe['file_mask'].eq(0)]
cancerNoMask

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition,file_mask,nodule
0,2,00002_20990909_L_CC_1.dcm,1,2015,1,Left,Left,CC,0,0
1,2,00002_20990909_L_MLO_1.dcm,1,2015,1,Left,Left,MLO,0,0
86,12,00012_20990909_R_CC_2.dcm,1,2009,1,Right,Right,CC,0,0
87,12,00012_20990909_R_MLO_2.dcm,1,2009,1,Right,Right,MLO,0,0
90,12,00012_20990909_R_CC_4.dcm,2,2010,1,Right,Right,CC,0,0
...,...,...,...,...,...,...,...,...,...,...
98193,11107,11107_20990909_R_MLO_1.dcm,1,2010,1,Right,Right,MLO,0,0
98194,11107,11107_20990909_R_CC_1.dcm,1,2010,1,Right,Right,CC,0,0
98512,11207,11207_20990909_L_CC_1.dcm,1,2013,1,Left,Left,CC,0,0
98513,11207,11207_20990909_L_MLO_1.dcm,1,2013,1,Left,Left,MLO,0,0


- CSV com imagens de **pacientes com câncer e que contém máscara**.

In [111]:
# Cancer rows whose file_mask holds a file name (anything other than 0)
cancerMask = cancerOnlyDataframe.loc[cancerOnlyDataframe['file_mask'].ne(0)]
cancerMask

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition,file_mask,nodule
93,12,00012_20990909_R_MLO_1.dcm,3,2012,1,Right,Right,MLO,00012_20990909_R_MLO_3_mask.png,1
491,45,00045_20990909_L_CC_1.dcm,1,2009,1,Left,Left,CC,00045_20990909_L_CC_1_mask.png,1
848,75,00075_20990909_L_MLO_2.dcm,1,2009,1,Left,Left,MLO,00075_20990909_L_MLO_1_mask.png,1
851,75,00075_20990909_L_CC_2.dcm,1,2009,1,Left,Left,CC,00075_20990909_L_CC_1_mask.png,1
859,75,00075_20990909_L_MLO_3.dcm,3,2012,1,Left,Left,MLO,00075_20990909_L_MLO_3_mask.png,1
...,...,...,...,...,...,...,...,...,...,...
98680,11266,11266_20990909_R_MLO_1.dcm,1,2009,1,Right,Right,MLO,11266_20990909_R_MLO_1_mask.png,1
98698,11275,11275_20990909_R_CC_1.dcm,1,2014,1,Right,Right,CC,11275_20990909_R_CC_1_mask.png,1
98699,11275,11275_20990909_R_MLO_1.dcm,1,2014,1,Right,Right,MLO,11275_20990909_R_MLO_1_mask.png,1
98785,11303,11303_20990909_L_MLO_1.dcm,1,2014,1,Left,Left,MLO,11303_20990909_L_MLO_1_mask.png,1


- CSV com imagens de **pacientes que têm máscara, mas não têm pixel branco**.

In [112]:
# Rows that have a mask file but whose nodule flag is 0
cancerNotNodule = cancerMask.loc[cancerMask['nodule'].eq(0)]
cancerNotNodule

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition,file_mask,nodule


In [115]:
# to_csv does not create missing folders, so make sure the output folder exists
os.makedirs("FredrikDatasetAnalysis", exist_ok=True)
cancerOnlyDataframe.to_csv("FredrikDatasetAnalysis//analysis_Fredrik_Dataset.csv") # csv with the dataset analysis
cancerNotNodule.to_csv("FredrikDatasetAnalysis//hasCancer_No_Pixel.csv") # cancer cases that have a mask but no white pixel
cancerNoMask.to_csv("FredrikDatasetAnalysis//hasCancer_No_Mask.csv") # cancer cases that have no mask

In [116]:
# Share of images that have a mask | only rows where x_cancer_laterality == imagelaterality
imagesWithMask = len(cancerMask)
imagesTotal = len(cancerOnlyDataframe)
haveMask = (imagesWithMask * 100) / imagesTotal
print("Porcentagem de imagens(x_cancer_laterality == imagelaterality) sem mascara: ", (100 - haveMask), '%')
print("Porcentagem de imagens(x_cancer_laterality == imagelaterality) com mascara: ", haveMask, '%')
print("Quantidade de imagens(x_cancer_laterality == imagelaterality): ", imagesTotal)
print("Quantidade de imagens com mascara: ", imagesWithMask, '\n')

# Same count, now relative to the number of files listed in the masks folder
masksTotal = len(listMask)
masksFound = (imagesWithMask * 100) / masksTotal
print("Porcentagem de mascaras encontradas: ", masksFound, '%')
print("Total de mascaras: ", masksTotal)
print("Mascaras encontradas: ", imagesWithMask)
print("Mascaras nao encontradas: ", (masksTotal - imagesWithMask))

Porcentagem de imagens(x_cancer_laterality == imagelaterality) sem mascara:  72.94633077765607 %
Porcentagem de imagens(x_cancer_laterality == imagelaterality) com mascara:  27.05366922234392 %
Quantidade de imagens(x_cancer_laterality == imagelaterality):  3652
Quantidade de imagens com mascara:  988 

Porcentagem de mascaras encontradas:  98.2107355864811 %
Total de mascaras:  1006
Mascaras encontradas:  988
Mascaras nao encontradas:  18


- Casos de não-câncer e que contém máscara.

In [117]:
# Non-cancer cases only, restricted to the columns used by the analysis.
# .copy() detaches the selection from dataFrame so the column added below is
# written to an independent frame (no SettingWithCopyWarning).
dataFrame2 = dataFrame[dataFrame['x_case'] == 0]
noCancerDF = dataFrame2.loc[:,['anon_patientid','anon_filename','exam_year','x_case','x_cancer_laterality','imagelaterality','viewposition']].copy()

# Adding yearFollowUp
yearsFollowUp = create_Year_FollowUp(noCancerDF)
noCancerDF['yearFollowUp'] =  yearsFollowUp
noCancerDF = noCancerDF.loc[:,['anon_patientid','anon_filename','yearFollowUp','exam_year','x_case','x_cancer_laterality','imagelaterality','viewposition']].copy()

# Fill x_cancer_laterality with the image laterality: verify_Masks takes the
# laterality letter of the mask name from this column position.
noCancerDF.loc[:, 'x_cancer_laterality'] = noCancerDF['imagelaterality']
noCancerDF

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition
4,4,00004_20990909_R_CC_1.dcm,1,2012,0,Right,Right,CC
5,4,00004_20990909_R_MLO_2.dcm,1,2012,0,Right,Right,MLO
6,4,00004_20990909_L_MLO_2.dcm,1,2012,0,Left,Left,MLO
7,4,00004_20990909_L_CC_1.dcm,1,2012,0,Left,Left,CC
8,4,00004_20990909_L_MLO_1.dcm,2,2014,0,Left,Left,MLO
...,...,...,...,...,...,...,...,...
98779,11300,11300_20990909_L_MLO_1.dcm,1,2015,0,Left,Left,MLO
98780,11301,11301_20990909_L_MLO_1.dcm,1,2015,0,Left,Left,MLO
98781,11301,11301_20990909_R_MLO_1.dcm,1,2015,0,Right,Right,MLO
98782,11301,11301_20990909_L_CC_1.dcm,1,2015,0,Left,Left,CC


In [118]:
# Run verify_Masks on noCancerDF to get its mask names and pixel flags
maskCol, pixelCol = verify_Masks(dataFrame=noCancerDF, listMask=listMask, filesPath=pathMasks)

In [119]:
# Attach the verify_Masks results as new columns. assign() returns a new frame
# instead of writing in place into a frame derived from a filtered slice.
noCancerDF = noCancerDF.assign(file_mask=maskCol, nodule=pixelCol)
# Non-cancer rows whose mask has a non-zero pixel
noCancerDF[noCancerDF['nodule'] != 0]

Unnamed: 0,anon_patientid,anon_filename,yearFollowUp,exam_year,x_case,x_cancer_laterality,imagelaterality,viewposition,file_mask,nodule
