In [1]:
import os
import re
import shutil
from rapidfuzz import fuzz # Faster than fuzzywuzzy
import pandas as pd

In [2]:
# Load the patient spreadsheet: the header sits on sheet row index 3 and only
# the first 21 columns are read; column names are lower-cased right away so
# later cells can refer to them in a single, predictable case.
df_principal = pd.read_excel(
    "C:/luis/Iso_olhos/dados/Pacientes_Torica.xlsx",
    header=3,
    engine="openpyxl",
    usecols=range(21),
).rename(columns=str.lower)

In [49]:
def _extract_filename_components(filename: str) -> tuple[list[str], list[str]]:
    """
    Extract alphabetic and numeric groups from the name of a given file

    Parameters:
        filename: name of the file

    Returns:
        alphabetic_parts: list of strings with the alphabetic components from filename
        numeric_parts: list of the numeric parts from filename
    """

    filename = filename.lower()
    alphabetic_parts = re.findall(r'[^\W\d_]+', filename) # Extract alphabetic parts from filenames in directory
    numeric_parts = re.findall(r'[\d]+', filename) # Extract Number patters from filenames

    return alphabetic_parts, numeric_parts

def matches_number(search_number: str, numeric_parts:list[str]) -> bool:
    """
    Tell whether search_number is exactly one of the entries of numeric_parts.

    The comparison is on whole entries: "12" does not match an entry "123".
    """
    occurrences = numeric_parts.count(search_number)
    return occurrences > 0

def treats_names(work_string: list[str]) -> str:
    """
    Normalise a list of name tokens into a single sorted, space-separated string.

    The tokens are joined with spaces. If one of the exam keywords
    ("od", "os", "dob", "iol" - checked in that order) appears as a whole word
    with a space on both sides, only the text before its first occurrence is
    kept. The remaining words are then sorted alphabetically and re-joined.
    """
    joined = " ".join(work_string)

    # Only the first keyword found (in priority order) is used as the cut point
    for marker in (' od ', ' os ', ' dob ', ' iol '):
        if marker in joined:
            joined = joined.split(marker)[0]
            break

    words = joined.split()
    words.sort()
    return " ".join(words)
    
    
def matches_name(name_parts: list[str], alphabetic_parts: list[str], similarity_threshold: int) -> bool:
    """
    Decide whether the searched name and the letters found in a file name are similar enough.

    Both token lists are first normalised with treats_names; the two resulting
    strings are then scored with fuzz.token_sort_ratio, and the match is
    accepted when the score reaches similarity_threshold.
    """
    wanted = treats_names(name_parts)
    candidate = treats_names(alphabetic_parts)

    return fuzz.token_sort_ratio(candidate, wanted) >= similarity_threshold

def find_matching_files(
    search_name: str, 
    search_number: str, 
    directory: str, 
    similarity_threshold: int = 80
    ) -> list[str]:
    """
    Collect the files under directory that belong to a patient.

    A file is selected when one of the digit groups in its name equals
    search_number or, failing that, when the letters in its name are
    fuzzy-similar to search_name (see matches_name).

    Parameters:
        search_name (str): Patient name to look for
        search_number (str): Patient number to look for
        directory (str): Directory to scan
        similarity_threshold (int): Minimum similarity score (0-100) for a name match

        (see: https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html)

    Returns:
        list: Matching file names (empty if none found)

    NOTE(review): os.walk also descends into sub-directories, but only the bare
    file name is returned, so a match found in a sub-directory cannot be told
    apart from one located directly in directory.
    """
    wanted_tokens = search_name.lower().split()
    hits = []

    for _root, _subdirs, names in os.walk(directory):
        for candidate in names:
            letter_groups, digit_groups = _extract_filename_components(candidate)

            # The number check comes first; the fuzzy name comparison only
            # runs when the number did not match (short-circuit "or").
            if (
                matches_number(search_number, digit_groups)
                or matches_name(wanted_tokens, letter_groups, similarity_threshold)
            ):
                hits.append(candidate)

    return hits

In [31]:
def _patient_number_on_name(patient_number, patient_filename) -> bool:
   """
   Check if patient name has the patient number in its beginning
   """
   filename_parts = patient_filename.split("_")

   if filename_parts:
      return patient_number == filename_parts[0]
   
   return False


def rename_files(files: list[str], number: int, directory: str) -> list[str]:
   """
   Prefix every file in files with "<number>_" on disk and return the resulting names.

   Files whose name already starts with the number (see _patient_number_on_name)
   are left untouched and returned as they are.

   Parameters:
      files: list with the file names, all located in directory
      number: the patient number to put at the beginning of each file name
      directory: the directory that contains the files

   Returns:
      The file names after renaming, in the same order as files.
   """
   resulting_names = []

   for current_name in files:
      if not _patient_number_on_name(number, current_name):
         prefixed_name = f"{number}_{current_name}"
         os.rename(
            os.path.join(directory, current_name),
            os.path.join(directory, prefixed_name),
         )
         current_name = prefixed_name

      resulting_names.append(current_name)

   return resulting_names

def move_files(files: list[str], original_directory, final_directory) -> None:
   """
   Move each file of the files list from original_directory into final_directory.

   The destination directory is created when it does not exist yet. Without
   that step shutil.move would interpret a missing final_directory as the new
   *file name* of the first file instead of as a folder to move it into.
   Nothing is created or moved when files is empty.

   Parameters:
      files: names of the files to move, relative to original_directory
      original_directory: directory that currently holds the files
      final_directory: directory that will receive the files
   """
   if not files:
      return

   os.makedirs(final_directory, exist_ok=True)

   for filename in files:
      original_path = os.path.join(original_directory, filename)

      # Progress trace: shows which file is being moved
      print(original_path)
      shutil.move(original_path, final_directory)

In [32]:
# Find each patient's exam files and separate them into their own folder

data_original_directory = "C:/luis/Iso_olhos/dados/Biometrias"
data_final_directory = "C:/luis/Iso_olhos/dados/Biometrias_identificadas"


# One pass per spreadsheet row: look the patient up by number or by name,
# prefix the matched files with the patient number, then move them.
for _,patient in df_principal.iterrows():
    patient_name = patient['nome']
    patient_number = str(patient['pront'])

    corresponding_files = find_matching_files(patient_name, patient_number, data_original_directory)

    
    if corresponding_files :
        print(corresponding_files)
        # rename_files returns the names as they now exist on disk,
        # which is what move_files needs to locate them
        corresponding_files = rename_files(corresponding_files, patient_number, data_original_directory)

        move_files(corresponding_files, data_original_directory, data_final_directory)

    

['57791_JUNIOR_ANTONIO_ELISARIO_S_1946-07-10_IOL.pdf']
C:/luis/Iso_olhos/dados/Biometrias\57781_57791_JUNIOR_ANTONIO_ELISARIO_S_1946-07-10_IOL.pdf


In [33]:
# Add a column telling whether the patient has an identified exam or not


# True when at least one file in the identified-exams folder matches the
# patient's name or number
df_principal['tem_exame'] = df_principal.apply(
    lambda patient : bool(find_matching_files(patient['nome'], str(patient['pront']), data_final_directory)),
    axis = 1 # Apply to each row
)


In [34]:
# Preview the first rows of the patient table
df_principal.head()        

Unnamed: 0,pront,nome,sexo,olho operado,k1,eixo plano,k2,eixo curvo,al,acd,...,refração pré operatória,astig refracional pré op,astig anterior topo pré op,astig posterior topo pré,lente indicada,lente implantada,refração pós operatória,astig refracional pós op,classificação astigmatismo,tem_exame
0,121476,ADEMIR MARTINS PEIXOTO,Masculino,Direito,42.92,92.0,46.72,2.0,22.67,2.98,...,"+3,00 -4,00 95º",-4.0,2.6,0.1,T9 +23.0,"T8 +23,5 a 2°",plano,0.0,Regular assimétrico contra a regra,True
1,114679,ADRIANA DIAS DA CUNHA,Feminino,Direito,40.67,3.0,42.75,93.0,24.57,2.94,...,"+0,50 -1,00 170º",-1.0,0.8,0.4,"T3 +20,0","T3 +21,50 a 80°",0.5,0.0,Regular simétrico a favor da regra,True
2,71580,ALAIDE PEREIRA LIMA AGUIAR,Feminino,Direito,44.59,106.0,45.62,16.0,23.59,3.53,...,"+1,75 -1,00 100º",-1.0,0.9,0.1,"T4 +19,50","T4 +19,50 a 5°","+1,00 -0,50 125º",-0.5,Regular simétrico contra a regra,True
3,65154,ALAOR DE OLIVEIRA PINHAL,Masculino,Direito,,,,,,,...,"+1,00 -2,00 100º",-2.0,0.8,0.1,,"T5 +22,50 a 160°","plano -0,75 150º",-0.75,Regular assimétrico contra a regra,False
4,65154,ALAOR DE OLIVEIRA PINHAL,Masculino,Esquerdo,,,,,,,...,"+1,75 -1,75 70º",-1.75,1.0,0.0,,"T5 +22,0 a 180°",plano,0.0,Regular simetrico contra a regra,False


In [35]:
# Rows whose 'tem_exame' flag is False, printed in order of patient number
df_sem_exame = df_principal.loc[df_principal['tem_exame'].eq(False)]
print(df_sem_exame.sort_values(by='pront'))

      pront                                  nome       sexo olho operado  k1  \
98     7898              MARIA APARECIDA DE SOUZA   Feminino     Esquerdo NaN   
51     8005             ELIANE GUIMARAES FERREIRA   Feminino      Direito NaN   
240    9992                MARCOS ELIZIO DE BRITO  Masculino      Direito NaN   
256   16014        CARMEM CURVELANO PEREIRA LEMES   Feminino     Esquerdo NaN   
255   16014        CARMEM CURVELANO PEREIRA LEMES   Feminino      Direito NaN   
254   16730     MARIA DE LOURDES BARBOSA OLIVEIRA   Feminino     Esquerdo NaN   
253   16730     MARIA DE LOURDES BARBOSA OLIVEIRA   Feminino      Direito NaN   
247   17130                    MARLY ADAD SANCHES   Feminino      Direito NaN   
248   17130                    MARLY ADAD SANCHES   Feminino     Esquerdo NaN   
261   17644                  LUZIA MIYABARA KISHI   Feminino      Direito NaN   
260   20897                  CLARICE MARIA KREMER   Feminino     Esquerdo NaN   
259   20897                 

In [50]:
# Same matching / renaming / moving procedure, applied to the Pentacam folder
pentacam_directory = 'C:/luis/Iso_olhos/dados/dados_pentacam'
pentacam_final_directory = 'C:/luis/Iso_olhos/dados/dados_pentacam_identificados'

for _,patient in df_principal.iterrows():
    patient_name = patient['nome']
    patient_number = str(patient['pront'])

    corresponding_files = find_matching_files(patient_name, patient_number, pentacam_directory)

    if corresponding_files:
        print(corresponding_files)
        # Use the names returned by rename_files: they are the ones now on disk
        corresponding_files = rename_files(corresponding_files, patient_number, pentacam_directory)

        move_files(corresponding_files, pentacam_directory, pentacam_final_directory)

['Silva_Odilon Soares_OD_06072024_091846_4 Maps Refr.JPG', 'Silva_Odilon Soares_OS_06072024_092002_4 Maps Refr.JPG']
C:/luis/Iso_olhos/dados/dados_pentacam\17123_Silva_Odilon Soares_OD_06072024_091846_4 Maps Refr.JPG
C:/luis/Iso_olhos/dados/dados_pentacam\17123_Silva_Odilon Soares_OS_06072024_092002_4 Maps Refr.JPG
['Nascimento_Osvaldo Horbilon_OS_26062024_143915_4 Maps Refr.JPG']
C:/luis/Iso_olhos/dados/dados_pentacam\116752_Nascimento_Osvaldo Horbilon_OS_26062024_143915_4 Maps Refr.JPG
['Silva_Oswaldo Augusto _OS_01022024_095806_4 Maps Refr.JPG']
C:/luis/Iso_olhos/dados/dados_pentacam\40309_Silva_Oswaldo Augusto _OS_01022024_095806_4 Maps Refr.JPG


In [52]:
# Flag each row with whether at least one file in the identified Pentacam
# folder matches the patient's name or number
df_principal['tem_pentacam'] = df_principal.apply(
    lambda patient : bool(find_matching_files(patient['nome'], str(patient['pront']), pentacam_final_directory)),
    axis = 1 # Apply to each row
)

# Preview the first rows of the patient table
df_principal.head()

Unnamed: 0,pront,nome,sexo,olho operado,k1,eixo plano,k2,eixo curvo,al,acd,...,astig refracional pré op,astig anterior topo pré op,astig posterior topo pré,lente indicada,lente implantada,refração pós operatória,astig refracional pós op,classificação astigmatismo,tem_exame,tem_pentacam
0,121476,ADEMIR MARTINS PEIXOTO,Masculino,Direito,42.92,92.0,46.72,2.0,22.67,2.98,...,-4.0,2.6,0.1,T9 +23.0,"T8 +23,5 a 2°",plano,0.0,Regular assimétrico contra a regra,True,True
1,114679,ADRIANA DIAS DA CUNHA,Feminino,Direito,40.67,3.0,42.75,93.0,24.57,2.94,...,-1.0,0.8,0.4,"T3 +20,0","T3 +21,50 a 80°",0.5,0.0,Regular simétrico a favor da regra,True,True
2,71580,ALAIDE PEREIRA LIMA AGUIAR,Feminino,Direito,44.59,106.0,45.62,16.0,23.59,3.53,...,-1.0,0.9,0.1,"T4 +19,50","T4 +19,50 a 5°","+1,00 -0,50 125º",-0.5,Regular simétrico contra a regra,True,True
3,65154,ALAOR DE OLIVEIRA PINHAL,Masculino,Direito,,,,,,,...,-2.0,0.8,0.1,,"T5 +22,50 a 160°","plano -0,75 150º",-0.75,Regular assimétrico contra a regra,False,True
4,65154,ALAOR DE OLIVEIRA PINHAL,Masculino,Esquerdo,,,,,,,...,-1.75,1.0,0.0,,"T5 +22,0 a 180°",plano,0.0,Regular simetrico contra a regra,False,True
