In [1]:
from paddleocr import PaddleOCR, draw_ocr
import os,glob,csv,math,json,re,math,cv2,slideio
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from IPython.display import display, HTML
from fuzzywuzzy import fuzz
from utils import show_images
from tqdm import tqdm

In [2]:
# Setup model
ocr_model = PaddleOCR(lang='en',use_angle_cls = False,show_log=False)

In [3]:
import logging
from ppocr.utils.logging import get_logger as ppocr_get_logger
ppocr_get_logger().setLevel(logging.ERROR)

In [4]:
# Open our folder and list the number of files inside
folder_name,type = 'A20-169','svs' # Specify the folder name (first param) and file type (second param)
folder = glob.glob(f"{folder_name}/*.{type}")
print(f"{len(folder)} {type} files identified.")

72 svs files identified.


In [5]:
columns = ['participant_id', 'stain_id', 'brain_region','label']
csv_filename = f"{folder_name}.csv"  # Change this to your desired file name

with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(columns)

In [6]:
label_count = 0

for file in tqdm(folder):
    slide = slideio.open_slide(file)
    scene = slide.get_scene(0)
    # print(scene)
    # This part gets the image per slide
    image_names = slide.get_aux_image_names()
    images = []
    for name in image_names:
        if name == 'Label':
            image = slide.get_aux_image_raster(name)
            images.append(image)  
            rotated_image = np.rot90(image, k=-1) # Capture a rotated image of the label
            result = ocr_model.ocr(rotated_image) # Process rotated image with Paddle
            # Build a list containing all chunks identified by Paddle 
            text_list = []
            for result_group in result:
                for text_region in result_group:
                    text, confidence = text_region[1]  # Extract text and confidence
                    # print("Text:", text); print("Confidence:", confidence)
                    text_list.append(text)
                    # Coordinates are stored in text_region[0] if needed
            garbage = ['starfrost', 'adrc']  # Clean the result a little
            text_list = [thing for thing in text_list if thing.lower() not in garbage]
            # Phase 1: Gather the Participant ID:
            participant_pattern = r'([A-Z0-9]{3})-([A-Z0-9]{3})'
            participant_matches = [re.search(participant_pattern, i) for i in text_list if re.search(participant_pattern, i)]
            participant_id = [match[0] for match in participant_matches]
            participant_id = participant_id[0]; print(f"Participant ID: {participant_id}")
            # Phase 2: Gather the Stain ID:
            predefined_choices = ['LB509', 'HE', 'PHF-1', 'TDP-43', '10D5'] # Stain Choices
            stain_id = None
            best_similarity = 0
            for i in text_list:
                for choice in predefined_choices:
                    similarity = fuzz.ratio(i, choice)  # Calculate Lebenshtein distance
                    if similarity > best_similarity:
                        best_similarity = similarity
                        stain_id = choice
            print(f"Stain ID: {stain_id}")
            # Phase 2.5 (cleaning):
            prune = [item for item in text_list if item.lower() not in (stain_id.lower(), participant_id[0].lower())]
            text_list = prune # Clean the result a bit more 2 of 3
            # Phase 3: Gather the Brain Region ID:
            brain_region_pattern = r'\bL\d+[A-Z]?\b'
            brain_region_matchs = [re.search(brain_region_pattern, i) for i in text_list if re.search(brain_region_pattern, i)]
            brain_region = [match[0] for match in brain_region_matchs]
            brain_region = brain_region[0]; print(f"Brain region match: {brain_region}")
            # Phase 3.5 (cleaning):
            prune = [item.replace(brain_region[0], "").strip() for item in text_list]
            text_list = prune  # Clean the result a bit more 3 of 3
            # Phase 4 Print Results: 
            label_count += 1
            # Write the rows to the CSV file
            with open(f'{folder_name}.csv', mode='a', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                label = f"{participant_id}_{brain_region}_{stain_id}"
                csv_writer.writerow([participant_id, stain_id, brain_region, label])
            # Phase 5 Rename Each File:
            new_name = os.path.join(os.path.dirname(file), label + ".svs")
            os.rename(file, new_name)

  1%|█▎                                                                                              | 1/72 [00:02<02:23,  2.02s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L3


  3%|██▋                                                                                             | 2/72 [00:03<01:46,  1.52s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L3


  4%|████                                                                                            | 3/72 [00:04<01:46,  1.54s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L3


  6%|█████▎                                                                                          | 4/72 [00:05<01:32,  1.36s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L4


  7%|██████▋                                                                                         | 5/72 [00:07<01:27,  1.31s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L4


  8%|████████                                                                                        | 6/72 [00:08<01:28,  1.34s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L4


 10%|█████████▎                                                                                      | 7/72 [00:09<01:23,  1.28s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L4


 11%|██████████▋                                                                                     | 8/72 [00:10<01:21,  1.27s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L4


 12%|████████████                                                                                    | 9/72 [00:11<01:13,  1.17s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L23


 14%|█████████████▏                                                                                 | 10/72 [00:12<01:12,  1.17s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L23


 15%|██████████████▌                                                                                | 11/72 [00:14<01:12,  1.18s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L23


 17%|███████████████▊                                                                               | 12/72 [00:15<01:09,  1.16s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L23


 18%|█████████████████▏                                                                             | 13/72 [00:16<01:13,  1.25s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L23


 19%|██████████████████▍                                                                            | 14/72 [00:17<01:10,  1.21s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L5


 21%|███████████████████▊                                                                           | 15/72 [00:19<01:15,  1.32s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L5


 22%|█████████████████████                                                                          | 16/72 [00:20<01:15,  1.35s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L5


 24%|██████████████████████▍                                                                        | 17/72 [00:22<01:13,  1.34s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L5


 25%|███████████████████████▊                                                                       | 18/72 [00:23<01:12,  1.34s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L5


 26%|█████████████████████████                                                                      | 19/72 [00:24<01:06,  1.26s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L6


 28%|██████████████████████████▍                                                                    | 20/72 [00:25<01:03,  1.21s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L6


 29%|███████████████████████████▋                                                                   | 21/72 [00:26<01:01,  1.21s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L6


 31%|█████████████████████████████                                                                  | 22/72 [00:28<00:59,  1.20s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L6


 32%|██████████████████████████████▎                                                                | 23/72 [00:29<00:57,  1.18s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L6


 33%|███████████████████████████████▋                                                               | 24/72 [00:30<00:55,  1.15s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L17


 35%|████████████████████████████████▉                                                              | 25/72 [00:31<00:54,  1.15s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L17


 36%|██████████████████████████████████▎                                                            | 26/72 [00:32<00:54,  1.18s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L17


 38%|███████████████████████████████████▋                                                           | 27/72 [00:34<00:57,  1.27s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L17


 39%|████████████████████████████████████▉                                                          | 28/72 [00:35<00:55,  1.25s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L17


 40%|██████████████████████████████████████▎                                                        | 29/72 [00:36<00:53,  1.25s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L8


 42%|███████████████████████████████████████▌                                                       | 30/72 [00:37<00:52,  1.24s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L8


 43%|████████████████████████████████████████▉                                                      | 31/72 [00:39<00:57,  1.41s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L8


 44%|██████████████████████████████████████████▏                                                    | 32/72 [00:40<00:53,  1.33s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L8


 46%|███████████████████████████████████████████▌                                                   | 33/72 [00:42<00:50,  1.29s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L8


 47%|████████████████████████████████████████████▊                                                  | 34/72 [00:42<00:44,  1.16s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L9


 49%|██████████████████████████████████████████████▏                                                | 35/72 [00:44<00:44,  1.21s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L9


 50%|███████████████████████████████████████████████▌                                               | 36/72 [00:45<00:42,  1.18s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L9


 51%|████████████████████████████████████████████████▊                                              | 37/72 [00:46<00:40,  1.16s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L9


 53%|██████████████████████████████████████████████████▏                                            | 38/72 [00:47<00:39,  1.16s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L9


 54%|███████████████████████████████████████████████████▍                                           | 39/72 [00:48<00:35,  1.09s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L11


 56%|████████████████████████████████████████████████████▊                                          | 40/72 [00:49<00:36,  1.13s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L11


 57%|██████████████████████████████████████████████████████                                         | 41/72 [00:50<00:35,  1.15s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L11


 58%|███████████████████████████████████████████████████████▍                                       | 42/72 [00:52<00:34,  1.14s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L11


 60%|████████████████████████████████████████████████████████▋                                      | 43/72 [00:53<00:33,  1.16s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L11


 61%|██████████████████████████████████████████████████████████                                     | 44/72 [00:54<00:32,  1.17s/it]

Participant ID: A20-169
Stain ID: HE
Brain region match: L12


 62%|███████████████████████████████████████████████████████████▍                                   | 45/72 [00:55<00:31,  1.17s/it]

Participant ID: A20-169
Stain ID: 10D5
Brain region match: L12


 64%|████████████████████████████████████████████████████████████▋                                  | 46/72 [00:56<00:30,  1.17s/it]

Participant ID: A20-169
Stain ID: PHF-1
Brain region match: L12


 65%|██████████████████████████████████████████████████████████████                                 | 47/72 [00:57<00:29,  1.16s/it]

Participant ID: A20-169
Stain ID: LB509
Brain region match: L12


 67%|███████████████████████████████████████████████████████████████▎                               | 48/72 [00:59<00:29,  1.23s/it]

Participant ID: A20-169
Stain ID: TDP-43
Brain region match: L12


 67%|███████████████████████████████████████████████████████████████▎                               | 48/72 [01:00<00:30,  1.26s/it]

Participant ID: A20-169
Stain ID: HE





IndexError: list index out of range