In [1]:
import os
import re
import cv2
import math
import time
import json
import random
import numpy as np
import polars as pl
import supervision as sv
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from PIL import Image, ImageFile, ImageDraw
from ultralytics import YOLO
from scipy.optimize import least_squares
from math import atan2, degrees, degrees, radians, sin, cos, pi
from shapely.geometry import MultiPolygon, Polygon

# Flip Pillow's module-level LOAD_TRUNCATED_IMAGES flag so that image files
# whose data is cut short are still loaded (per the Pillow docs) instead of
# aborting the run on the first damaged file.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [2]:
def get_img_index(filename):
    """Extract the numeric index that sits directly before the file extension.

    Only ``.jpg`` and ``.txt`` names are recognised. The index is the run of
    three or four digits immediately preceding the extension (for longer digit
    runs, the last four digits are used).

    Args:
        filename: File name or path, e.g. ``"meme_0123.jpg"``.

    Returns:
        The index as an ``int``, or ``None`` when the name has a different
        extension or does not end in a 3-4 digit number.
    """
    # One pattern covers both supported extensions; ``\Z`` anchors at the true
    # end of the string so a trailing newline is not silently accepted.
    match = re.search(r'(\d{3,4})\.(?:jpg|txt)\Z', filename)
    if match is None:
        # Previously this case raised AttributeError on ``match.group``.
        return None
    return int(match.group(1))
    
def save_coordinates_txt_format(polygons, file_path):
    """Write polygons to a text file, one polygon per line.

    Each line starts with the literal ``0`` followed by the space-separated
    first and second item of every point of that polygon.

    Args:
        polygons: Iterable of polygons; each polygon is an iterable of
            indexable points (``point[0]``, ``point[1]``).
        file_path: Destination path; the file is overwritten.
    """
    with open(file_path, 'w') as out:
        for points in polygons:
            coords = "".join(f" {pt[0]} {pt[1]}" for pt in points)
            out.write("0" + coords + "\n")
            
def check_no_chars(results):
    """Report whether the first entry of ``results`` is empty.

    Prints ``"No chars"`` (preceded by a blank line) when it is.

    Args:
        results: Indexable whose first item supports ``len()``.

    Returns:
        ``True`` if ``results[0]`` has length zero, otherwise ``False``.
    """
    nothing_found = len(results[0]) == 0
    if nothing_found:
        print("\nNo chars")
    return nothing_found

def load_detection_model(model_path):
    """Build the YOLO text detection model and run one timed dummy prediction.

    After constructing the model, a single prediction is run on an all-zero
    128x128x3 ``uint8`` image (presumably as a warm-up) and the elapsed
    wall-clock time is printed. The prediction output itself is discarded.

    Args:
        model_path: Path handed to ``YOLO(...)``.

    Returns:
        The constructed ``YOLO`` model.
    """
    detector = YOLO(model_path)
    # GPU placement is intentionally left disabled: detector.to("cuda")
    print("Text detection model loaded")

    blank_frame = np.zeros((128, 128, 3), dtype=np.uint8)
    started_at = time.time()
    detector.predict(source=blank_frame, imgsz=128, conf=float(0.4), verbose=False)
    elapsed_time = time.time() - started_at
    print('Done infer! Took {} seconds'.format(elapsed_time))
    return detector

def run_detection(model, image):
    """Run text detection on one image.

    Args:
        model: Object exposing ``predict(image, conf=..., verbose=...)``.
        image: Input forwarded unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns, called with a confidence
        threshold of 0.35 and ``verbose=False``.
    """
    return model.predict(image, conf=float(0.35), verbose=False)

def indent(elem, level=0):
    """Recursively insert newline/indent whitespace into an element tree, in place.

    Only ``text``/``tail`` values that are missing or whitespace-only are
    overwritten, so real text content is left untouched. Two spaces are used
    per nesting level.

    Args:
        elem: Element (with ``text``, ``tail``, iteration and ``len``) to format.
        level: Nesting depth of ``elem``; 0 for the root.
    """
    step = "  "
    own_break = "\n" + level * step

    if len(elem) == 0:
        # Leaf: only non-root leaves get a tail, and only if it is blank.
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = own_break
        return

    if not elem.text or not elem.text.strip():
        # Whitespace before the first child sits one level deeper.
        elem.text = own_break + step
    if not elem.tail or not elem.tail.strip():
        elem.tail = own_break

    final_child = None
    for child in elem:
        indent(child, level + 1)
        final_child = child

    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own indentation.
    if not final_child.tail or not final_child.tail.strip():
        final_child.tail = own_break

def pretty_print_xml_elementtree(xml_string):
   """Return an indented version of the given XML document.

   Despite the name nothing is printed: the input is parsed, whitespace is
   inserted by ``indent``, and the tree is serialised back to a ``str``.

   Args:
      xml_string: XML document accepted by ``ET.fromstring``.

   Returns:
      The re-serialised, indented XML as a unicode string.
   """
   tree_root = ET.fromstring(xml_string)
   indent(tree_root)
   return ET.tostring(tree_root, encoding="unicode")

def write_to_xml(image_i, all_chars, class_to_char, image_file, xml_path):
    """Write the detected characters of one image to an XML annotation file.

    The document has an ``annotation`` root holding, in order: a ``filename``
    element, one ``object`` per row of ``all_chars`` (``label``, ``name`` and
    a ``robndbox`` with ``cx``, ``cy``, ``h``, ``w``), and a ``size`` element
    with the image ``height`` and ``width``.

    Args:
        image_i: Image object whose ``size`` gives ``(width, height)`` by index.
        all_chars: Iterable of 5-item rows ``(label, cx, cy, w, h)``.
        class_to_char: Mapping from label to the character name to store.
        image_file: Text stored in the ``filename`` element.
        xml_path: Destination file, written as UTF-8.

    Raises:
        KeyError: If a row's label is missing from ``class_to_char``.
    """
    def add_text_child(parent, tag, value):
        # Create <tag>value</tag> under parent.
        child = ET.SubElement(parent, tag)
        child.text = value
        return child

    dims = image_i.size
    img_width, img_height = dims[0], dims[1]

    annotation = ET.Element("annotation")
    add_text_child(annotation, "filename", image_file)

    # One <object> per detected character.
    for label, cx, cy, w, h in all_chars:
        char_name = class_to_char[label]

        obj = ET.SubElement(annotation, "object")
        add_text_child(obj, "label", str(label))
        add_text_child(obj, "name", char_name)

        robndbox = ET.SubElement(obj, "robndbox")
        for tag, value in (("cx", cx), ("cy", cy), ("h", h), ("w", w)):
            add_text_child(robndbox, tag, str(value))

    size = ET.SubElement(annotation, "size")
    add_text_child(size, "height", str(img_height))
    add_text_child(size, "width", str(img_width))

    # Serialise, re-indent, and store on disk.
    xml_to_write = pretty_print_xml_elementtree(ET.tostring(annotation))
    with open(xml_path, "w", encoding="utf-8") as f:
        f.write(xml_to_write)


In [3]:
# Run character detection on every test image and store the detections of
# each image as an XML annotation file next to it.
start_time_1 = time.time()

base_path = "/home/marci/Documents/gits/persuasion-techniques-in-memes" # your path to persuasion-techniques-in-memes
model_path = os.path.join(base_path, "team_2/data/preprocessing_model/yolo11n_best.pt")
test_images_path = os.path.join(base_path, "team_2/data/raw/yolo_test_dataset")

# Load the json file with id - char pairs
file_path = os.path.join(base_path, "team_2/data/other/chars_mapping.json")

with open(file_path, 'r') as json_file:
    char_to_class = json.load(json_file)

# Invert the mapping so a predicted class id can be turned into its character.
class_to_char = {v: k for k, v in char_to_class.items()}

# Keep each image's path relative to test_images_path so that files found in
# sub-directories by os.walk still resolve to the right location.
image_files = [os.path.relpath(os.path.join(root, name), test_images_path)
               for root, dirs, files in os.walk(test_images_path)
               for name in files
               if name.endswith((".jpg", ".jpeg", ".png"))]

sorted_image_files = sorted(image_files)

model = load_detection_model(model_path)

for image_file in sorted_image_files:
    # Set up paths; splitext handles extensions of any length (".jpeg" too).
    image_path = os.path.join(test_images_path, image_file)
    xml_path = os.path.splitext(image_path)[0] + ".xml"

    pil_image = Image.open(image_path).convert("RGB")
    image = np.array(pil_image)

    # Perform detection
    results = run_detection(model, image)

    # Collect this image's detections from scratch so that nothing leaks in
    # from previously processed images.
    char_rows = []
    if not check_no_chars(results):
        for result in results:
            boxes = result.boxes
            print(len(boxes))
            for box in boxes:
                b = box.xywh[0].tolist()
                c = box.cls.tolist()
                # Row layout: class id followed by the four xywh values,
                # truncated to integers.
                char_rows.append(np.array(c + b).astype(int))

    # Always an (n, 5) integer array; n == 0 when nothing was detected, in
    # which case the XML is written without any <object> entries.
    all_chars = np.array(char_rows, dtype=int).reshape(-1, 5)

    write_to_xml(pil_image, all_chars, class_to_char,
                 os.path.basename(image_file), xml_path)


end_time_1 = time.time()
elapsed_time_1 = end_time_1 - start_time_1
print('Whole process took {} seconds'.format(elapsed_time_1))

Text detection model loaded
Done infer! Took 1.0189740657806396 seconds
3
9
18
18
18
7
19
15
1
13
2

No chars
3
7
7
10
7

No chars
2
11
19
27
14
17
40
9
26
17
19
9
22
12
15
14
5
7
15
7
7
30
8
21
19
37
29
6
32
30
24
23
23
14
20
11
15
12
16
18
12
14
21
1
14
5
16
16
13
7
6
18
11
24
11

No chars

No chars
7
27
17
18
17
22
5
28
6
17
7
13
1
12
12
14
16
3
23
5
17
9
9
4
5
25
12
11
19
26
11
1
6
13
3
15
5
10
23
26
18
5
7
2
7
5
8
11
8
1
14
4
16
11
13
19
22
23
11

No chars

No chars
29
19
10
35
3
3
21
17
10
11
17
5
28
24
19
3
19
16
4
4
27
4
15
19
20
26
19
17
12
2
19
11
18
3
29
11
16
22
5
6
14
24
17
22
3
8
14
13
2
12
8
6

No chars
14

No chars
13
8
8
12
2
17
11
20
22
17
1
9
28
25
10
6
5
19
17
7
14
13
5
23
10
7
12
1
14
20
7
20
27
9
8
3
23
7
16
8
7
9
14
12
1
3
13
11
26
19
8
15
7
24
9
16
4
5
2
8
6
10
6
Whole process took 54.267478942871094 seconds


In [4]:
# Show the class-id -> character lookup obtained by inverting the mapping
# loaded from chars_mapping.json.
class_to_char

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: 'A',
 11: 'B',
 12: 'C',
 13: 'D',
 14: 'E',
 15: 'F',
 16: 'G',
 17: 'H',
 18: 'I',
 19: 'J',
 20: 'K',
 21: 'L',
 22: 'M',
 23: 'N',
 24: 'O',
 25: 'P',
 26: 'Q',
 27: 'R',
 28: 'S',
 29: 'T',
 30: 'U',
 31: 'V',
 32: 'W',
 33: 'X',
 34: 'Y',
 35: 'Z',
 36: 'a',
 37: 'b',
 38: 'c',
 39: 'd',
 40: 'e',
 41: 'f',
 42: 'g',
 43: 'h',
 44: 'i',
 45: 'j',
 46: 'k',
 47: 'l',
 48: 'm',
 49: 'n',
 50: 'o',
 51: 'p',
 52: 'q',
 53: 'r',
 54: 's',
 55: 't',
 56: 'u',
 57: 'v',
 58: 'w',
 59: 'x',
 60: 'y',
 61: 'z',
 62: '"',
 63: '&',
 64: '!',
 65: '%',
 66: "'",
 67: '.',
 68: '+',
 69: '-',
 70: ',',
 71: '*',
 72: '?',
 73: '..'}