In [9]:
import torch

In [20]:
input_data = [
    {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
    {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
    {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
    {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
    {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
    {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
    {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
    {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
]

In [10]:
mask_dict = torch.load(f"boureisougi_masks.pth")

In [28]:
mask_dict['boureisougi_002']

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [24]:
image_path = 'boureisougi_002.jpg'

In [25]:
import numpy as np
from PIL import Image
import torch
import os

# Using your existing variables
# mask_dict = torch.load(f"boureisuogi_masks.pth")
# image_path = 'boureisuogi_002.jpg'

# Get the mask for this specific image
mask = mask_dict['boureisougi_002']

# Open and convert the image
img = Image.open(image_path).convert("RGB")
img_array = np.array(img)

# Apply the mask (turn text pixels white)
img_array[mask == 1] = [255, 255, 255]

# Convert back to PIL Image
masked_img = Image.fromarray(img_array)

# Save the result
output_path = image_path.replace('.jpg', '_masked.jpg')
masked_img.save(output_path)

print(f"Masked image saved to: {output_path}")

Masked image saved to: boureisougi_002_masked.jpg


In [26]:
import numpy as np
from PIL import Image, ImageDraw
import torch
import cv2

def improved_text_masking(image_path, mask_tensor, bubble_coordinates, margin=5, dilation_size=2, output_path=None):
    """
    Improved text masking that combines pixel-level mask with bubble coordinates.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    bubble_coordinates : list of tuple
        List of (x, y, w, h) coordinates for each speech bubble
    margin : int
        Extra margin to add around detected text regions
    dilation_size : int
        Size of dilation kernel to expand the text mask
    output_path : str, optional
        Path to save the masked image
        
    Returns:
    --------
    PIL.Image
        The masked image with improved text removal
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create a copy for the final output
    final_img = img_array.copy()
    
    # Process each bubble
    for bubble in bubble_coordinates:
        x, y, w, h = bubble
        
        # Extract the bubble region from the mask
        bubble_mask = mask[y:y+h, x:x+w]
        
        # Skip if no text in this bubble
        if np.sum(bubble_mask) == 0:
            continue
            
        # Apply dilation to expand text regions and connect nearby text
        kernel = np.ones((dilation_size, dilation_size), np.uint8)
        dilated_mask = cv2.dilate(bubble_mask.astype(np.uint8), kernel, iterations=2)
        
        # Find connected components in the dilated mask
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_mask, connectivity=8)
        
        # Process each text component
        for i in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[i, cv2.CC_STAT_LEFT]
            cy = stats[i, cv2.CC_STAT_TOP]
            cw = stats[i, cv2.CC_STAT_WIDTH]
            ch = stats[i, cv2.CC_STAT_HEIGHT]
            
            # Add margin
            cx = max(0, cx - margin)
            cy = max(0, cy - margin)
            cw = min(w - cx, cw + 2 * margin)
            ch = min(h - cy, ch + 2 * margin)
            
            # Create a rectangle mask for this component
            component_mask = np.zeros_like(bubble_mask)
            component_mask[cy:cy+ch, cx:cx+cw] = 1
            
            # Apply the component mask to the bubble region in the final image
            # Convert local coordinates to global
            global_mask = np.zeros_like(mask)
            global_mask[y+cy:y+cy+ch, x+cx:x+cx+cw] = 1
            
            # Apply mask to image
            final_img[global_mask == 1] = [255, 255, 255]
    
    # Convert back to PIL Image
    masked_img = Image.fromarray(final_img)
    
    # Save if output path provided
    if output_path:
        masked_img.save(output_path)
        print(f"Masked image saved to: {output_path}")
    
    return masked_img

def find_text_regions_in_bubbles(mask_tensor, bubble_coordinates, min_text_area=10):
    """
    Find rectangular regions that contain text within each speech bubble.
    
    Parameters:
    -----------
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    bubble_coordinates : list of tuple
        List of (x, y, w, h) coordinates for each speech bubble
    min_text_area : int
        Minimum area for a text region to be considered
        
    Returns:
    --------
    list of tuple
        List of (bubble_index, x, y, w, h) for each detected text region
    """
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
        
    text_regions = []
    
    # Process each bubble
    for i, bubble in enumerate(bubble_coordinates):
        x, y, w, h = bubble
        
        # Extract the bubble region from the mask
        bubble_mask = mask[y:y+h, x:x+w]
        
        # Skip if no text in this bubble
        if np.sum(bubble_mask) == 0:
            continue
            
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_mask = cv2.dilate(bubble_mask.astype(np.uint8), kernel, iterations=1)
        
        # Find connected components in the dilated mask
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_mask, connectivity=8)
        
        # Process each text component
        for j in range(1, num_labels):  # Skip background (0)
            # Get component stats
            area = stats[j, cv2.CC_STAT_AREA]
            
            # Skip small noise
            if area < min_text_area:
                continue
                
            # Get component bounding box
            cx = stats[j, cv2.CC_STAT_LEFT]
            cy = stats[j, cv2.CC_STAT_TOP]
            cw = stats[j, cv2.CC_STAT_WIDTH]
            ch = stats[j, cv2.CC_STAT_HEIGHT]
            
            # Convert to global coordinates
            global_x = x + cx
            global_y = y + cy
            
            # Add to text regions
            text_regions.append((i, global_x, global_y, cw, ch))
    
    return text_regions

# Example usage
if __name__ == "__main__":
    # Example inputs
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Example bubble coordinates (x, y, w, h)
    bubble_coordinates = [
        (100, 200, 150, 100),  # Example bubble 1
        (300, 150, 200, 120),  # Example bubble 2
    ]
    
    # Find text regions within bubbles
    text_regions = find_text_regions_in_bubbles(mask, bubble_coordinates)
    print(f"Found {len(text_regions)} text regions")
    
    # Apply improved masking
    output_path = "masked_manga.jpg"
    masked_img = improved_text_masking(image_path, mask, bubble_coordinates, margin=5, output_path=output_path)
    
    # Visualize the detected text regions (optional)
    img = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    
    for i, x, y, w, h in text_regions:
        # Draw rectangle around text region
        draw.rectangle([x, y, x+w, y+h], outline="red")
    
    img.save("detected_text_regions.jpg")
    print("Text regions visualization saved")

FileNotFoundError: [Errno 2] No such file or directory: 'mask_dict.pth'

In [27]:
import numpy as np
from PIL import Image, ImageDraw
import torch
import cv2

def mask_text_with_bubbles(image_path, mask_tensor, input_data, margin=5, output_path=None):
    """
    Mask text in manga using both pixel mask and speech bubble coordinates.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data with keys 'x', 'y', 'w', 'h', 'text_ja', 'text_en'
    margin : int
        Extra margin to add around text regions
    output_path : str, optional
        Path to save the masked image
        
    Returns:
    --------
    PIL.Image
        The masked image with text removed
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create a drawing image for debugging
    debug_img = img.copy()
    draw = ImageDraw.Draw(debug_img)
    
    # Create output image
    result_img = img_array.copy()
    
    # Process each bubble
    for bubble in input_data:
        x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
        
        # Draw original bubble outline
        draw.rectangle([x, y, x+w, y+h], outline="blue", width=1)
        
        # Create a mask for this bubble region
        bubble_region = np.zeros_like(mask)
        bubble_region[y:y+h, x:x+w] = 1
        
        # Get the text mask within this bubble
        text_in_bubble = np.logical_and(mask == 1, bubble_region == 1)
        
        # Skip if no text in this bubble
        if np.sum(text_in_bubble) == 0:
            continue
        
        # Find connected components (text clusters)
        text_img = text_in_bubble.astype(np.uint8) * 255
        
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_text = cv2.dilate(text_img, kernel, iterations=1)
        
        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_text, connectivity=8)
        
        # Process each text component
        for i in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[i, cv2.CC_STAT_LEFT]
            cy = stats[i, cv2.CC_STAT_TOP]
            cw = stats[i, cv2.CC_STAT_WIDTH]
            ch = stats[i, cv2.CC_STAT_HEIGHT]
            
            # Expand with margin
            cx_expanded = max(0, cx - margin)
            cy_expanded = max(0, cy - margin)
            cw_expanded = cw + 2 * margin
            ch_expanded = ch + 2 * margin
            
            # Draw the expanded text region
            draw.rectangle([cx_expanded, cy_expanded, cx_expanded+cw_expanded, cy_expanded+ch_expanded], 
                          outline="red", width=1)
            
            # Create a mask for the expanded text region
            text_region = np.zeros_like(mask)
            text_region[cy_expanded:cy_expanded+ch_expanded, cx_expanded:cx_expanded+cw_expanded] = 1
            
            # Apply white color to the text region
            result_img[text_region == 1] = [255, 255, 255]
    
    # Save debug image
    debug_img.save(image_path.replace('.jpg', '_debug.jpg'))
    
    # Create output image
    final_img = Image.fromarray(result_img)
    
    if output_path:
        final_img.save(output_path)
        print(f"Masked image saved to: {output_path}")
    
    return final_img

# Example usage
if __name__ == "__main__":
    # Your variables
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Your input data
    input_data = [
        {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
        {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
        {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
        {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
        {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
        {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
        {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
        {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
    ]
    
    # Apply masking
    output_path = "masked_manga.jpg"
    masked_img = mask_text_with_bubbles(image_path, mask, input_data, margin=5, output_path=output_path)

Masked image saved to: masked_manga.jpg


In [29]:
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import torch
import cv2

def mask_text_and_get_regions(image_path, mask_tensor, input_data, margin=5):
    """
    Mask text in manga and return the optimized text regions for each bubble.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data with keys 'x', 'y', 'w', 'h', 'text_ja', 'text_en'
    margin : int
        Extra margin to add around text regions
        
    Returns:
    --------
    tuple
        (masked_image, optimized_regions)
        - masked_image: PIL.Image with text removed
        - optimized_regions: list of dicts with optimized text placement info
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create output image
    result_img = img_array.copy()
    
    # Create optimized text regions list
    optimized_regions = []
    
    # Process each bubble
    for i, bubble in enumerate(input_data):
        x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
        
        # Create a mask for this bubble region
        bubble_region = np.zeros_like(mask)
        bubble_region[y:y+h, x:x+w] = 1
        
        # Get the text mask within this bubble
        text_in_bubble = np.logical_and(mask == 1, bubble_region == 1)
        
        # Skip if no text in this bubble
        if np.sum(text_in_bubble) == 0:
            # Still add the original bubble to optimized regions
            optimized_regions.append({
                "original_index": i,
                "x": x, 
                "y": y, 
                "w": w, 
                "h": h,
                "text_en": bubble["text_en"],
                "optimized": False
            })
            continue
        
        # Find connected components (text clusters)
        text_img = text_in_bubble.astype(np.uint8) * 255
        
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_text = cv2.dilate(text_img, kernel, iterations=1)
        
        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_text, connectivity=8)
        
        # Skip if no significant components found
        if num_labels <= 1:
            optimized_regions.append({
                "original_index": i,
                "x": x, 
                "y": y, 
                "w": w, 
                "h": h,
                "text_en": bubble["text_en"],
                "optimized": False
            })
            continue
        
        # Get the bounding box that contains all text components
        min_x = float('inf')
        min_y = float('inf')
        max_x = 0
        max_y = 0
        
        for j in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[j, cv2.CC_STAT_LEFT]
            cy = stats[j, cv2.CC_STAT_TOP]
            cw = stats[j, cv2.CC_STAT_WIDTH]
            ch = stats[j, cv2.CC_STAT_HEIGHT]
            
            # Update min/max coordinates
            min_x = min(min_x, cx)
            min_y = min(min_y, cy)
            max_x = max(max_x, cx + cw)
            max_y = max(max_y, cy + ch)
            
            # Apply white color to this component with margin
            component_mask = np.zeros_like(mask)
            
            # Add margin
            cx_expanded = max(0, cx - margin)
            cy_expanded = max(0, cy - margin)
            cw_expanded = cw + 2 * margin
            ch_expanded = ch + 2 * margin
            
            # Apply expanded component mask
            component_mask[cy_expanded:cy_expanded+ch_expanded, cx_expanded:cx_expanded+cw_expanded] = 1
            
            # Apply white to result image
            result_img[component_mask == 1] = [255, 255, 255]
        
        # Calculate the optimized text region coordinates
        opt_x = x + min_x
        opt_y = y + min_y
        opt_w = max_x - min_x
        opt_h = max_y - min_y
        
        # Add margins to optimized region
        opt_x = max(x, opt_x - margin)
        opt_y = max(y, opt_y - margin)
        opt_w = min(w - (opt_x - x), opt_w + 2 * margin)
        opt_h = min(h - (opt_y - y), opt_h + 2 * margin)
        
        # Add optimized region to list
        optimized_regions.append({
            "original_index": i,
            "x": opt_x, 
            "y": opt_y, 
            "w": opt_w, 
            "h": opt_h,
            "text_en": bubble["text_en"],
            "optimized": True
        })
    
    # Create final masked image
    masked_img = Image.fromarray(result_img)
    
    return masked_img, optimized_regions

def format_text_for_bubble(text, max_width, max_height, font_path, base_size=22):
    """
    Format text to fit within a bubble with line wrapping and font size adjustment.
    """
    draw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
    font = ImageFont.truetype(font_path, base_size)
    
    # 1. First attempt simple text wrapping
    words = text.split()
    lines = []
    current_line = []
    
    for word in words:
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        text_width = bbox[2] - bbox[0]
        
        if text_width <= max_width:
            current_line.append(word)
        else:
            if current_line:
                lines.append(' '.join(current_line))
                current_line = [word]
            else:
                # If the word is too long, consider breaking the word or adjusting the font size
                current_line = [word]
    
    if current_line:
        lines.append(' '.join(current_line))
    
    # 2. Check if the total height fits
    total_height = 0
    line_spacing = base_size * 0.3  # 30% line spacing
    
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        total_height += (bbox[3] - bbox[1]) + line_spacing

    # 3. If height exceeds, consider reducing font size appropriately
    if total_height > max_height:
        # Reduce size proportionally based on overflow ratio
        ratio = max_height / total_height
        new_size = max(int(base_size * ratio * 0.95), 10)  # Minimum size of 10
        return format_text_for_bubble(text, max_width, max_height, font_path, new_size)
    
    return lines, base_size

def add_translated_text(image, text_regions, font_path, base_size=22):
    """
    Add translated text to the masked image.
    
    Parameters:
    -----------
    image : PIL.Image
        Masked image
    text_regions : list of dict
        List of optimized text regions
    font_path : str
        Path to font file
    base_size : int
        Base font size
        
    Returns:
    --------
    PIL.Image
        Image with translated text added
    """
    draw = ImageDraw.Draw(image)
    
    for region in text_regions:
        x, y = region["x"], region["y"]
        w, h = region["w"], region["h"]
        text = region["text_en"]
        
        # Format text to fit in this region
        lines, final_size = format_text_for_bubble(
            text,
            w * 0.9,  # 90% of region width
            h * 0.9,  # 90% of region height
            font_path,
            base_size
        )
        
        # Load the font at the final determined size
        font = ImageFont.truetype(font_path, final_size)
        
        # Calculate total text height (for vertical centering)
        total_height = 0
        line_spacing = final_size * 0.3
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            total_height += (bbox[3] - bbox[1]) + line_spacing
        total_height -= line_spacing  # Subtract the line spacing added for the last line
        
        # Calculate the starting y-coordinate to vertically center the text
        start_y = y + (h - total_height) // 2
        
        # Draw each line of text
        current_y = start_y
        for line in lines:
            # Get the width of the current line for horizontal centering
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            text_x = x + (w - text_width) // 2
            
            # Draw text (with outline)
            draw.text(
                (text_x, current_y),
                line,
                font=font,
                fill='black',
                stroke_width=2,
                stroke_fill='white'
            )
            
            # Update the y-coordinate for the next line
            current_y += (bbox[3] - bbox[1]) + line_spacing
    
    return image

def translate_manga_page(image_path, mask_tensor, input_data, font_path, output_path=None, 
                         margin=5, base_font_size=22, debug=False):
    """
    Complete manga translation process:
    1. Mask original text
    2. Find optimized text regions
    3. Add translated text
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data
    font_path : str
        Path to font file
    output_path : str, optional
        Path to save the translated image
    margin : int
        Margin to add around text regions
    base_font_size : int
        Base font size for text
    debug : bool
        If True, save debug images
        
    Returns:
    --------
    PIL.Image
        Translated image
    """
    # Step 1: Mask text and get optimized regions
    masked_img, optimized_regions = mask_text_and_get_regions(
        image_path, mask_tensor, input_data, margin
    )
    
    # Save masked image for debugging
    if debug:
        masked_img.save(image_path.replace('.jpg', '_masked.jpg'))
        
        # Create debug image showing regions
        debug_img = masked_img.copy()
        draw = ImageDraw.Draw(debug_img)
        
        # Draw original bubbles in blue
        for bubble in input_data:
            x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
            draw.rectangle([x, y, x+w, y+h], outline="blue", width=1)
        
        # Draw optimized regions in red
        for region in optimized_regions:
            if region["optimized"]:
                x, y, w, h = region["x"], region["y"], region["w"], region["h"]
                draw.rectangle([x, y, x+w, y+h], outline="red", width=1)
        
        debug_img.save(image_path.replace('.jpg', '_regions.jpg'))
    
    # Step 2: Add translated text
    translated_img = add_translated_text(
        masked_img, optimized_regions, font_path, base_font_size
    )
    
    # Save translated image
    if output_path:
        translated_img.save(output_path)
        print(f"Translated image saved to: {output_path}")
    
    return translated_img

# Example usage
if __name__ == "__main__":
    # Your exact variables
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Your input data
    input_data = [
        {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
        {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
        {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
        {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
        {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
        {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
        {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
        {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
    ]
    
    # Font path
    font_path = "/System/Library/Fonts/Supplemental/Arial.ttf"
    
    # Translate the manga page
    output_path = "boureisougi_002_translated.jpg"
    translated_img = translate_manga_page(
        image_path, mask, input_data, font_path, 
        output_path=output_path, 
        margin=5, 
        base_font_size=22,
        debug=True
    )

ValueError: x1 must be greater than or equal to x0

In [30]:
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import torch
import cv2

def mask_text_with_bubbles(image_path, mask_tensor, input_data, margin=5, output_path=None):
    """
    Mask text in manga using both pixel mask and speech bubble coordinates.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data with keys 'x', 'y', 'w', 'h', 'text_ja', 'text_en'
    margin : int
        Extra margin to add around text regions
    output_path : str, optional
        Path to save the masked image
        
    Returns:
    --------
    PIL.Image
        The masked image with text removed
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create a drawing image for debugging
    debug_img = img.copy()
    draw = ImageDraw.Draw(debug_img)
    
    # Create output image
    result_img = img_array.copy()
    
    # Process each bubble
    for bubble in input_data:
        x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
        
        # Draw original bubble outline
        draw.rectangle([x, y, x+w, y+h], outline="blue", width=1)
        
        # Create a mask for this bubble region
        bubble_region = np.zeros_like(mask)
        bubble_region[y:y+h, x:x+w] = 1
        
        # Get the text mask within this bubble
        text_in_bubble = np.logical_and(mask == 1, bubble_region == 1)
        
        # Skip if no text in this bubble
        if np.sum(text_in_bubble) == 0:
            continue
        
        # Find connected components (text clusters)
        text_img = text_in_bubble.astype(np.uint8) * 255
        
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_text = cv2.dilate(text_img, kernel, iterations=1)
        
        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_text, connectivity=8)
        
        # Process each text component
        for i in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[i, cv2.CC_STAT_LEFT]
            cy = stats[i, cv2.CC_STAT_TOP]
            cw = stats[i, cv2.CC_STAT_WIDTH]
            ch = stats[i, cv2.CC_STAT_HEIGHT]
            
            # Expand with margin
            cx_expanded = max(0, cx - margin)
            cy_expanded = max(0, cy - margin)
            cw_expanded = cw + 2 * margin
            ch_expanded = ch + 2 * margin
            
            # Draw the expanded text region
            draw.rectangle([cx_expanded, cy_expanded, cx_expanded+cw_expanded, cy_expanded+ch_expanded], 
                          outline="red", width=1)
            
            # Create a mask for the expanded text region
            text_region = np.zeros_like(mask)
            text_region[cy_expanded:cy_expanded+ch_expanded, cx_expanded:cx_expanded+cw_expanded] = 1
            
            # Apply white color to the text region
            result_img[text_region == 1] = [255, 255, 255]
    
    # Save debug image
    debug_img.save(image_path.replace('.jpg', '_debug.jpg'))
    
    # Create output image
    final_img = Image.fromarray(result_img)
    
    if output_path:
        final_img.save(output_path)
        print(f"Masked image saved to: {output_path}")
    
    return final_img

def format_text_for_bubble(text, max_width, max_height, font_path, base_size=22):
    """
    First try to wrap text, then consider adjusting font size.
    """
    draw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
    font = ImageFont.truetype(font_path, base_size)
    
    # 1. First attempt simple text wrapping
    words = text.split()
    lines = []
    current_line = []
    
    for word in words:
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        text_width = bbox[2] - bbox[0]
        
        if text_width <= max_width:
            current_line.append(word)
        else:
            if current_line:
                lines.append(' '.join(current_line))
                current_line = [word]
            else:
                # If the word is too long, consider breaking the word or adjusting the font size
                current_line = [word]
    
    if current_line:
        lines.append(' '.join(current_line))
    
    # 2. Check if the total height fits
    total_height = 0
    line_spacing = base_size * 0.3  # 30% line spacing
    
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        total_height += (bbox[3] - bbox[1]) + line_spacing

    # 3. If height exceeds, consider reducing font size appropriately
    if total_height > max_height:
        # Reduce size by up to 20%
        min_size = int(base_size * 0.8)
        return format_text_for_bubble(text, max_width, max_height, font_path, min_size)
    
    return lines, base_size

def add_translated_text(image, input_data, font_path, base_size=22):
    """
    Add translated text to the image using the original bubble coordinates.
    
    Parameters:
    -----------
    image : PIL.Image
        The masked image
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    base_size : int
        Base font size
        
    Returns:
    --------
    PIL.Image
        Image with translated text added
    """
    draw = ImageDraw.Draw(image)
    
    # Process each text area
    for bubble in input_data:
        # Get bubble coordinates and English text
        x, y = bubble["x"], bubble["y"]
        w, h = bubble["w"], bubble["h"]
        text_en = bubble["text_en"]
        
        # Format text to fit in the bubble
        lines, final_size = format_text_for_bubble(
            text_en,
            w * 0.9,  # 90% of bubble width (10% margin)
            h * 0.9,  # 90% of bubble height (10% margin)
            font_path,
            base_size
        )
        
        # Load the font at the final determined size
        font = ImageFont.truetype(font_path, final_size)
        
        # Calculate total text height (for vertical centering)
        total_height = 0
        line_spacing = final_size * 0.3
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            total_height += (bbox[3] - bbox[1]) + line_spacing
        total_height -= line_spacing  # Subtract the line spacing added for the last line
        
        # Calculate the starting y-coordinate to vertically center the text
        start_y = y + (h - total_height) // 2
        
        # Draw each line of text
        current_y = start_y
        for line in lines:
            # Get the width of the current line for horizontal centering
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            text_x = x + (w - text_width) // 2
            
            # Draw text (with outline)
            draw.text(
                (text_x, current_y),
                line,
                font=font,
                fill='black',
                stroke_width=2,
                stroke_fill='white'
            )
            
            # Update the y-coordinate for the next line
            current_y += (bbox[3] - bbox[1]) + line_spacing
    
    return image

def translate_manga_page(image_path, mask_tensor, input_data, font_path, output_path=None, margin=5, base_font_size=22):
    """
    Complete manga translation process:
    1. Mask original text
    2. Add translated text using original bubble coordinates
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    output_path : str, optional
        Path to save the translated image
    margin : int
        Margin to add around text regions for masking
    base_font_size : int
        Base font size for text
        
    Returns:
    --------
    PIL.Image
        Translated image
    """
    # Step 1: Mask the original text
    masked_img = mask_text_with_bubbles(
        image_path, mask_tensor, input_data, margin, 
        output_path=image_path.replace('.jpg', '_masked.jpg')
    )
    
    # Step 2: Add translated text using original bubble coordinates
    translated_img = add_translated_text(
        masked_img, input_data, font_path, base_font_size
    )
    
    # Save the translated image
    if output_path:
        translated_img.save(output_path)
        print(f"Translated image saved to: {output_path}")
    
    return translated_img

# Example usage
if __name__ == "__main__":
    # Your exact variables
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Your input data
    input_data = [
        {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
        {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
        {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
        {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
        {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
        {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
        {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
        {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
    ]
    
    # Font path (update this to match your system)
    font_path = "/System/Library/Fonts/Supplemental/Arial.ttf"
    
    # Translate the manga page
    output_path = "boureisougi_002_translated.jpg"
    translated_img = translate_manga_page(
        image_path, mask, input_data, font_path, 
        output_path=output_path, 
        margin=5, 
        base_font_size=22
    )

Masked image saved to: boureisougi_002_masked.jpg


OSError: cannot open resource

In [31]:
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import torch
import cv2
import platform
import os

def mask_text_with_bubbles(image_path, mask_tensor, input_data, margin=5, output_path=None):
    """
    Mask text in manga using both pixel mask and speech bubble coordinates.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data with keys 'x', 'y', 'w', 'h', 'text_ja', 'text_en'
    margin : int
        Extra margin to add around text regions
    output_path : str, optional
        Path to save the masked image
        
    Returns:
    --------
    PIL.Image
        The masked image with text removed
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create a drawing image for debugging
    debug_img = img.copy()
    draw = ImageDraw.Draw(debug_img)
    
    # Create output image
    result_img = img_array.copy()
    
    # Process each bubble
    for bubble in input_data:
        x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
        
        # Draw original bubble outline
        draw.rectangle([x, y, x+w, y+h], outline="blue", width=1)
        
        # Create a mask for this bubble region
        bubble_region = np.zeros_like(mask)
        bubble_region[y:y+h, x:x+w] = 1
        
        # Get the text mask within this bubble
        text_in_bubble = np.logical_and(mask == 1, bubble_region == 1)
        
        # Skip if no text in this bubble
        if np.sum(text_in_bubble) == 0:
            continue
        
        # Find connected components (text clusters)
        text_img = text_in_bubble.astype(np.uint8) * 255
        
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_text = cv2.dilate(text_img, kernel, iterations=1)
        
        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_text, connectivity=8)
        
        # Process each text component
        for i in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[i, cv2.CC_STAT_LEFT]
            cy = stats[i, cv2.CC_STAT_TOP]
            cw = stats[i, cv2.CC_STAT_WIDTH]
            ch = stats[i, cv2.CC_STAT_HEIGHT]
            
            # Expand with margin
            cx_expanded = max(0, cx - margin)
            cy_expanded = max(0, cy - margin)
            cw_expanded = cw + 2 * margin
            ch_expanded = ch + 2 * margin
            
            # Draw the expanded text region
            draw.rectangle([cx_expanded, cy_expanded, cx_expanded+cw_expanded, cy_expanded+ch_expanded], 
                          outline="red", width=1)
            
            # Create a mask for the expanded text region
            text_region = np.zeros_like(mask)
            text_region[cy_expanded:cy_expanded+ch_expanded, cx_expanded:cx_expanded+cw_expanded] = 1
            
            # Apply white color to the text region
            result_img[text_region == 1] = [255, 255, 255]
    
    # Save debug image
    debug_img.save(image_path.replace('.jpg', '_debug.jpg'))
    
    # Create output image
    final_img = Image.fromarray(result_img)
    
    if output_path:
        final_img.save(output_path)
        print(f"Masked image saved to: {output_path}")
    
    return final_img

def format_text_for_bubble(text, max_width, max_height, font_path, base_size=22):
    """
    First try to wrap text, then consider adjusting font size.
    """
    draw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
    font = ImageFont.truetype(font_path, base_size)
    
    # 1. First attempt simple text wrapping
    words = text.split()
    lines = []
    current_line = []
    
    for word in words:
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        text_width = bbox[2] - bbox[0]
        
        if text_width <= max_width:
            current_line.append(word)
        else:
            if current_line:
                lines.append(' '.join(current_line))
                current_line = [word]
            else:
                # If the word is too long, consider breaking the word or adjusting the font size
                current_line = [word]
    
    if current_line:
        lines.append(' '.join(current_line))
    
    # 2. Check if the total height fits
    total_height = 0
    line_spacing = base_size * 0.3  # 30% line spacing
    
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        total_height += (bbox[3] - bbox[1]) + line_spacing

    # 3. If height exceeds, consider reducing font size appropriately
    if total_height > max_height:
        # Reduce size by up to 20%
        min_size = int(base_size * 0.8)
        return format_text_for_bubble(text, max_width, max_height, font_path, min_size)
    
    return lines, base_size

def add_translated_text(image, input_data, font_path, base_size=22):
    """
    Add translated text to the image using the original bubble coordinates.
    
    Parameters:
    -----------
    image : PIL.Image
        The masked image
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    base_size : int
        Base font size
        
    Returns:
    --------
    PIL.Image
        Image with translated text added
    """
    draw = ImageDraw.Draw(image)
    
    # Process each text area
    for bubble in input_data:
        # Get bubble coordinates and English text
        x, y = bubble["x"], bubble["y"]
        w, h = bubble["w"], bubble["h"]
        text_en = bubble["text_en"]
        
        # Format text to fit in the bubble
        lines, final_size = format_text_for_bubble(
            text_en,
            w * 0.9,  # 90% of bubble width (10% margin)
            h * 0.9,  # 90% of bubble height (10% margin)
            font_path,
            base_size
        )
        
        # Load the font at the final determined size
        font = ImageFont.truetype(font_path, final_size)
        
        # Calculate total text height (for vertical centering)
        total_height = 0
        line_spacing = final_size * 0.3
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            total_height += (bbox[3] - bbox[1]) + line_spacing
        total_height -= line_spacing  # Subtract the line spacing added for the last line
        
        # Calculate the starting y-coordinate to vertically center the text
        start_y = y + (h - total_height) // 2
        
        # Draw each line of text
        current_y = start_y
        for line in lines:
            # Get the width of the current line for horizontal centering
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            text_x = x + (w - text_width) // 2
            
            # Draw text (with outline)
            draw.text(
                (text_x, current_y),
                line,
                font=font,
                fill='black',
                stroke_width=2,
                stroke_fill='white'
            )
            
            # Update the y-coordinate for the next line
            current_y += (bbox[3] - bbox[1]) + line_spacing
    
    return image

def get_system_font_path():
    """
    Get appropriate font path based on the operating system.
    
    Returns:
    --------
    str
        Path to a system font
    """
    if platform.system() == "Linux":
        font = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    elif platform.system() == "Darwin":  # macOS
        font = "/System/Library/Fonts/Supplemental/Arial.ttf"
    elif platform.system() == "Windows":
        font = "C:/Windows/Fonts/arial.ttf"
    else:
        # Fallback to a common location or raise an error
        potential_paths = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/System/Library/Fonts/Supplemental/Arial.ttf",
            "C:/Windows/Fonts/arial.ttf"
        ]
        for path in potential_paths:
            if os.path.exists(path):
                font = path
                break
        else:
            raise FileNotFoundError("Could not find a suitable font on this system")
    
    return font

def translate_manga_page(image_path, mask_tensor, input_data, font_path=None, output_path=None, margin=5, base_font_size=22):
    """
    Complete manga translation process:
    1. Mask original text
    2. Add translated text using original bubble coordinates
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    output_path : str, optional
        Path to save the translated image
    margin : int
        Margin to add around text regions for masking
    base_font_size : int
        Base font size for text
        
    Returns:
    --------
    PIL.Image
        Translated image
    """
    # Use system font path if none provided
    if font_path is None:
        font_path = get_system_font_path()
    
    # Step 1: Mask the original text
    masked_img = mask_text_with_bubbles(
        image_path, mask_tensor, input_data, margin, 
        output_path=image_path.replace('.jpg', '_masked.jpg')
    )
    
    # Step 2: Add translated text using original bubble coordinates
    translated_img = add_translated_text(
        masked_img, input_data, font_path, base_font_size
    )
    
    # Save the translated image
    if output_path:
        translated_img.save(output_path)
        print(f"Translated image saved to: {output_path}")
    
    return translated_img

# Example usage
if __name__ == "__main__":
    # Your exact variables
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Your input data
    input_data = [
        {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
        {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
        {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
        {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
        {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
        {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
        {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
        {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
    ]
    
    # Get platform-appropriate font path
    font_path = get_system_font_path()
    
    # Translate the manga page
    output_path = "boureisougi_002_translated.jpg"
    translated_img = translate_manga_page(
        image_path, mask, input_data, font_path, 
        output_path=output_path, 
        margin=5, 
        base_font_size=22
    )

Masked image saved to: boureisougi_002_masked.jpg
Translated image saved to: boureisougi_002_translated.jpg


with split long word

In [1]:
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import torch
import cv2
import platform
import os
import re
try:
    import pyphen
    PYPHEN_AVAILABLE = True
except ImportError:
    PYPHEN_AVAILABLE = False
    print("Warning: pyphen library not found. Advanced hyphenation will not be available.")
    print("To install: pip install pyphen")

def mask_text_with_bubbles(image_path, mask_tensor, input_data, margin=5, output_path=None):
    """
    Mask text in manga using both pixel mask and speech bubble coordinates.
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of speech bubble data with keys 'x', 'y', 'w', 'h', 'text_ja', 'text_en'
    margin : int
        Extra margin to add around text regions
    output_path : str, optional
        Path to save the masked image
        
    Returns:
    --------
    PIL.Image
        The masked image with text removed
    """
    # Open image
    img = Image.open(image_path).convert("RGB")
    img_array = np.array(img)
    
    # Convert mask tensor to numpy if needed
    if isinstance(mask_tensor, torch.Tensor):
        mask = mask_tensor.cpu().numpy()
    else:
        mask = mask_tensor
    
    # Create a drawing image for debugging
    debug_img = img.copy()
    draw = ImageDraw.Draw(debug_img)
    
    # Create output image
    result_img = img_array.copy()
    
    # Process each bubble
    for bubble in input_data:
        x, y, w, h = bubble["x"], bubble["y"], bubble["w"], bubble["h"]
        
        # Draw original bubble outline
        draw.rectangle([x, y, x+w, y+h], outline="blue", width=1)
        
        # Create a mask for this bubble region
        bubble_region = np.zeros_like(mask)
        bubble_region[y:y+h, x:x+w] = 1
        
        # Get the text mask within this bubble
        text_in_bubble = np.logical_and(mask == 1, bubble_region == 1)
        
        # Skip if no text in this bubble
        if np.sum(text_in_bubble) == 0:
            continue
        
        # Find connected components (text clusters)
        text_img = text_in_bubble.astype(np.uint8) * 255
        
        # Apply dilation to connect nearby text
        kernel = np.ones((3, 3), np.uint8)
        dilated_text = cv2.dilate(text_img, kernel, iterations=1)
        
        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated_text, connectivity=8)
        
        # Process each text component
        for i in range(1, num_labels):  # Skip background (0)
            # Get component bounding box
            cx = stats[i, cv2.CC_STAT_LEFT]
            cy = stats[i, cv2.CC_STAT_TOP]
            cw = stats[i, cv2.CC_STAT_WIDTH]
            ch = stats[i, cv2.CC_STAT_HEIGHT]
            
            # Expand with margin
            cx_expanded = max(0, cx - margin)
            cy_expanded = max(0, cy - margin)
            cw_expanded = cw + 2 * margin
            ch_expanded = ch + 2 * margin
            
            # Draw the expanded text region
            draw.rectangle([cx_expanded, cy_expanded, cx_expanded+cw_expanded, cy_expanded+ch_expanded], 
                          outline="red", width=1)
            
            # Create a mask for the expanded text region
            text_region = np.zeros_like(mask)
            text_region[cy_expanded:cy_expanded+ch_expanded, cx_expanded:cx_expanded+cw_expanded] = 1
            
            # Apply white color to the text region
            result_img[text_region == 1] = [255, 255, 255]
    
    # Save debug image
    debug_img.save(image_path.replace('.jpg', '_debug.jpg'))
    
    # Create output image
    final_img = Image.fromarray(result_img)
    
    if output_path:
        final_img.save(output_path)
        print(f"Masked image saved to: {output_path}")
    
    return final_img

def hyphenate_word(word, dic=None):
    """
    Hyphenate a word using pyphen if available, otherwise use a simple rule-based approach.
    
    Parameters:
    -----------
    word : str
        Word to hyphenate
    dic : pyphen.Pyphen, optional
        Pyphen dictionary for language-specific hyphenation
        
    Returns:
    --------
    list
        List of syllables that can be joined with hyphens
    """
    # If word is short, no need to hyphenate
    if len(word) <= 6:
        return [word]
    
    # Use pyphen if available for better hyphenation
    if PYPHEN_AVAILABLE and dic is not None:
        # Get hyphenation points using pyphen
        parts = dic.inserted(word, hyphen='-').split('-')
        return parts
    
    # Simple fallback hyphenation method
    # Try to split at reasonable points for English words
    syllables = []
    remaining = word
    
    # Common English prefixes
    prefixes = ['un', 'in', 're', 'dis', 'over', 'under', 'pre', 'post', 'non', 'anti']
    # Common English suffixes
    suffixes = ['ing', 'tion', 'ment', 'ness', 'able', 'ible', 'ful', 'less', 'ize', 'ise']
    
    # Check for prefixes
    for prefix in prefixes:
        if word.startswith(prefix) and len(word) > len(prefix) + 3:
            syllables.append(prefix)
            remaining = word[len(prefix):]
            break
    
    # Check for suffixes in what's left
    for suffix in suffixes:
        if remaining.endswith(suffix) and len(remaining) > len(suffix) + 3:
            end_part = remaining[-len(suffix):]
            remaining = remaining[:-len(suffix)]
            
            # Simple approach: split the middle part in half if it's long enough
            if len(remaining) > 6:
                mid_point = len(remaining) // 2
                syllables.append(remaining[:mid_point])
                syllables.append(remaining[mid_point:])
            else:
                syllables.append(remaining)
                
            syllables.append(end_part)
            return syllables
    
    # If no prefix/suffix matched or remaining part is still long, use a simple splitting approach
    if len(remaining) > 8:
        # Try to split at vowel-consonant boundaries
        vowels = 'aeiou'
        parts = []
        temp = ""
        
        for i in range(len(remaining)):
            temp += remaining[i]
            
            # Look for vowel followed by consonant as potential break points
            if (i > 0 and i < len(remaining) - 2 and 
                remaining[i-1].lower() in vowels and 
                remaining[i].lower() not in vowels):
                
                if len(temp) >= 3:  # Ensure each part is at least 3 chars
                    parts.append(temp)
                    temp = ""
        
        if temp:  # Add any remaining part
            parts.append(temp)
            
        if len(parts) > 1:
            return parts
        
        # If vowel-consonant approach didn't work well, fall back to simpler approach
        mid_point = len(remaining) // 2
        return [remaining[:mid_point], remaining[mid_point:]]
    
    # If word is not that long or we couldn't split it well, return as is
    return [remaining]

def format_text_for_bubble(text, max_width, max_height, font_path, base_size=22):
    """
    Format text to fit within a bubble with line wrapping, hyphenation for long words,
    and font size adjustment if needed.
    """
    draw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
    font = ImageFont.truetype(font_path, base_size)
    
    # Initialize hyphenation dictionary if pyphen is available
    if PYPHEN_AVAILABLE:
        dic = pyphen.Pyphen(lang='en_US')
    else:
        dic = None
    
    # 1. First attempt text wrapping with hyphenation for long words
    words = text.split()
    lines = []
    current_line = []
    
    for word in words:
        # Test if adding this word exceeds the width
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        text_width = bbox[2] - bbox[0]
        
        if text_width <= max_width:
            # Word fits, add it to current line
            current_line.append(word)
        else:
            if current_line:
                # Add the current line to lines and start a new line
                lines.append(' '.join(current_line))
                
                # Start new line with current word
                current_line = [word]
                
                # Check if this word alone exceeds the width (needs hyphenation)
                bbox = draw.textbbox((0, 0), word, font=font)
                if bbox[2] - bbox[0] > max_width:
                    # Word is too long, need to hyphenate
                    current_line = []
                    syllables = hyphenate_word(word, dic)
                    
                    # Build line with hyphenated parts
                    temp_line = ""
                    for i, part in enumerate(syllables):
                        test_part = temp_line + part
                        
                        # If not the last part, add hyphen for testing
                        if i < len(syllables) - 1:
                            test_part += "-"
                            
                        # Test width
                        bbox = draw.textbbox((0, 0), test_part, font=font)
                        if bbox[2] - bbox[0] <= max_width:
                            temp_line = test_part
                        else:
                            # This part doesn't fit, add what we have so far
                            if temp_line:
                                lines.append(temp_line)
                                temp_line = part
                                # Add hyphen if not the last part
                                if i < len(syllables) - 1:
                                    temp_line += "-"
                            else:
                                # Forced break within a syllable (rare case)
                                syllable_len = len(part)
                                mid = syllable_len // 2
                                lines.append(part[:mid] + "-")
                                temp_line = part[mid:]
                                # Add hyphen if not the last part
                                if i < len(syllables) - 1:
                                    temp_line += "-"
                    
                    # Add any remaining parts
                    if temp_line:
                        current_line = [temp_line]
            else:
                # First word on the line is already too long
                syllables = hyphenate_word(word, dic)
                
                # Try to fit syllables
                temp_line = ""
                for i, part in enumerate(syllables):
                    test_part = temp_line + part
                    
                    # If not the last part, add hyphen for testing
                    if i < len(syllables) - 1:
                        test_part += "-"
                        
                    # Test width
                    bbox = draw.textbbox((0, 0), test_part, font=font)
                    if bbox[2] - bbox[0] <= max_width:
                        temp_line = test_part
                    else:
                        # This part doesn't fit, add what we have so far
                        if temp_line:
                            lines.append(temp_line)
                            temp_line = part
                            # Add hyphen if not the last part
                            if i < len(syllables) - 1:
                                temp_line += "-"
                        else:
                            # Forced break within a syllable (rare case)
                            syllable_len = len(part)
                            mid = syllable_len // 2
                            lines.append(part[:mid] + "-")
                            temp_line = part[mid:]
                            # Add hyphen if not the last part
                            if i < len(syllables) - 1:
                                temp_line += "-"
                
                # Add any remaining parts
                if temp_line:
                    current_line = [temp_line]
    
    # Add the last line if there's anything left
    if current_line:
        lines.append(' '.join(current_line))
    
    # 2. Check if the total height fits
    total_height = 0
    line_spacing = base_size * 0.3  # 30% line spacing
    
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        total_height += (bbox[3] - bbox[1]) + line_spacing

    # 3. If height exceeds, consider reducing font size appropriately
    if total_height > max_height:
        # Calculate new size based on how much we exceed
        ratio = max_height / total_height
        new_size = max(int(base_size * ratio * 0.95), 10)  # Minimum size of 10
        
        # Recursively try with smaller font
        return format_text_for_bubble(text, max_width, max_height, font_path, new_size)
    
    return lines, base_size

def add_translated_text(image, input_data, font_path, base_size=22):
    """
    Add translated text to the image using the original bubble coordinates.
    
    Parameters:
    -----------
    image : PIL.Image
        The masked image
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    base_size : int
        Base font size
        
    Returns:
    --------
    PIL.Image
        Image with translated text added
    """
    draw = ImageDraw.Draw(image)
    
    # Process each text area
    for bubble in input_data:
        # Get bubble coordinates and English text
        x, y = bubble["x"], bubble["y"]
        w, h = bubble["w"], bubble["h"]
        text_en = bubble["text_en"]
        
        # Format text to fit in the bubble
        lines, final_size = format_text_for_bubble(
            text_en,
            w * 0.9,  # 90% of bubble width (10% margin)
            h * 0.9,  # 90% of bubble height (10% margin)
            font_path,
            base_size
        )
        
        # Load the font at the final determined size
        font = ImageFont.truetype(font_path, final_size)
        
        # Calculate total text height (for vertical centering)
        total_height = 0
        line_spacing = final_size * 0.3
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            total_height += (bbox[3] - bbox[1]) + line_spacing
        total_height -= line_spacing  # Subtract the line spacing added for the last line
        
        # Calculate the starting y-coordinate to vertically center the text
        start_y = y + (h - total_height) // 2
        
        # Draw each line of text
        current_y = start_y
        for line in lines:
            # Get the width of the current line for horizontal centering
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            text_x = x + (w - text_width) // 2
            
            # Draw text (with outline)
            draw.text(
                (text_x, current_y),
                line,
                font=font,
                fill='black',
                stroke_width=2,
                stroke_fill='white'
            )
            
            # Update the y-coordinate for the next line
            current_y += (bbox[3] - bbox[1]) + line_spacing
    
    return image

def get_system_font_path():
    """
    Get appropriate font path based on the operating system.
    
    Returns:
    --------
    str
        Path to a system font
    """
    if platform.system() == "Linux":
        font = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    elif platform.system() == "Darwin":  # macOS
        font = "/System/Library/Fonts/Supplemental/Arial.ttf"
    elif platform.system() == "Windows":
        font = "C:/Windows/Fonts/arial.ttf"
    else:
        # Fallback to a common location or raise an error
        potential_paths = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/System/Library/Fonts/Supplemental/Arial.ttf",
            "C:/Windows/Fonts/arial.ttf"
        ]
        for path in potential_paths:
            if os.path.exists(path):
                font = path
                break
        else:
            raise FileNotFoundError("Could not find a suitable font on this system")
    
    return font

def translate_manga_page(image_path, mask_tensor, input_data, font_path=None, output_path=None, margin=5, base_font_size=22):
    """
    Complete manga translation process:
    1. Mask original text
    2. Add translated text using original bubble coordinates
    
    Parameters:
    -----------
    image_path : str
        Path to the manga image
    mask_tensor : torch.Tensor
        Binary mask tensor where 1 indicates text pixels
    input_data : list of dict
        List of bubble data with coordinates and translations
    font_path : str
        Path to the font file
    output_path : str, optional
        Path to save the translated image
    margin : int
        Margin to add around text regions for masking
    base_font_size : int
        Base font size for text
        
    Returns:
    --------
    PIL.Image
        Translated image
    """
    # Use system font path if none provided
    if font_path is None:
        font_path = get_system_font_path()
    
    # Step 1: Mask the original text
    masked_img = mask_text_with_bubbles(
        image_path, mask_tensor, input_data, margin, 
        output_path=image_path.replace('.jpg', '_masked.jpg')
    )
    
    # Step 2: Add translated text using original bubble coordinates
    translated_img = add_translated_text(
        masked_img, input_data, font_path, base_font_size
    )
    
    # Save the translated image
    if output_path:
        translated_img.save(output_path)
        print(f"Translated image saved to: {output_path}")
    
    return translated_img

# Example usage
if __name__ == "__main__":
    # Your exact variables
    image_path = "boureisougi_002.jpg"
    mask_dict = torch.load(f"boureisougi_masks.pth")
    mask = mask_dict["boureisougi_002"]
    
    # Your input data
    input_data = [
        {"x": 413, "y": 105, "w": 49, "h": 55, "text_ja": "あ!", "text_en": "Oh!"},
        {"x": 492, "y": 236, "w": 100, "h": 154, "text_ja": "あれは一丁目のスナックのママ!", "text_en": "That woman is the hostess in the bar at Block-1."},
        {"x": 91, "y": 244, "w": 94, "h": 119, "text_ja": "あっちは行きつけの店の女将!", "text_en": "That is the owner of my favorite restaurant!"},
        {"x": 625, "y": 457, "w": 89, "h": 120, "text_ja": "ワシもまだまだ人気者ですなぁ!", "text_en": "I'm still so popular!"},
        {"x": 540, "y": 529, "w": 71, "h": 141, "text_ja": "生き生きしますぞ!", "text_en": "I feel so alive!"},
        {"x": 565, "y": 701, "w": 54, "h": 96, "text_ja": "葬儀屋とは", "text_en": "The job of an undertaker"},
        {"x": 150, "y": 704, "w": 78, "h": 112, "text_ja": "生者と死者の最期の場所を作る仕事", "text_en": "is to set up the last place for the living and the dead."},
        {"x": 701, "y": 916, "w": 62, "h": 68, "text_ja": "..もう", "text_en": "Well, I'm afraid..."},
    ]
    
    # Get platform-appropriate font path
    font_path = get_system_font_path()
    
    # Translate the manga page
    output_path = "boureisougi_002_translated.jpg"
    translated_img = translate_manga_page(
        image_path, mask, input_data, font_path, 
        output_path=output_path, 
        margin=5, 
        base_font_size=22
    )

To install: pip install pyphen
Masked image saved to: boureisougi_002_masked.jpg
Translated image saved to: boureisougi_002_translated.jpg
