EAST_MODEL_YES_OR_NO

In [18]:
import cv2
import numpy as np
import os
import pandas as pd

# Load the pre-trained EAST model for text detection
net = cv2.dnn.readNet(r'C:\Users\marta\Tese\frozen_east_text_detection.pb')

# Function to check if an image contains text using EAST
def detect_text_with_east(image_path):
    try:
        # Load the image
        image = cv2.imread(image_path)
        orig = image.copy()
        (H, W) = image.shape[:2]

        # Define the new width and height for the image (must be multiples of 32)
        newW, newH = (320, 320)
        rW = W / float(newW)
        rH = H / float(newH)

        # Resize the image
        image = cv2.resize(image, (newW, newH))

        # Construct a blob from the image
        blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH), (123.68, 116.78, 103.94), swapRB=True, crop=False)
        net.setInput(blob)

        # Define the output layer names for the EAST detector model
        layerNames = [
            "feature_fusion/Conv_7/Sigmoid",
            "feature_fusion/concat_3"
        ]

        # Perform a forward pass of the model to get output layers
        (scores, geometry) = net.forward(layerNames)

        # Decode the predictions and return the bounding box coordinates
        (rects, confidences) = decode_predictions(scores, geometry)

        # Apply non-maxima suppression to suppress weak, overlapping bounding boxes
        boxes = non_max_suppression(np.array(rects), probs=confidences)

        # If we found at least one text box, return that text is detected
        if len(boxes) > 0:
            return "Text detected"
        else:
            return "No text detected"

    except Exception as e:
        # If any error occurs, print the error and continue
        print(f"Error processing {os.path.basename(image_path)}: {e}")
        return f"Error: {e}"

# Decode the predictions of the EAST model
def decode_predictions(scores, geometry, confThreshold=0.5):
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(numRows):
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        for x in range(numCols):
            if scoresData[x] < confThreshold:
                continue

            # Calculate the offset
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # Calculate the rotation angle and other metrics
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # Calculate the bounding box coordinates
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    return (rects, confidences)

# Perform non-maxima suppression to reduce overlapping bounding boxes
def non_max_suppression(boxes, probs=None, overlapThresh=0.3):
    if len(boxes) == 0:
        return []

    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    pick = []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = np.argsort(probs)

    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        overlap = (w * h) / area[idxs[:last]]

        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))

    return boxes[pick].astype("int")

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through all the files in the folder and check if they contain text
for filename in os.listdir(image_folder):
    if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
        image_path = os.path.join(image_folder, filename)
        # Call the function to detect text and get the result
        result = detect_text_with_east(image_path)
        # Append the image name and result to the list
        results.append({"Image Name": filename, "East": result})

# Create a pandas DataFrame from the results
east_df = pd.DataFrame(results)

# Print the dataframe
display(east_df.head(10))

Error processing friendlyremindertogethelpifyourefeelingsuicidaltj4E.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_0.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_1.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_12.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_134.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_138.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_146.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_155.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_156.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_158.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_162.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_164.jpg: 'NoneType' object has no attribute 'copy'
Error processing image_171.jpg: 'NoneType' object has no attribute 'copy'
E

Unnamed: 0,Image Name,East
0,04kcpr9s0amd1.jpeg,Text detected
1,04pqjucet3md1.jpeg,Text detected
2,0fqsqd5hu5md1.jpeg,Text detected
3,0qawne0hu6md1.jpeg,Text detected
4,0v2kl2c4i5md1.jpeg,No text detected
5,0xpvklm7w3md1.jpeg,Text detected
6,10x5m4kew6md1.jpeg,Text detected
7,117eqigz74md1.jpeg,Text detected
8,11jh4mtgv9md1.jpeg,Text detected
9,17ewnbvi04md1.jpeg,Text detected


TESSERACT_MODEL_YES_OR_NO

In [19]:
import pytesseract
from PIL import Image
import os
import pandas as pd

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Function to check if an image contains text using Tesseract
def check_text_with_tesseract(image_path):
    try:
        # Open the image
        img = Image.open(image_path)

        # Use Tesseract to detect text
        text = pytesseract.image_to_string(img)

        # If any text is detected
        if text.strip():
            return "Text detected"
        else:
            return "No text detected"
    
    except Exception as e:
        # If any error occurs, print the error and return "Error"
        print(f"Error processing {os.path.basename(image_path)}: {e}")
        return f"Error: {e}"

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through all the files in the folder and check if they contain text
for filename in os.listdir(image_folder):
    if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
        image_path = os.path.join(image_folder, filename)
        # Call the function to detect text and get the result
        result = check_text_with_tesseract(image_path)
        # Append the image name and result to the list
        results.append({"Image Name": filename, "Tesseract": result})

# Create a pandas DataFrame from the results
tesseract_df = pd.DataFrame(results)

# Print the dataframe
display(tesseract_df.head(10))

Error processing friendlyremindertogethelpifyourefeelingsuicidaltj4E.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\friendlyremindertogethelpifyourefeelingsuicidaltj4E.jpg'
Error processing image_0.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\image_0.jpg'
Error processing image_1.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\image_1.jpg'
Error processing image_12.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\image_12.jpg'
Error processing image_134.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\image_134.jpg'
Error processing image_138.jpg: cannot identify image file 'C:\\Users\\marta\\Tese\\reddit-memes\\images_download_test\\output_folder\\image_138.jpg'
Error processing image_146

Unnamed: 0,Image Name,Tesseract
0,04kcpr9s0amd1.jpeg,No text detected
1,04pqjucet3md1.jpeg,Text detected
2,0fqsqd5hu5md1.jpeg,Text detected
3,0j1t16uxg7md1.gif,Text detected
4,0qawne0hu6md1.jpeg,Text detected
5,0v2kl2c4i5md1.jpeg,No text detected
6,0xpvklm7w3md1.jpeg,No text detected
7,10x5m4kew6md1.jpeg,No text detected
8,117eqigz74md1.jpeg,Text detected
9,11jh4mtgv9md1.jpeg,Text detected


PADDLEOCR_MODEL_YES_OR_NO

In [20]:
from paddleocr import PaddleOCR
import os
import pandas as pd
from PIL import Image

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Set 'lang' to the appropriate language for your images

# Function to check if an image contains text using PaddleOCR
def check_text_with_paddleocr(image_path):
    try:
        # Perform OCR on the image
        result = ocr.ocr(image_path, cls=True)
        
        # If any text is detected
        if result and any([line[1][0].strip() for line in result[0]]):
            return "Text detected"
        else:
            return "No text detected"
    
    except Exception as e:
        # If any error occurs, print the error and return "Error"
        print(f"Error processing {os.path.basename(image_path)}: {e}")
        return f"Error: {e}"

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through all the files in the folder and check if they contain text
for filename in os.listdir(image_folder):
    if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
        image_path = os.path.join(image_folder, filename)
        # Call the function to detect text and get the result
        result = check_text_with_paddleocr(image_path)
        # Append the image name and result to the list
        results.append({"Image Name": filename, "PaddleOCR": result})

# Create a pandas DataFrame from the results
paddle_df = pd.DataFrame(results)

# Print the dataframe
display(paddle_df.head(10))

[2024/10/26 19:14:36] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\marta/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\marta/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

Unnamed: 0,Image Name,PaddleOCR
0,04kcpr9s0amd1.jpeg,Text detected
1,04pqjucet3md1.jpeg,Text detected
2,0fqsqd5hu5md1.jpeg,Text detected
3,0j1t16uxg7md1.gif,Text detected
4,0qawne0hu6md1.jpeg,Text detected
5,0v2kl2c4i5md1.jpeg,Error: 'NoneType' object is not iterable
6,0xpvklm7w3md1.jpeg,Text detected
7,10x5m4kew6md1.jpeg,Text detected
8,117eqigz74md1.jpeg,Text detected
9,11jh4mtgv9md1.jpeg,Text detected


COMPARING THE MODELS:

In [22]:
# Load the real text file
real_file = r'C:\Users\marta\Tese\real_text_yes_no.xlsx'
real_df = pd.read_excel(real_file)

# Drop the "Real Text" column if it exists in the DataFrame
if "Real Text" in real_df.columns:
    real_df.drop(columns=["Real Text"], inplace=True)

# Ensure that `east_df`, `tesseract_df`, and `paddle_df` are already loaded DataFrames with the same "Image Name" column

# Merge the DataFrames on "Image Name" using an outer join
merged_df = real_df.merge(east_df, on="Image Name", how="outer") \
                    .merge(tesseract_df, on="Image Name", how="outer") \
                    .merge(paddle_df, on="Image Name", how="outer")

# Display the first 10 rows of the merged DataFrame
display(merged_df.head(10))

Unnamed: 0,Image Name,Real,East,Tesseract,PaddleOCR
0,04kcpr9s0amd1.jpeg,YES,Text detected,No text detected,Text detected
1,04pqjucet3md1.jpeg,YES,Text detected,Text detected,Text detected
2,0fqsqd5hu5md1.jpeg,YES,Text detected,Text detected,Text detected
3,0j1t16uxg7md1.gif,,,Text detected,Text detected
4,0qawne0hu6md1.jpeg,YES,Text detected,Text detected,Text detected
5,0v2kl2c4i5md1.jpeg,NO,No text detected,No text detected,Error: 'NoneType' object is not iterable
6,0xpvklm7w3md1.jpeg,YES,Text detected,No text detected,Text detected
7,10x5m4kew6md1.jpeg,YES,Text detected,No text detected,Text detected
8,117eqigz74md1.jpeg,YES,Text detected,Text detected,Text detected
9,11jh4mtgv9md1.jpeg,YES,Text detected,Text detected,Text detected


In [23]:
# Initialize counts
results = {
    'Model': [],
    'Precision': [],
    'Recall': [],
    'F-measure': []
}

def calculate_metrics(model_name):
    # Get the predictions and real labels for the model
    predictions = merged_df[model_name]
    real_labels = merged_df["Real"]

    # Calculate true positives, true negatives, false positives, false negatives
    TP = ((predictions == "Text detected") & (real_labels == "YES")).sum()
    TN = ((predictions == "No text detected") & (real_labels == "NO")).sum()
    FP = ((predictions == "Text detected") & (real_labels == "NO")).sum()
    FN = ((predictions == "No text detected") & (real_labels == "YES")).sum()

    # Calculate precision, recall, and F-measure
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f_measure = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

    # Append results
    results['Model'].append(model_name)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F-measure'].append(f_measure)

# Calculate metrics for each model
for model in ['East', 'Tesseract', 'PaddleOCR']:
    calculate_metrics(model)

# Create a DataFrame from results
metrics_df = pd.DataFrame(results)

# Display the results
metrics_df

Unnamed: 0,Model,Precision,Recall,F-measure
0,East,0.978571,0.98917,0.983842
1,Tesseract,0.979592,0.863309,0.917782
2,PaddleOCR,0.982332,1.0,0.991087


TEXT EXTRACT:

In [12]:
import pandas as pd
import pytesseract
from PIL import Image
import os

# Set the path for Tesseract executable if it's not in the system PATH
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust path as necessary

# Load the DataFrame with real labels
df_real = pd.read_excel(r"C:\Users\marta\Tese\real_text_yes_no.xlsx")  # Load your DataFrame here

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through the DataFrame and check images that have text
for index, row in df_real.iterrows():
    if row['Real'] == 'YES':  # Assuming 'real' column indicates presence of text
        image_name = row['Image Name']  # Ensure this matches the column name in your DataFrame
        image_path = os.path.join(image_folder, image_name)
        
        # Open the image and use Tesseract to extract text
        try:
            img = Image.open(image_path)
            text = pytesseract.image_to_string(img)
            results.append({"Image Name": image_name, "Tesseract Text": text})
        except Exception as e:
            print(f"Error processing {image_name}: {e}")

# Create a pandas DataFrame from the results
df_extracted_text = pd.DataFrame(results)

# Save the DataFrame to an Excel file
output_excel = r"C:\Users\marta\Tese\tesseract_extracted_text.xlsx"
df_extracted_text.to_excel(output_excel, index=False)

print(f"Extracted text results saved to {output_excel}")

Extracted text results saved to C:\Users\marta\Tese\tesseract_extracted_text.xlsx


In [13]:
import pandas as pd
from paddleocr import PaddleOCR
import os

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Set the language as needed

# Load the DataFrame with real labels
df_real = pd.read_excel(r"C:\Users\marta\Tese\real_text_yes_no.xlsx")  # Adjust path as necessary

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through the DataFrame and check images that have text
for index, row in df_real.iterrows():
    if row['Real'] == 'YES':  # Assuming 'real' column indicates presence of text
        image_name = row['Image Name']  # Ensure this matches the column name in your DataFrame
        image_path = os.path.join(image_folder, image_name)
        
        # Perform OCR on the image
        try:
            result = ocr.ocr(image_path, cls=True)  # Perform OCR
            
            # Extract text from the results
            extracted_text = "\n".join([line[1][0] for line in result[0]]) if result else "No text detected"
            results.append({"Image Name": image_name, "PaddleOCR Text": extracted_text})
        except Exception as e:
            print(f"Error processing {image_name}: {e}")

# Create a pandas DataFrame from the results
df_extracted_text = pd.DataFrame(results)

# Save the DataFrame to an Excel file
output_excel = r"C:\Users\marta\Tese\paddleocr_extracted_text.xlsx"
df_extracted_text.to_excel(output_excel, index=False)

print(f"Extracted text results saved to {output_excel}")


[2024/10/21 22:33:04] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\marta/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\marta/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [16]:
import os
import pandas as pd
from craft_text_detector import Craft

# Initialize CRAFT model
craft = Craft(output_dir='output', crop_type="box", cuda=False)  # Set cuda=True if you have a GPU

# Function to check if an image contains text using CRAFT
def check_text_with_craft(image_path):
    try:
        # Perform text detection on the image
        prediction_result = craft.detect_text(image_path)
        
        # If any text regions are detected
        if prediction_result["boxes"]:
            return "Text detected"
        else:
            return "No text detected"
    
    except Exception as e:
        # If any error occurs, print the error and return "Error"
        print(f"Error processing {os.path.basename(image_path)}: {e}")
        return f"Error: {e}"

# Define the folder containing your images
image_folder = r"C:\Users\marta\Tese\reddit-memes\images_download_test\output_folder"

# Create an empty list to store the results
results = []

# Loop through all the files in the folder and check if they contain text
for filename in os.listdir(image_folder):
    if filename.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        image_path = os.path.join(image_folder, filename)
        # Call the function to detect text and get the result
        result = check_text_with_craft(image_path)
        # Append the image name and result to the list
        results.append({"Image Name": filename, "CRAFT": result})

# Create a pandas DataFrame from the results
craft_df = pd.DataFrame(results)

# Save the DataFrame to an Excel file
output_excel = r"C:\Users\marta\Tese\craft_text_yes_no.xlsx"
df.to_excel(output_excel, index=False)

print(f"Results saved to {output_excel}")

# Clean up resources
craft.unload_craftnet_model()
craft.unload_refinenet_model()


OSError: [WinError 127] The specified procedure could not be found. Error loading "c:\Users\marta\Tese\reddit-memes\.venv\Lib\site-packages\torch\lib\shm.dll" or one of its dependencies.