In [None]:
import transformers
# Show which transformers release is installed in this kernel.
print(transformers.__version__)

In [2]:
import cv2 #image processing
import numpy as np # data processing
import matplotlib.pyplot as plt # plotting image
import matplotlib.image as mpimg # showing image
import tensorflow as tf # importing framework

from keras.models import Model # importing model
from keras.layers import Input, Lambda, Activation, BatchNormalization, Dropout, Reshape
from keras.layers import Conv2D, MaxPooling2D # import CNN Arsitektur layer
from keras.layers import Reshape
from keras.layers import Bidirectional, LSTM, Dense # import RNN Arsitektur
from keras.optimizers import Adam
from scipy import ndimage
from transformers import TrOCRProcessor, VisionEncoderDecoderModel # import pretrained model
from PIL import Image
from tabulate import tabulate
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from sklearn.metrics import confusion_matrix, f1_score
from tabulate import tabulate
import csv
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [3]:
# Character set and sequence-length parameters for the CRNN/CTC text recogniser.
# `alphabets` is the lookup string used by label_to_num() and num_to_label():
# a character's class id is its position in this string.
alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ-' "  # Upper-case letters, hyphen, apostrophe and space
max_str_len = 50  # Maximum label length (not referenced by any other visible cell)
num_of_characters = len(alphabets) + 1  # One extra class on top of the alphabet for the CTC blank
num_of_timestamps = 64  # Presumably the number of CRNN output time steps -- TODO confirm against the model definition

In [None]:
# Load the TrOCR processor and encoder-decoder model once at notebook level so
# the prediction helpers can reuse them instead of reloading on every call.
# NOTE(review): `model` is also the name predict() calls `.predict(...)` on for
# the Keras CRNN; in the visible cells this TrOCR object is the only thing ever
# bound to that name -- confirm the CRNN is loaded under a separate name.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

In [5]:
# Relative path to the Keras CRNN weight file (nothing is mounted here; the
# path is resolved against the notebook's working directory).
# NOTE(review): no visible cell loads this file -- confirm where the CRNN is built.
path_to_model_weight = 'trained_model_13_3.h5' # the best model weight
#path_to_model_weight = 'trained_model_12_3.h5' # previous version

In [6]:
# Function to preprocess the image for TensorFlow model
def preprocess(img):
    """Place an image on a white 64x256 canvas and rotate it 90 degrees clockwise.

    Args:
        img: NumPy image array. A 3-dimensional array is first reduced to a
            single channel with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        A float array of shape (256, 64): the top-left 64x256 region of the
        input drawn onto a canvas filled with 255, then rotated clockwise.
    """
    canvas_h, canvas_w = 64, 256

    # Three dimensions means a colour image; collapse it to grayscale.
    if img.ndim == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    src_h, src_w = img.shape

    # Anything beyond the canvas is cut off; smaller images keep their size
    # and leave the rest of the canvas white.
    rows = min(src_h, canvas_h)
    cols = min(src_w, canvas_w)

    canvas = np.full((canvas_h, canvas_w), 255.0)
    canvas[:rows, :cols] = img[:rows, :cols]

    return cv2.rotate(canvas, cv2.ROTATE_90_CLOCKWISE)


In [7]:
# Augmentation function (for rotation)
def augment_image(image, max_angle=5.0, rng=None):
    """Rotate an image by a random angle for data augmentation.

    Args:
        image: NumPy image array to rotate.
        max_angle: The angle is drawn uniformly from ``[-max_angle, max_angle]``
            degrees. Defaults to 5, the range originally hard-coded here.
        rng: Optional object with a ``uniform(low, high)`` method, e.g. a
            seeded ``np.random.default_rng(seed)``, for reproducible
            augmentation. When omitted the global ``np.random`` state is used.

    Returns:
        The rotated image with the same shape as the input; areas uncovered by
        the rotation are filled with 255 (white).
    """
    sampler = np.random if rng is None else rng
    angle = sampler.uniform(-max_angle, max_angle)
    # reshape=False keeps the output the same size as the input.
    return ndimage.rotate(image, angle, reshape=False, cval=255)

In [8]:
# Preprocessing for TrOCR model (image needs to be converted to RGB)
def preprocess_for_trocr(image):
    """Return ``image`` converted to RGB mode via its ``convert`` method.

    Args:
        image: Object exposing ``convert(mode)``, such as a PIL image.

    Returns:
        Whatever ``image.convert("RGB")`` returns.
    """
    rgb_image = image.convert("RGB")
    return rgb_image

In [9]:
# Convert label (string) to numeric values
def label_to_num(label):
    """Encode a text label as an array of positions in ``alphabets``.

    Args:
        label: String to encode.

    Returns:
        NumPy array with one entry per character. Characters that do not occur
        in ``alphabets`` are encoded as -1, the value ``str.find`` returns.
    """
    indices = []
    for character in label:
        indices.append(alphabets.find(character))
    return np.array(indices)

In [10]:
# Convert numeric predictions to string
def num_to_label(num):
    """Decode a sequence of class ids back into a string.

    Args:
        num: Iterable of integer positions into ``alphabets``.

    Returns:
        The decoded string. Entries equal to -1 are skipped; they are treated
        as filler rather than characters.
    """
    characters = []
    for index in num:
        if index == -1:
            continue
        characters.append(alphabets[index])
    return ''.join(characters)

In [11]:
# Function for prediction using the TensorFlow model
def predict(image, crnn_model=None):
    """Recognise the text in one image with the Keras CRNN (CTC) model.

    Args:
        image: NumPy image array. A 3-dimensional array is converted to
            grayscale with ``cv2.COLOR_BGR2GRAY`` (so it is expected in BGR
            channel order); a 2-dimensional array is used as is.
        crnn_model: Object whose ``predict(batch)`` returns the CTC output for
            a ``(1, 256, 64, 1)`` batch. When omitted, the notebook-level
            ``model`` is used, which keeps existing calls working.

    Returns:
        The decoded string produced by greedy CTC decoding.

    NOTE(review): in the visible cells the only assignment to ``model`` binds
    the TrOCR ``VisionEncoderDecoderModel``, and ``path_to_model_weight`` is
    never loaded. Unless the CRNN is bound to ``model`` elsewhere, pass the
    Keras model explicitly through ``crnn_model``.
    """
    network = model if crnn_model is None else crnn_model

    if image.ndim == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # preprocess() returns a (256, 64) canvas; scale it to [0, 1].
    canvas = preprocess(image) / 255.0

    # Add the batch and channel axes: one image, one channel.
    batch = canvas.reshape(1, 256, 64, 1)

    pred = network.predict(batch)

    # Every sample uses the full number of output time steps.
    input_length = np.ones(pred.shape[0]) * pred.shape[1]
    decoded_paths, _ = tf.keras.backend.ctc_decode(pred, input_length=input_length, greedy=True)
    best_path = tf.keras.backend.get_value(decoded_paths[0])

    return num_to_label(best_path[0])


In [12]:
# Function to predict using TrOCR model
def predict_trocr(image):
    """Run TrOCR on one image and return the decoded text.

    Uses the notebook-level ``processor`` and ``model``.

    Args:
        image: Image accepted by the TrOCR processor (the callers in this
            notebook pass an RGB PIL image).

    Returns:
        The first decoded string of the generated batch, with special tokens
        removed.
    """
    encoded = processor(images=image, return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values)
    decoded_batch = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded_batch[0]

In [13]:
# Function to combine predictions from TensorFlow and TrOCR
def ensemble_prediction(tf_pred, trocr_pred):
    """Pick one of two predictions by length.

    Args:
        tf_pred: Prediction from the TensorFlow model.
        trocr_pred: Prediction from the TrOCR model.

    Returns:
        ``tf_pred`` only when it is strictly longer than ``trocr_pred``;
        otherwise (including ties) ``trocr_pred``.
    """
    if len(tf_pred) > len(trocr_pred):
        return tf_pred
    return trocr_pred

In [14]:
# Function to process large images using sliding windows
def sliding_window_prediction(image, window_size=(64, 256), step_size=128):
    """Recognise text in a wide image by sliding a window along its width.

    The image is padded with white (255) at the bottom/right until it is at
    least one window in size, each window is passed to ``predict``, and the
    per-window strings are concatenated from left to right.

    Args:
        image: 2-dimensional (grayscale) NumPy array.
        window_size: ``(height, width)`` of the window.
        step_size: Horizontal distance in pixels between window starts.

    Returns:
        The concatenation of the predictions of all windows. Windows overlap
        whenever ``step_size`` is smaller than the window width, so text may
        be repeated in the result.

    Raises:
        ValueError: If ``step_size`` is not positive.
    """
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    win_h, win_w = window_size
    h, w = image.shape

    pad_h = max(win_h - h, 0)
    pad_w = max(win_w - w, 0)
    if pad_h or pad_w:
        image = np.pad(image, ((0, pad_h), (0, pad_w)), 'constant', constant_values=255)

    # Use the width AFTER padding: with the pre-padding width an image narrower
    # than the window produced an empty range and therefore an empty result.
    padded_w = image.shape[1]
    last_offset = padded_w - win_w

    offsets = list(range(0, last_offset + 1, step_size))
    # Add a final right-aligned window so the trailing columns are not dropped
    # when the width is not an exact multiple of the step.
    if offsets[-1] != last_offset:
        offsets.append(last_offset)

    return ''.join(predict(image[:, x:x + win_w]) for x in offsets)

In [15]:
def calculate_char_accuracy(recognized_text, ground_truth):
    """Score a prediction against its ground truth, position by position.

    Both strings are upper-cased and compared character by character at the
    same index (no alignment or edit distance is performed).

    Args:
        recognized_text: Predicted string.
        ground_truth: Reference string.

    Returns:
        Tuple ``(accuracy, f1, TP, FP, FN)`` where
        * ``TP`` is the number of positions at which both strings agree,
        * ``FP`` is the number of predicted characters that are not matches,
        * ``FN`` is the number of reference characters that are not matches,
        * ``accuracy`` is ``TP`` as a percentage of the longer string's length,
        * ``f1`` is the harmonic mean of precision and recall.
        Every ratio with a zero denominator is reported as 0; in particular two
        empty strings give an accuracy of 0.0 instead of raising
        ZeroDivisionError.
    """
    recognized_text = recognized_text.upper()
    ground_truth = ground_truth.upper()

    TP = sum(1 for predicted, expected in zip(recognized_text, ground_truth) if predicted == expected)
    # Every character that is not a positional match counts against its side.
    FP = len(recognized_text) - TP
    FN = len(ground_truth) - TP

    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    longest = max(len(recognized_text), len(ground_truth))
    accuracy = TP / longest * 100 if longest != 0 else 0.0

    return accuracy, f1, TP, FP, FN

In [16]:
def load_and_predict(image_path, ground_truth, all_true, all_pred):
    """Run TrOCR on one image file, score it and print a summary table.

    Args:
        image_path: Path of the image to read.
        ground_truth: Expected text for the image.
        all_true: List that receives the ground-truth characters (mutated).
        all_pred: List that receives the predicted characters (mutated).

    Returns:
        Tuple ``(accuracy, f1, TP, FP, FN)`` from ``calculate_char_accuracy``.
    """
    # Close the file handle as soon as the pixels are copied into an RGB image.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    predicted_text = predict_trocr(image)

    # Normalise the prediction: upper-case, drop apostrophes and a trailing " .".
    predicted_text = predicted_text.upper().replace("'", "")
    if predicted_text.endswith(" ."):
        predicted_text = predicted_text[:-2]

    ground_truth = ground_truth.upper()

    accuracy, f1, TP, FP, FN = calculate_char_accuracy(predicted_text, ground_truth)

    # Append characters as position-aligned pairs only. Extending the two lists
    # by different lengths would shift every later image's characters against
    # each other and corrupt the confusion matrix built from these lists.
    for true_char, pred_char in zip(ground_truth, predicted_text):
        all_true.append(true_char)
        all_pred.append(pred_char)

    table_data = [
        ["Predicted Text", predicted_text],
        ["Length of Predicted Text", len(predicted_text)],
        ["Accuracy", f"{accuracy:.2f}%"],
        ["F1 Score", f"{f1:.2f}"],
    ]

    print(tabulate(table_data, headers=["Metric", "Value"], tablefmt="fancy_grid"))
    print("="*50)

    return accuracy, f1, TP, FP, FN

In [17]:
def read_ground_truth(csv_file):
    """Read an image-name -> ground-truth mapping from a CSV file.

    The first row is treated as a header and skipped. In every following row
    the first column is the image file name and the second column its text;
    further columns are ignored. Blank lines and rows with fewer than two
    columns are skipped, and an empty file yields an empty dict.

    Args:
        csv_file: Path of the CSV file.

    Returns:
        Dict mapping image name to ground-truth text. If a name occurs more
        than once, the last row wins.
    """
    ground_truth_dict = {}
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(csv_file, mode='r', newline='') as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)  # Skip the header; tolerate an empty file
        for row in csv_reader:
            if len(row) < 2:
                continue
            image_name, ground_truth = row[0], row[1]
            ground_truth_dict[image_name] = ground_truth
    return ground_truth_dict

In [18]:
def plot_accuracy_graph(accuracies, ground_truth_lengths):
    """Plot per-image accuracy against the ground-truth string length.

    Args:
        accuracies: Accuracy values in percent, one per image.
        ground_truth_lengths: Ground-truth lengths, in the same order.
    """
    _, axis = plt.subplots(figsize=(10, 6))
    axis.plot(
        ground_truth_lengths,
        accuracies,
        marker='o',
        linestyle='-',
        color='b',
        label='Akurasi per Gambar',
    )
    axis.set_xlabel('Panjang String Ground Truth')
    axis.set_ylabel('Akurasi (%)')
    axis.set_title('Akurasi CRNN with TrOCR')
    axis.grid(True)
    axis.legend()
    plt.show()


In [19]:
def plot_accuracy_vs_iteration(accuracies, title='Akurasi trOCR+CNN 30 Char dengan Rata-rata Kumulatif'):
    """Plot per-image accuracy together with its running (cumulative) mean.

    Args:
        accuracies: Accuracy values in percent, in processing order.
        title: Figure title. Defaults to the caption used for the 30-character
            dataset; pass a different caption for other datasets instead of
            editing the function.
    """
    iterations = np.arange(1, len(accuracies) + 1)
    # Running mean in a single pass: cumulative sum divided by the item count.
    avg_accuracies = np.cumsum(accuracies) / iterations

    plt.figure(figsize=(10, 6))
    plt.plot(iterations, accuracies, marker='o', linestyle='-', color='g', label='Akurasi per Iterasi')
    plt.plot(iterations, avg_accuracies, marker='', linestyle='--', color='b', label='Rata-rata Kumulatif')
    plt.xlabel('Nomor Iterasi')
    plt.ylabel('Akurasi (%)')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.show()

In [20]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Draw a character-level confusion matrix as an annotated heat map.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        labels: Labels to include, in the order used for rows and columns.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    label_count = len(labels)
    positions = np.arange(label_count)

    plt.figure(figsize=(10, 7))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix CRNN with TrOCR')
    plt.colorbar()
    plt.xticks(positions, labels, rotation=45)
    plt.yticks(positions, labels)
    plt.ylabel('True Labels')
    plt.xlabel('Predicted Labels')

    # Write each count into its cell: rows are true labels, columns predictions.
    for row, col in np.ndindex(label_count, label_count):
        plt.text(col, row, matrix[row, col], ha='center', va='center', color='red')

    plt.tight_layout()
    plt.show()

In [21]:
def plot_f1_graph(f1_scores):
    """Plot the F1 score of every image against its 1-based position.

    Args:
        f1_scores: F1 values, one per image, in processing order.
    """
    image_numbers = range(1, len(f1_scores) + 1)

    _, axis = plt.subplots(figsize=(10, 6))
    axis.plot(
        image_numbers,
        f1_scores,
        marker='o',
        linestyle='-',
        color='r',
        label='F1 Score per Gambar',
    )
    axis.set_xlabel('Nomor Gambar')
    axis.set_ylabel('F1 Score')
    axis.set_title('F1 Convidence CRNN with TrOCR')
    axis.grid(True)
    axis.legend()
    plt.show()

In [22]:
def process_folder_images(folder_path, ground_truth_dict, base_filename="TEST_", start_index=1, end_index=20):
    """Evaluate TrOCR on a numbered series of PNG images and plot the results.

    Images are looked up as ``<folder_path>/<base_filename><i>.png`` for every
    ``i`` from ``start_index`` to ``end_index`` (inclusive); missing files are
    reported and skipped.

    Args:
        folder_path: Folder that contains the images.
        ground_truth_dict: Mapping of image file name to expected text.
        base_filename: File name prefix in front of the index.
        start_index: First index to process.
        end_index: Last index to process (inclusive).

    Returns:
        None. Shows the accuracy plot, the confusion matrix and the F1 plot.
    """
    accuracies = []  # Accuracy per image
    ground_truth_lengths = []  # Ground-truth length per image
    f1_scores = []  # F1 score per image
    all_true = []  # True characters, aligned with all_pred
    all_pred = []  # Predicted characters, aligned with all_true

    for i in range(start_index, end_index + 1):
        file_name = f"{base_filename}{i}.png"
        image_path = os.path.join(folder_path, file_name)

        if not os.path.exists(image_path):
            print(f"Skipping missing file: {image_path}")
            continue

        print(f"Processing image: {image_path}")
        if file_name not in ground_truth_dict:
            # Make the fallback visible instead of silently scoring against "".
            print(f"Warning: no ground truth for {file_name}; scoring against an empty string")
        ground_truth = ground_truth_dict.get(file_name, "")

        # Collect this image's characters separately and keep only the
        # position-aligned prefix, so a length mismatch in one image cannot
        # shift the character pairs of the images that follow.
        image_true, image_pred = [], []
        accuracy, f1, _, _, _ = load_and_predict(image_path, ground_truth, image_true, image_pred)
        paired = min(len(image_true), len(image_pred))
        all_true.extend(image_true[:paired])
        all_pred.extend(image_pred[:paired])

        f1_scores.append(f1)
        accuracies.append(accuracy)
        ground_truth_lengths.append(len(ground_truth))

    if not accuracies:
        print("No images were processed; nothing to plot.")
        return

    # Alternative view: plot_accuracy_graph(accuracies, ground_truth_lengths)
    plot_accuracy_vs_iteration(accuracies)

    if all_true:
        # Use the union so characters that were never predicted keep their row.
        labels = sorted(set(all_true) | set(all_pred))
        plot_confusion_matrix(all_true, all_pred, labels)
    else:
        print("No aligned characters available; skipping the confusion matrix.")

    plot_f1_graph(f1_scores)

    

In [None]:
folder_path = '../Capital/26-30/' # replace with the folder to evaluate
ground_truth_file = '../Capital/26-30/GROUND_TRUTH30.csv'  # replace with the matching ground-truth CSV
ground_truth_dict = read_ground_truth(ground_truth_file)

# Process images TEST_1.png .. TEST_36.png from the folder and plot the results
process_folder_images(folder_path, ground_truth_dict, base_filename="TEST_", start_index=1, end_index=36)

In [None]:
def plot_predicted_text_lengths(folder_path, base_filename="TEST_", start_index=1, end_index=50, extension=".png"):
    """
    Plot a histogram of the lengths of the TrOCR predictions for a set of images.

    Images are looked up as ``<folder_path>/<base_filename><i><extension>`` for
    every index in the range; files that do not exist are skipped.

    NOTE(review): a later cell defines a function with this same name (with
    different defaults); once that cell has run, it replaces this definition.

    Args:
        folder_path (str): Path of the folder that contains the images.
        base_filename (str): File name prefix in front of the index.
        start_index (int): First index to process.
        end_index (int): Last index to process (inclusive).
        extension (str): File extension including the dot.
    """
    lengths = []
    for i in range(start_index, end_index + 1):
        image_path = os.path.join(folder_path, f"{base_filename}{i}{extension}")
        if not os.path.exists(image_path):
            continue
        # Close the file handle as soon as the RGB copy exists.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        lengths.append(len(predict_trocr(image)))

    plt.figure(figsize=(8, 5))
    plt.hist(lengths, bins=20, color='skyblue', edgecolor='black')
    plt.title("Distribusi Panjang Teks Prediksi")
    plt.xlabel("Panjang Teks")
    plt.ylabel("Frekuensi")
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# Call the function for images TEST_1 .. TEST_36 in the evaluation folder
plot_predicted_text_lengths(folder_path, base_filename="TEST_", start_index=1, end_index=36)


In [25]:
def plot_predicted_text_lengths(folder_path, base_filename="test_", start_index=1, end_index=50, extension=".jpg"):
    """Plot a histogram of the lengths of the TrOCR predictions for a set of images.

    Images are looked up as ``<folder_path>/<base_filename><i><extension>`` for
    every index in the range; files that do not exist are skipped.

    NOTE(review): this redefines a function of the same name from an earlier
    cell (which defaults to the "TEST_" prefix and PNG files). Whichever cell
    ran last decides which defaults apply; pass ``base_filename`` and
    ``extension`` explicitly to be independent of that.

    Args:
        folder_path: Path of the folder that contains the images.
        base_filename: File name prefix in front of the index.
        start_index: First index to process.
        end_index: Last index to process (inclusive).
        extension: File extension including the dot.
    """
    lengths = []
    for i in range(start_index, end_index + 1):
        image_path = os.path.join(folder_path, f"{base_filename}{i}{extension}")
        if not os.path.exists(image_path):
            continue
        # Close the file handle as soon as the RGB copy exists.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        lengths.append(len(predict_trocr(image)))

    plt.figure(figsize=(8, 5))
    plt.hist(lengths, bins=20, color='skyblue', edgecolor='black')
    plt.title("Histogram of Predicted Text Lengths")
    plt.xlabel("Text Length")
    plt.ylabel("Frequency")
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


In [26]:
def compare_predictions(folder_path, base_filename="test_", start_index=1, end_index=5, extension=".jpg"):
    """Compare the prediction lengths of the CRNN and TrOCR models per image.

    Images are looked up as ``<folder_path>/<base_filename><i><extension>`` for
    every index in the range; files that do not exist are skipped. For each
    image both models are run and the lengths of their predicted strings are
    shown side by side in a grouped bar chart.

    NOTE(review): ``predict`` relies on a notebook-level Keras model; confirm
    it is available before calling this function.

    Args:
        folder_path: Path of the folder that contains the images.
        base_filename: File name prefix in front of the index.
        start_index: First index to process.
        end_index: Last index to process (inclusive).
        extension: File extension including the dot.
    """
    tf_lengths = []
    trocr_lengths = []
    image_names = []

    for i in range(start_index, end_index + 1):
        image_path = os.path.join(folder_path, f"{base_filename}{i}{extension}")
        if not os.path.exists(image_path):
            continue

        # Close the file handle as soon as the RGB copy exists.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # predict() converts 3-channel arrays with COLOR_BGR2GRAY, which would
        # apply the red/blue weights the wrong way round to this RGB image.
        # Hand it a correctly weighted single-channel array instead.
        tf_pred = predict(np.array(image.convert("L")))
        trocr_pred = predict_trocr(image)

        image_names.append(f"{base_filename}{i}")
        tf_lengths.append(len(tf_pred))
        trocr_lengths.append(len(trocr_pred))

    bar_width = 0.35
    positions = np.arange(len(image_names))

    plt.figure(figsize=(10, 6))
    plt.bar(positions - bar_width/2, tf_lengths, bar_width, label="TensorFlow Predictions", color='blue')
    plt.bar(positions + bar_width/2, trocr_lengths, bar_width, label="TrOCR Predictions", color='orange')
    plt.xticks(positions, image_names, rotation=45, ha="right")
    plt.xlabel("Image")
    plt.ylabel("Predicted Text Length")
    plt.title("Comparison of Predicted Text Lengths")
    plt.legend()
    plt.tight_layout()
    plt.show()
