In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import string
import numpy as np
import easyocr
import fitz
import collections
import hashlib
import pymongo
from PIL import Image, ImageDraw, ImageFont
from matplotlib import pyplot as plt
from bson.objectid import ObjectId
from pprint import pprint
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Initialize OCR
# The reader is shared by the OCR cells below through `reader.readtext`.
# NOTE(review): the language list is empty, so model selection is left to
# EasyOCR's defaults - confirm this picks the recognition model you intend.
# NOTE(review): gpu=True requests GPU inference; behavior on a machine without
# a usable GPU is not visible from this notebook - confirm.
reader = easyocr.Reader([], gpu=True)

In [2]:
# Connect to MongoDB. The connection URI can be overridden through the
# MONGO_URI environment variable so that credentials do not have to live in
# the notebook; the fallback is the original local development URI, so
# existing runs behave exactly as before.
mongo = pymongo.MongoClient(
    os.environ.get(
        'MONGO_URI',
        'mongodb://root:root@localhost:27017/?authMechanism=DEFAULT',
    )
)
# Database and the two collections used by the cells below
db = mongo['skripsi']
fonts = db['fonts']
documents = db['documents']

In [10]:
# Define input: the document record and the font record it references
doc = documents.find_one({'document': '0.pdf'})
if doc is None:
    # find_one returns None when nothing matches; fail with a clear message
    # instead of a TypeError on the subscript below
    raise LookupError("No record found in 'documents' for document '0.pdf'")
font_doc = fonts.find_one({'_id': doc['font']})
filename = doc['document']

# Open file
pdf = fitz.open("../tmps/" + filename)

# Get all fonts across document (a set comprehension removes entries that
# repeat from page to page)
detected_fonts = list({el for i in range(pdf.page_count) for el in pdf.get_page_fonts(i)})
embedded_fonts = set()

# Loop through fonts
for font in detected_fonts:
    print(font)
    # Extract font; the first element of each entry is passed to extract_font
    name, ext, _, content = pdf.extract_font(font[0])
    # Keep only the part after the last '+' (drops prefixes such as 'BCDEEE+')
    name = name.split('+')[-1]

    # If font is embedded and is one of the fonts under investigation
    if ext != 'n/a' and name.startswith("TimesNewRomanFake"):
        # Write the font program to disk; the context manager closes the file
        # even if the write fails
        # NOTE(review): entries that share a name map to the same file, so a
        # later entry overwrites an earlier one - confirm this is acceptable.
        fontfile = name + "." + ext
        with open("../fonts/" + fontfile, "wb") as f:
            f.write(content)

        # Record (file name, font name); the set collapses duplicates
        embedded_fonts.add((fontfile, name))

embedded_fonts

(5, 'ttf', 'TrueType', 'BCDEEE+TimesNewRomanPSMT', 'F1', 'WinAnsiEncoding')
(17, 'ttf', 'Type0', 'BCDIEE+TimesNewRomanFake0PSMT', 'F5', 'Identity-H')
(11, 'ttf', 'TrueType', 'BCDGEE+Arial-BoldMT', 'F3', 'WinAnsiEncoding')
(13, 'ttf', 'TrueType', 'BCDHEE+TimesNewRomanFake0PSMT', 'F4', 'WinAnsiEncoding')
(54, 'ttf', 'Type0', 'BCDLEE+SymbolMT', 'F8', 'Identity-H')
(42, 'ttf', 'Type0', 'BCDKEE+CambriaMath', 'F7', 'Identity-H')
(22, 'ttf', 'TrueType', 'BCDJEE+ArialMT', 'F6', 'WinAnsiEncoding')
(9, 'ttf', 'TrueType', 'BCDFEE+TimesNewRomanPS-BoldMT', 'F2', 'WinAnsiEncoding')


{('TimesNewRomanFake0PSMT.ttf', 'TimesNewRomanFake0PSMT')}

In [None]:
def draw_char(char, typeface, size):
    """Render ``char`` centered on a solid-color canvas and return it as an array.

    Args:
        char: Text to draw.
        typeface: Font file path handed to ``ImageFont.truetype``.
        size: Font size handed to ``ImageFont.truetype``. It also fixes the
            canvas: ``int(size * 1.5)`` pixels high and three times that wide.

    Returns:
        The rendered RGB image converted with ``np.asarray``.
    """
    # Set canvas size
    canvas_h = int(size * 1.5)
    canvas_w = canvas_h * 3
    # Set font
    font = ImageFont.truetype(typeface, size)
    # Make empty image
    img = Image.new('RGB', (canvas_w, canvas_h), color='#B7C274')
    # Draw text to image
    draw = ImageDraw.Draw(img)
    # getbbox yields (left, top, right, bottom) relative to the drawing
    # origin, not (x, y, width, height). Centering the ink box therefore has
    # to account for the left/top offsets as well as the right/bottom edges.
    left, top, right, bottom = font.getbbox(char)
    x = (canvas_w - (left + right)) / 2
    y = (canvas_h - (top + bottom)) / 2
    draw.text((x, y), char, fill='#483d8b', font=font)

    return np.asarray(img)

# Candidate characters: ASCII digits followed by ASCII letters
chars = list(string.digits + string.ascii_letters)

# hashmap: font name -> set of characters flagged as fake
# setmap: characters that have an entry in the font record's swap table
hashmap = {}
setmap = set(font_doc['swaps'].keys())

# Ground-truth and predicted labels ('Real' / 'Fake'), appended in lockstep
y_true = []
y_pred = []

# Loop through embedded fonts
for fontfile, fontname in embedded_fonts:
    hashmap[fontname] = set()
    # Loop through characters
    for char in tqdm(chars):
        # Ground truth, computed once so every branch labels it the same way:
        # a character is fake only when the swap table maps it to a different
        # character (an entry that maps a character to itself counts as real).
        is_fake = char in setmap and font_doc['swaps'][char] != char

        # Render the character four times in a row
        img = draw_char(char * 4, '../fonts/' + fontfile, 250)
        # Detect characters with OCR
        result = reader.readtext(img, allowlist=chars)

        # Only the text of the first detection is used; an empty text is
        # treated the same as no detection at all
        text = result[0][1] if len(result) > 0 else ''

        if text:
            # Most frequent recognized character, compared case-insensitively
            most_char = collections.Counter(text).most_common(1)[0][0]
            predicted_fake = most_char.lower() != char.lower()
        else:
            # Nothing recognized (the original comment here read
            # "Char in font not embedded").
            # NOTE(review): the prediction is copied from the ground truth in
            # this case, so these samples can never count as errors - confirm
            # this is intended, since it inflates the reported metrics.
            predicted_fake = is_fake

        if predicted_fake:
            # Add to hash map
            hashmap[fontname].add(char)

        y_true.append('Fake' if is_fake else 'Real')
        y_pred.append('Fake' if predicted_fake else 'Real')

        # Print misclassified characters (false negatives and false positives)
        if is_fake != predicted_fake:
            print(char)

    # Drop fonts for which nothing was flagged
    if len(hashmap[fontname]) == 0:
        del hashmap[fontname]

# Report classification quality; 'Fake' is the positive class for the
# precision / recall / F1 scores
print("Accuracy Score :", accuracy_score(y_true, y_pred))
for metric_label, metric_fn in (
    ("Precision Score :", precision_score),
    ("Recall Score :", recall_score),
    ("F1 Score :", f1_score),
):
    print(metric_label, metric_fn(y_true, y_pred, pos_label='Fake'))

# Confusion matrix of ground truth against prediction
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap='gray')
plt.show()

In [None]:
# Character inspected in this cell
sample_char = 'Y'

# Render characters
img = draw_char(sample_char * 4, '../fonts/6f711bf6f241371420c568b82facf04e.ttf', 250)
# Detect characters with OCR
result = reader.readtext(img, allowlist=chars)

# Draw bounding boxes on a PIL copy so `img` keeps holding the rendered array
annotated = Image.fromarray(np.uint8(img)).convert('RGB')
draw = ImageDraw.Draw(annotated)
for item in result:
    # item[0] holds the box corners; corner 0 and corner 2 are used as the
    # opposite corners of the rectangle
    top_left = (item[0][0][0], item[0][0][1])
    bottom_right = (item[0][2][0], item[0][2][1])
    draw.rectangle((top_left, bottom_right), outline='red')
    # Print the text recognized for THIS box (previously the first
    # detection's text was printed for every box)
    print(sample_char, "-->", item[1])

plt.imshow(annotated, cmap='gray')
plt.show()

In [None]:
from sklearn.metrics import RocCurveDisplay


def _label_is_fake(label):
    return label == 'Fake'


# Element-wise conversion of 'Real' / 'Fake' labels to booleans
isFakeTransform = np.vectorize(_label_is_fake)

# ROC curve built from the boolean ground truth and boolean predictions
roc_truth = isFakeTransform(y_true)
roc_pred = isFakeTransform(y_pred)
RocCurveDisplay.from_predictions(roc_truth, roc_pred)

In [None]:
# For every font that had characters flagged, show the size of the swap
# table and the swap-table characters that were NOT flagged
for font_key, flagged_chars in hashmap.items():
    swap_table = font_doc['swaps']
    print(len(swap_table))
    unflagged = set(swap_table.keys()).difference(flagged_chars)
    print(font_key, unflagged)

In [None]:
# Release the PDF opened with fitz.open in the font-extraction cell; cells
# that call methods on `pdf` must run before this one.
pdf.close()