<a href="https://colab.research.google.com/github/nxxk23/AI-Engineer/blob/main/extract/gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported further down: easyocr (OCR),
# pdf2image (PDF pages to images), gradio (web UI) and pythainlp (NER tagging).
!pip install easyocr pdf2image gradio pythainlp

In [10]:
# Same four Python packages as the install cell above, listed in a different order.
!pip install easyocr gradio pythainlp pdf2image
# poppler-utils is a system (apt) package, not a pip one; -y skips the confirmation prompt.
!apt-get install -y poppler-utils  # Install poppler for PDF processing

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 0s (408 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123597 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [13]:
# Repeats the poppler-utils install from the previous cell; the recorded output
# below reports that the package was already at the newest version.
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Call Colab's file-upload helper and keep whatever it returns.
# NOTE(review): uploaded_files is not read by any later cell visible in this
# notebook — confirm this cell is still needed.
from google.colab import files
uploaded_files = files.upload()

In [5]:
# Mount Google Drive at /content/drive; the resume folder listed in the next
# cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os

# Drive folder that holds the resume PDFs.
resume_directory = '/content/drive/MyDrive/AIEngineer/resume/resume_LLM'

# Full path of every directory entry whose name ends in ".pdf".
pdf_files = [
    os.path.join(resume_directory, entry_name)
    for entry_name in os.listdir(resume_directory)
    if entry_name.endswith('.pdf')
]

# Show the collected paths so the listing can be checked by eye.
print(pdf_files)

['/content/drive/MyDrive/AIEngineer/resume/resume_LLM/Resume BA 2.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/Resume BA 3.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/Resume ba 5.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/Resume ba2.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/Resume baa.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 6.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 7.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 2.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 3.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 4.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 5.pdf', '/content/drive/MyDrive/AIEngineer/resume/resume_LLM/resume 1.pdf']


In [18]:
import os
import re
import numpy as np
import pandas as pd
import easyocr
from pdf2image import convert_from_path
from pythainlp.phayathaibert.core import NamedEntityTagger
import gradio as gr

# Shared EasyOCR reader; stays None until the first OCR call builds it.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared Thai/English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['th', 'en'], gpu=True)  # Enable GPU if available
    return _OCR_READER


def extract_text_from_image(image):
    """Run OCR on one page image and return its text, one detection per line.

    The reader is built once and reused for every page, so the OCR models are
    not reloaded on each call.

    Args:
        image: Anything ``np.array`` can turn into an image array (the caller
            in this notebook passes the pages from ``convert_from_path``).

    Returns:
        The recognised text fragments joined with newlines, ordered by the
        y coordinate of each detection's first bounding-box corner. Returns
        an empty string if anything fails; the error is printed, not raised.
    """
    try:
        # Obtained inside the try so a reader-construction failure is reported
        # the same way as any other OCR failure.
        reader = _get_ocr_reader()
        detections = reader.readtext(np.array(image))
        # Each detection is unpacked as (box, text, score); box[0][1] is the y
        # value of the first corner, so sorting on it orders text top-down.
        top_to_bottom = sorted(detections, key=lambda det: det[0][0][1])
        return "\n".join(text for _, text, _ in top_to_bottom)
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""

# Labels, headings and punctuation that are stripped from the cleaned text.
_UNWANTED_TERMS = [
    'ที่อยู่', 'โทรศัพท์', 'อีเมล', 'linkedin', ':', ',', '-', '|',
    'ประวัติส่วนตัว', 'เกี่ยวกับฉัน', 'about me', 'ชื่อ', 'สกุล', 'tell', 'โทร', 'โทรงาน',
    'ชื่อเล่น', 'อายุ', 'วันเกิด', 'พุทธ', 'ศาสนา', 'สัญชาติ', 'phone',
    'ช่องทางการติดต่อ', '_', 're sume', 'resume', 'resu me'
]
# Longest terms first: regex alternation takes the first alternative that
# matches, so a short term listed before a longer one it prefixes (e.g. 'โทร'
# before 'โทรงาน') would otherwise leave the tail of the longer term behind.
# NOTE(review): matching is case-sensitive, so e.g. "RESUME" is kept — confirm
# whether that is intended.
_UNWANTED_PATTERN = '|'.join(
    map(re.escape, sorted(_UNWANTED_TERMS, key=len, reverse=True))
)

# Shared NER tagger; stays None until the first PDF is processed.
_NER_TAGGER = None


def _get_ner_tagger():
    """Return the shared NamedEntityTagger, creating it on first use."""
    global _NER_TAGGER
    if _NER_TAGGER is None:
        _NER_TAGGER = NamedEntityTagger()
    return _NER_TAGGER


def _tag_and_clean_text(tagger, text):
    """Tag `text` with `tagger` and return (tagged text, cleaned text).

    On any failure the error is printed and ("", text) is returned, i.e. the
    untouched input stands in for the cleaned text.
    """
    try:
        ner = tagger.get_ner(text, tag=True)
        # Drop every tagged span except ORGANIZATION / PERCENT / TIME ...
        pattern = r'<(?!ORGANIZATION|PERCENT|TIME)[^>]+>[^<]+</[^>]+>'
        cleaned_ner = re.sub(pattern, '', ner)
        # ... then unwrap the kept spans, leaving only their inner text.
        cleaned_ner = re.sub(r'</?(ORGANIZATION|PERCENT|TIME)>', '', cleaned_ner)
        cleaned_ner = re.sub(_UNWANTED_PATTERN, '', cleaned_ner)
        # Rewrite a standalone "al" (any case) as "ai".
        cleaned_ner = re.sub(r'\bal\b', 'ai', cleaned_ner, flags=re.IGNORECASE)
        return ner.strip(), cleaned_ner.strip()
    except Exception as e:
        print(f"Error in tag_and_clean_text: {e}")
        return "", text


def process_single_pdf(pdf_file):
    """OCR one PDF and return a one-row DataFrame with its raw and cleaned text.

    Args:
        pdf_file: Either a path (``str`` / ``os.PathLike``) or an object whose
            ``.name`` attribute is the path of the PDF.

    Returns:
        A DataFrame with the columns "PDF File" (base name of the path),
        "Raw_Text" (OCR text of every page, each followed by a newline) and
        "Cleaned_Text". If rendering or OCR of the PDF raises, the error is
        printed and an empty DataFrame is returned instead.
    """
    # Built outside the try on purpose: a tagger that cannot be constructed
    # propagates to the caller rather than being reported once per PDF.
    tagger = _get_ner_tagger()

    results = []
    try:
        # Plain paths are used as-is; anything else is expected to carry the
        # path in `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        images = convert_from_path(pdf_path, dpi=300)
        raw_text = "".join(extract_text_from_image(image) + "\n" for image in images)
        _, cleaned_text = _tag_and_clean_text(tagger, raw_text)
        results.append({
            "PDF File": os.path.basename(pdf_path),
            "Raw_Text": raw_text,
            "Cleaned_Text": cleaned_text
        })

    except Exception as e:
        print(f"Error in process_single_pdf: {e}")

    return pd.DataFrame(results)

def process_multiple_pdfs(pdf_files):
    """Process every PDF and stack the per-file rows into one DataFrame.

    Args:
        pdf_files: Iterable of items accepted by ``process_single_pdf``.
            ``None`` or an empty collection (nothing uploaded) is allowed.

    Returns:
        One DataFrame holding the rows of every PDF that produced a result,
        re-indexed from 0. An empty DataFrame when there is no input or no
        PDF yielded a row.
    """
    if not pdf_files:
        return pd.DataFrame()

    per_file_frames = [process_single_pdf(pdf_file) for pdf_file in pdf_files]
    # PDFs that failed come back as empty frames; leave them out so a single
    # concat at the end only sees real rows.
    non_empty_frames = [frame for frame in per_file_frames if not frame.empty]
    if not non_empty_frames:
        return pd.DataFrame()
    return pd.concat(non_empty_frames, ignore_index=True)

def gradio_interface(pdf_files):
    """Gradio callback: process the uploaded PDFs and save the result as CSV.

    Args:
        pdf_files: The uploaded files, passed straight to
            ``process_multiple_pdfs``.

    Returns:
        A ``(DataFrame, csv_path)`` pair for the table and file outputs.
        ``csv_path`` is ``None`` when no CSV was written. If processing
        raises, the error is printed and ``(empty DataFrame, None)`` is
        returned.
    """
    try:
        df = process_multiple_pdfs(pdf_files)
        csv_path = save_dataframe(df)
        # save_dataframe reports failure with "", which is not a usable file
        # path; hand the file output None so it is simply left empty.
        return df, (csv_path or None)
    except Exception as e:
        print(f"Error in gradio_interface: {e}")
        return pd.DataFrame(), None


def save_dataframe(df, csv_path='/content/output.csv'):
    """Write `df` to a CSV file and return the path that was written.

    Args:
        df: DataFrame to save.
        csv_path: Destination file. Defaults to '/content/output.csv', the
            location this function always used before the parameter existed.

    Returns:
        `csv_path` on success, or "" if writing failed (the error is printed).
    """
    try:
        # utf-8-sig puts a BOM at the start of the file; the index is left out.
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        return csv_path
    except Exception as e:
        print(f"Error saving DataFrame: {e}")
        return ""

def save_button_action(df):
    """Save `df` through ``save_dataframe`` and pass its return value back."""
    return save_dataframe(df)


In [None]:
# Components are created in the same order as before: input first, then outputs.
pdf_upload = gr.Files(label="Upload PDF Files")  # Allow multiple files
results_table = gr.Dataframe(label="Extracted and Cleaned Data", height=500, min_width=800)
csv_download = gr.File(label="Download CSV")  # Add a file download button

# Wire the upload box to gradio_interface, whose two return values fill the
# table and the download slot.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[pdf_upload],
    outputs=[results_table, csv_download],
    title="PDF to Text and Data Cleaner",
)

# share=True requests a public link; debug=True keeps this cell running so
# errors and logs appear in its output (see the recorded output below).
iface.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://5f01a29b36f400adbc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  net.load_state_dict(copyStateDict(torch.load