In [0]:
%pip install PyMuPDF
%restart_python

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# Run configuration. CATALOG and SCHEMA are the first two parts of every
# three-part table name (`<catalog>.<schema>.<table>`) used in this notebook.
CATALOG: str = "marcell"
SCHEMA: str = "marine_ai_poc"
# Value matched against the `project_id` column when selecting reference rows.
PROJECT_ID: str = "EN010109"

In [0]:
# Load the reference rows of the configured project and rebuild `title` from
# `link_text`: first strip every parenthesised fragment (together with the
# whitespace in front of it), then remove any remaining newline characters.
df_reference = (
    spark.table(f"{CATALOG}.{SCHEMA}.document_reference_with_paths")
    .filter(F.col("project_id") == PROJECT_ID)
    .drop("title")
    .withColumn(
        "title",
        F.replace(
            F.regexp_replace(F.col("link_text"), r"\s*\([^)]*\)", ""),
            F.lit("\n"),
            F.lit(""),
        ),
    )
)

In [0]:
@F.udf(returnType=T.ArrayType(T.StringType()))
def convert_doc_to_text(pdf_path):
  """Extract the plain text of every page of a PDF.

  Args:
    pdf_path: Path of the PDF file to open with PyMuPDF. May be None.

  Returns:
    A list with one string per page, in document order. None when
    `pdf_path` is None. When the file cannot be opened or read, a
    single-element list containing "<ExceptionType>: <message>" is returned
    instead of raising, so one bad file does not abort the whole job.
  """
  # Imported inside the function so the module is resolved in whichever
  # Python process actually executes the UDF body.
  import pymupdf

  # A null path has nothing to parse; yield a null array rather than failing.
  if pdf_path is None:
    return None

  try:
    # The context manager closes the document even if text extraction fails.
    with pymupdf.open(pdf_path) as doc:
      return [page.get_text() for page in doc]
  except Exception as e:
    # The declared element type is string, so the error must be rendered as
    # text; returning the exception object itself does not match the schema.
    return [f"{type(e).__name__}: {e}"]

  

In [0]:
# Add a `pages` column holding the per-page text of each row's `pdf_path`.
# The rows are redistributed over 1000 partitions before the UDF is applied
# (presumably to spread the PDF parsing across the cluster - confirm the
# number suits the cluster size).
df_reference = (
    df_reference
    .repartition(1000)
    .withColumn("pages", convert_doc_to_text(F.col("pdf_path")))
)

In [0]:
# Persist the parsed documents. The write uses append mode, so running this
# cell again adds the same rows to the table a second time.
df_reference.write.mode("append").saveAsTable(
    f"{CATALOG}.{SCHEMA}.documents_parsed"
)

In [0]:
# Turn each parsed document into one row per page.
#
# Only the rows of the configured project are read back. `documents_parsed`
# is written in append mode, so it can hold rows from earlier runs for other
# projects; without this filter their pages would be appended to
# `document_pages_parsed` again on every run.
#
# `posexplode` yields the zero-based position of each array element as
# `page_number` and the element itself as `page_text`. `page_id` joins the
# `filename` column and the page number with an underscore.
df_reference_pages = (
  spark.table(f"{CATALOG}.{SCHEMA}.documents_parsed")
  .where(F.col("project_id") == PROJECT_ID)
  .select("*", F.posexplode("pages").alias("page_number", "page_text"))
  .drop("pages")
  .withColumn("page_id", F.expr("concat(filename, '_', page_number)"))
)

In [0]:
# Persist the per-page rows. The write uses append mode, so running this cell
# again adds the same pages to the table a second time.
(
    df_reference_pages.write
    .mode("append")
    # NOTE(review): `overwriteSchema` is paired with append mode here; on Delta
    # tables that option is documented for overwrite writes - confirm whether
    # `mergeSchema` (or no option at all) was intended.
    .option("overwriteSchema", "true")
    .saveAsTable(f"{CATALOG}.{SCHEMA}.document_pages_parsed")
)