In [1]:
!apt-get install tesseract-ocr tesseract-ocr-heb poppler-utils
!pip install pdf2image pillow pytesseract stanza python-bidi transformers torch pandas huggingface_hub datasets

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-heb tesseract-ocr-osd
0 upgraded, 5 newly installed, 0 to remove and 18 not upgraded.
Need to get 5,434 kB of archives.
After this operation, 17.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy/universe amd64 te

In [2]:
PDF_DIR = '/content/drive/MyDrive/tzfira/'


In [3]:
# Cell 1 - Setup and PDF Processing
!apt-get install tesseract-ocr tesseract-ocr-heb poppler-utils
!pip install pdf2image pillow pytesseract stanza python-bidi pandas

from google.colab import drive
drive.mount('/content/drive')


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-heb is already the newest version (1:4.00~git30-7274cfa-1.1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
Mounted at /content/drive


In [4]:

import os
import numpy as np
from PIL import Image, ImageEnhance, ImageOps
import pytesseract
import bidi.algorithm as bidi
from pdf2image import convert_from_path, pdfinfo_from_path
import stanza
import re
import pandas as pd
import glob

PDF_DIR = '/content/drive/MyDrive/tzfira/'

# Create output directories
for dir_name in ['ocr_output', 'cleaned_texts', 'pos_tagged']:
   os.makedirs(dir_name, exist_ok=True)

print(f"Looking for PDFs in: {PDF_DIR}")
pdfs = glob.glob(f'{PDF_DIR}/*.pdf')
print(f"Found {len(pdfs)} PDF files")

# Process each PDF
for pdf_path in pdfs:
   print(f"\nProcessing {pdf_path}")
   base_name = os.path.splitext(os.path.basename(pdf_path))[0]

   # 1. Split and OCR
   info = pdfinfo_from_path(pdf_path)
   ocr_texts = []

   for page in range(1, info["Pages"] + 1):
       print(f"Page {page}/{info['Pages']}")
       images = convert_from_path(pdf_path, dpi=300, first_page=page, last_page=page)
       page_image = images[0]

       # Split page
       width = page_image.size[0]
       right_half = page_image.crop((width//2, 0, width, page_image.size[1]))
       left_half = page_image.crop((0, 0, width//2, page_image.size[1]))

       # OCR each half
       for half in [right_half, left_half]:
           # Preprocess
           half = half.convert('L')
           half = ImageEnhance.Contrast(half).enhance(2.0)
           half = ImageOps.autocontrast(half)

           # OCR
           text = pytesseract.image_to_string(half, lang='heb')
           text = bidi.get_display(text).strip()
           if text:
               ocr_texts.append(text)

   # Save OCR output
   ocr_path = f'ocr_output/{base_name}_ocr.txt'
   with open(ocr_path, 'w', encoding='utf-8') as f:
       f.write('\n\n'.join(ocr_texts))
   print(f"Saved OCR text to {ocr_path}")

   # 2. Clean text
   with open(ocr_path, 'r', encoding='utf-8') as f:
       text = f.read()

   cleaned = re.sub(r'[^\u0590-\u05FF\uFB1D-\uFB4F\s\.\,\?\!\-]', ' ', text)
   cleaned = re.sub(r'\s+', ' ', cleaned).strip()

   clean_path = f'cleaned_texts/{base_name}_clean.txt'
   with open(clean_path, 'w', encoding='utf-8') as f:
       f.write(cleaned)
   print(f"Saved cleaned text to {clean_path}")

   # 3. POS tag
   nlp = stanza.Pipeline('he', processors='tokenize,pos')
   doc = nlp(cleaned)

   pos_path = f'pos_tagged/{base_name}_pos.txt'
   with open(pos_path, 'w', encoding='utf-8') as f:
       for sent in doc.sentences:
           for word in sent.words:
               pos = "PPOS" if word.upos == "PRON" else word.upos
               f.write(f"{word.text} --> {pos}\n")
   print(f"Saved POS tags to {pos_path}")

print("\nProcessing complete!")

Looking for PDFs in: /content/drive/MyDrive/tzfira/
Found 50 PDF files

Processing /content/drive/MyDrive/tzfira/18620501_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620501_01_ocr.txt
Saved cleaned text to cleaned_texts/18620501_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json


Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/tokenize/combined.pt:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/mwt/combined.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/pos/combined_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/forward_charlm/oscar.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/backward_charlm/oscar.pt:   0%…

Downloading https://huggingface.co/stanfordnlp/stanza-he/resolve/v1.10.0/models/pretrain/conll17.pt:   0%|    …

INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620501_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620508_01.pdf
Page 1/12
Page 2/12
Page 3/12
Page 4/12
Page 5/12
Page 6/12
Page 7/12
Page 8/12
Page 9/12
Page 10/12
Page 11/12
Page 12/12


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620508_01_ocr.txt
Saved cleaned text to cleaned_texts/18620508_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620508_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620515_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620515_01_ocr.txt
Saved cleaned text to cleaned_texts/18620515_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620515_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620522_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620522_01_ocr.txt
Saved cleaned text to cleaned_texts/18620522_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620522_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620529_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620529_01_ocr.txt
Saved cleaned text to cleaned_texts/18620529_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620529_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620703_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620703_01_ocr.txt
Saved cleaned text to cleaned_texts/18620703_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620703_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620710_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620710_01_ocr.txt
Saved cleaned text to cleaned_texts/18620710_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620710_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620717_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620717_01_ocr.txt
Saved cleaned text to cleaned_texts/18620717_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620717_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620724_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620724_01_ocr.txt
Saved cleaned text to cleaned_texts/18620724_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620724_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741202_01.pdf
Page 1/10
Page 2/10
Page 3/10
Page 4/10
Page 5/10
Page 6/10
Page 7/10
Page 8/10
Page 9/10
Page 10/10


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741202_01_ocr.txt
Saved cleaned text to cleaned_texts/18741202_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741202_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741209_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741209_01_ocr.txt
Saved cleaned text to cleaned_texts/18741209_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741209_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741216_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741216_01_ocr.txt
Saved cleaned text to cleaned_texts/18741216_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741216_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741230_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741230_01_ocr.txt
Saved cleaned text to cleaned_texts/18741230_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741230_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741104_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741104_01_ocr.txt
Saved cleaned text to cleaned_texts/18741104_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741104_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741111_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741111_01_ocr.txt
Saved cleaned text to cleaned_texts/18741111_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741111_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741118_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741118_01_ocr.txt
Saved cleaned text to cleaned_texts/18741118_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741118_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741125_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741125_01_ocr.txt
Saved cleaned text to cleaned_texts/18741125_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741125_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741007_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741007_01_ocr.txt
Saved cleaned text to cleaned_texts/18741007_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741007_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741014_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741014_01_ocr.txt
Saved cleaned text to cleaned_texts/18741014_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741014_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741021_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741021_01_ocr.txt
Saved cleaned text to cleaned_texts/18741021_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741021_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18741028_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18741028_01_ocr.txt
Saved cleaned text to cleaned_texts/18741028_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18741028_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740902_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740902_01_ocr.txt
Saved cleaned text to cleaned_texts/18740902_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740902_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740909_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740909_01_ocr.txt
Saved cleaned text to cleaned_texts/18740909_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740909_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740916_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740916_01_ocr.txt
Saved cleaned text to cleaned_texts/18740916_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740916_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740923_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740923_01_ocr.txt
Saved cleaned text to cleaned_texts/18740923_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740923_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740826_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740826_01_ocr.txt
Saved cleaned text to cleaned_texts/18740826_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740826_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740805_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740805_01_ocr.txt
Saved cleaned text to cleaned_texts/18740805_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740805_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740812_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740812_01_ocr.txt
Saved cleaned text to cleaned_texts/18740812_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740812_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18740819_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18740819_01_ocr.txt
Saved cleaned text to cleaned_texts/18740819_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18740819_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620306_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620306_01_ocr.txt
Saved cleaned text to cleaned_texts/18620306_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620306_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620313_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620313_01_ocr.txt
Saved cleaned text to cleaned_texts/18620313_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620313_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620320_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620320_01_ocr.txt
Saved cleaned text to cleaned_texts/18620320_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620320_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620327_01.pdf
Page 1/9
Page 2/9
Page 3/9
Page 4/9
Page 5/9
Page 6/9
Page 7/9
Page 8/9
Page 9/9


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620327_01_ocr.txt
Saved cleaned text to cleaned_texts/18620327_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620327_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620204_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620204_01_ocr.txt
Saved cleaned text to cleaned_texts/18620204_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620204_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620211_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620211_01_ocr.txt
Saved cleaned text to cleaned_texts/18620211_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620211_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620219_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620219_01_ocr.txt
Saved cleaned text to cleaned_texts/18620219_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620219_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18620226_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18620226_01_ocr.txt
Saved cleaned text to cleaned_texts/18620226_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18620226_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750602_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750602_01_ocr.txt
Saved cleaned text to cleaned_texts/18750602_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750602_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750608_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750608_01_ocr.txt
Saved cleaned text to cleaned_texts/18750608_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750608_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750616_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750616_01_ocr.txt
Saved cleaned text to cleaned_texts/18750616_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750616_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750623_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750623_01_ocr.txt
Saved cleaned text to cleaned_texts/18750623_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750623_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750630_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750630_01_ocr.txt
Saved cleaned text to cleaned_texts/18750630_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750630_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750505_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750505_01_ocr.txt
Saved cleaned text to cleaned_texts/18750505_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750505_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750512_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750512_01_ocr.txt
Saved cleaned text to cleaned_texts/18750512_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750512_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750519_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750519_01_ocr.txt
Saved cleaned text to cleaned_texts/18750519_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750519_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750526_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750526_01_ocr.txt
Saved cleaned text to cleaned_texts/18750526_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750526_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750407_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750407_01_ocr.txt
Saved cleaned text to cleaned_texts/18750407_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750407_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750414_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750414_01_ocr.txt
Saved cleaned text to cleaned_texts/18750414_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750414_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750428_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750428_01_ocr.txt
Saved cleaned text to cleaned_texts/18750428_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750428_01_pos.txt

Processing /content/drive/MyDrive/tzfira/18750620_01.pdf
Page 1/8
Page 2/8
Page 3/8
Page 4/8
Page 5/8
Page 6/8
Page 7/8
Page 8/8


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Saved OCR text to ocr_output/18750620_01_ocr.txt
Saved cleaned text to cleaned_texts/18750620_01_clean.txt


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: he (Hebrew):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Done loading processors!


Saved POS tags to pos_tagged/18750620_01_pos.txt

Processing complete!


In [9]:
# HuggingFace Upload - Simple Version
from huggingface_hub import HfApi, login
from datasets import Dataset
import os
import glob

# Configuration
HF_TOKEN = "addtoken"
REPO_NAME = "mbole/hebrew-tzfira-dataset"

# Login and connect
login(token=HF_TOKEN)
api = HfApi()

# Create repository
print("Creating repository...")
try:
    api.create_repo(
        repo_id=REPO_NAME,
        token=HF_TOKEN,
        repo_type="dataset",
        private=False
    )
except Exception as e:
    print(f"Repository might already exist: {e}")

# Prepare dataset
print("Preparing dataset...")
data = []
for ocr_file in glob.glob('ocr_output/*_ocr.txt'):
    base_name = os.path.basename(ocr_file).replace('_ocr.txt', '')

    with open(ocr_file, 'r', encoding='utf-8') as f:
        ocr_text = f.read()
    with open(f'cleaned_texts/{base_name}_clean.txt', 'r', encoding='utf-8') as f:
        cleaned_text = f.read()
    with open(f'pos_tagged/{base_name}_pos.txt', 'r', encoding='utf-8') as f:
        pos_text = f.read()

    data.append({
        'id': base_name,
        'ocr_text': ocr_text,
        'cleaned_text': cleaned_text,
        'pos_tagged': pos_text
    })

# Create and upload dataset
dataset = Dataset.from_list(data)
print("Uploading dataset...")
dataset.push_to_hub(REPO_NAME, token=HF_TOKEN)

print(f"Upload complete! Dataset available at: https://huggingface.co/datasets/{REPO_NAME}")

Creating repository...
Repository might already exist: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67a3448a-6b5e88a7254cc7a310048776;4ea947ba-0e52-487c-a5b6-8933f56c3499)

You already created this dataset repo
Preparing dataset...
Uploading dataset...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Upload complete! Dataset available at: https://huggingface.co/datasets/mbole/hebrew-tzfira-dataset
