## Example 1:  Installation without conda/pip/apt-get.

In [None]:
# For those who cannot use conda, pip or apt-get to install packages:
# download the whole environment folder from GitHub,
# and then append the path to its modules to sys.path.

# sys.path explanation:
# When a Python interpreter starts, one of the things it creates automatically is a list
# of all the directories it will search for modules when importing.
# This list is available in a variable named sys.path.

import sys

# Folder holding the bundled third-party modules, relative to this notebook.
bundled_site_packages = "../tesserocr_env/lib/python3.7/site-packages"

# Register the folder only once, so re-running the cell does not add duplicates.
if bundled_site_packages not in sys.path:
    sys.path.append(bundled_site_packages)

# Show the resulting module search path.
print(sys.path)

## Example 2: Image to text. 

In [None]:
from tesserocr import PyTessBaseAPI
# only for displaying images.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

images = ["input/sample.jpg", "input/sample2.jpg", "input/sample3.jpg"]

with PyTessBaseAPI() as api:
    for image in images:
        api.SetImageFile(image)
        
        %pylab inline
        img=mpimg.imread(image)
        imgplot = plt.imshow(img)
        plt.show()
        
        print(api.GetUTF8Text())
        print(api.AllWordConfidences())
# api is automatically finalized when used in a with-statement (context manager).
# otherwise api.End() should be explicitly called when it's no longer needed.

## Example 3: pdf to image. 

In [None]:
from pdf2image import convert_from_path, convert_from_bytes
from PIL import Image

from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
'''
We need to set the [poppler_path] argument so that the [pdfinfo] executable (from the poppler installation) can be found.
That is, [poppler_path] is the path to the folder containing [pdfinfo].

Directory tree structure of this project:

/tesserocr
├── src
│   ├── example.ipynb
│
└── tesserocr_env
    ├── bin
    │   ├── pdfinfo

Therefore, when navigating from example.ipynb, "../tesserocr_env/bin" is the folder containing pdfinfo.
''' 
## Convert the input pdf (read from /tesserocr/src/input) into a list of images.
images = convert_from_path('input/pdf2image.pdf', poppler_path="../tesserocr_env/bin")

## Save every image to /tesserocr/src/input as pdf2image<N>.jpg, with N counted from 1.
for page_number, image in enumerate(images, start=1):
    # Optional preview of the page: image.show()
    image.save("input/pdf2image" + str(page_number) + ".jpg")

## Example 4: Detect image orientation automatically (e.g. when the image is rotated by 90 degrees).

In [None]:
# remember to import PSM.
from tesserocr import PyTessBaseAPI, PSM
# only for displaying images.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

images = ["input/pdf2image3.jpg", "input/pdf2image4.jpg"]

# By specifying psm=PSM.AUTO_OSD, we can detect images even when some of them are rotated by 90 degree.
# There are other page segmentation modes (PSMs):
'''
0 : OSD_ONLY: Orientation and script detection only.
1 : AUTO_OSD: Automatic page segmentation with orientation and script detection. (OSD)
2 : AUTO_ONLY: Automatic page segmentation, but no OSD, or OCR.
3 : AUTO: Fully automatic page segmentation, but no OSD. (default mode for tesserocr)
4 : SINGLE_COLUMN: Assume a single column of text of variable sizes.
5 : SINGLE_BLOCK_VERT_TEXT: Assume a single uniform block of vertically aligned text.
6 : SINGLE_BLOCK: Assume a single uniform block of text.
7 : SINGLE_LINE: Treat the image as a single text line.
8 : SINGLE_WORD: Treat the image as a single word.
9 : CIRCLE_WORD: Treat the image as a single word in a circle.
10 : SINGLE_CHAR: Treat the image as a single character.
11 : SPARSE_TEXT: Find as much text as possible in no particular order.
12 : SPARSE_TEXT_OSD: Sparse text with orientation and script detection
13 : RAW_LINE: Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
'''

with PyTessBaseAPI(psm=PSM.AUTO_OSD) as api:
    for image in images:
        api.SetImageFile(image)
        
        %pylab inline
        img=mpimg.imread(image)
        imgplot = plt.imshow(img)
        plt.show()
        
        print(api.GetUTF8Text())
        print(api.AllWordConfidences())
# api is automatically finalized when used in a with-statement (context manager).
# otherwise api.End() should be explicitly called when it's no longer needed.