### NumPy


In [None]:
import numpy as np

# Sample values to summarise
numbers = [100, 102, 98, 97, 103]

# Print the standard deviation first, then the mean, one value per line
for statistic in (np.std, np.mean):
    print(statistic(numbers))

### Image processing and text recognition


In [None]:
from PIL import Image, ImageFilter

# Input photo and the file the blurred copy is written to
sourcePath, blurredPath = 'kitten.jpg', 'kitten_blurred.jpg'

# Apply Pillow's Gaussian blur filter to the opened photo
kitten = Image.open(sourcePath)
blurryKitten = kitten.filter(ImageFilter.GaussianBlur)

# Write the blurred copy to disk, then display it
blurryKitten.save(blurredPath)
blurryKitten.show()

In [None]:
from PIL import Image
import pytesseract

# Run OCR over the sample image and print the recognised text
testImage = Image.open('files/test.png')
recognisedText = pytesseract.image_to_string(testImage)
print(recognisedText)

In [None]:
from PIL import Image
import pytesseract
from pytesseract import Output

samplePath = 'files/test.png'

# image_to_data output requested as a dict (Output.DICT)
wordData = pytesseract.image_to_data(Image.open(samplePath),
                                     output_type=Output.DICT)
print(wordData)

# image_to_string output requested as bytes (Output.BYTES)
rawText = pytesseract.image_to_string(Image.open(samplePath),
                                      output_type=Output.BYTES)
print(rawText)

In [None]:
from PIL import Image
import pytesseract


def cleanFile(filePath, newFilePath):
    """Threshold the image at filePath, save it to newFilePath, return it.

    Every value below 143 is mapped to 0 and every other value to 255.
    """

    def toBlackOrWhite(value):
        return 0 if value < 143 else 255

    thresholded = Image.open(filePath).point(toBlackOrWhite)
    thresholded.save(newFilePath)
    return thresholded


# Threshold the noisy sample (the cleaned copy is also written to disk)
image = cleanFile(filePath='files/textBad.png',
                  newFilePath='files/textCleaned.png')
# Run tesseract OCR on the cleaned image and print what it read
recognisedText = pytesseract.image_to_string(image)
print(recognisedText)

In [None]:
import pytesseract
from pytesseract import Output
from PIL import Image
import numpy as np


def cleanFile(filePath: str, threshold: int):
    """Open filePath and return a copy with a hard threshold applied.

    Values below ``threshold`` become 0, all others become 255. Nothing is
    written to disk; the thresholded image is only returned.
    """

    def toBlackOrWhite(value):
        return 0 if value < threshold else 255

    return Image.open(filePath).point(toBlackOrWhite)


def getConfidence(image: 'Image.Image'):
    """Return (character-weighted mean OCR confidence, total characters).

    Runs ``pytesseract.image_to_data`` on ``image`` and keeps every entry
    whose confidence is greater than -1. Each kept confidence is weighted by
    the length of its text, so longer words count for more.

    Returns:
        A ``(confidence, numChars)`` tuple. When no characters are
        recognised the result is ``(0.0, 0)`` rather than an error.
    """
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    confidences = []
    numChars = []
    for word, rawConf in zip(data['text'], data['conf']):
        # Normalise to float so the comparison also works if the value is a
        # numeric string — presumably depends on the pytesseract version.
        conf = float(rawConf)
        if conf > -1:
            confidences.append(conf)
            numChars.append(len(word))

    totalChars = sum(numChars)
    if totalChars == 0:
        # np.average raises ZeroDivisionError when the weights sum to zero
        return 0.0, 0
    return np.average(confidences, weights=numChars), totalChars


# Sweep settings: thresholds from start up to (but not including) end
filePath = 'files/textBad.png'
start, step, end = 80, 5, 200

for threshold in range(start, end, step):
    image = cleanFile(filePath, threshold)
    scores = getConfidence(image)
    confidence, charCount = scores[0], scores[1]
    # print() applies str() to each argument, matching the concatenated form
    print("threshold: ", threshold, ", confidence: ", confidence,
          " numChars ", charCount, sep='')

In [None]:
import time
from urllib.request import urlretrieve
from PIL import Image
import tesseract
from selenium import webdriver as wd
from selenium.webdriver.common.by import By


def getImageText(imageUrl: str):
    """Download the image at imageUrl, OCR it with tesseract, print the text.

    The image is saved as ``page.jpg`` in the current directory. The
    ``tesseract`` command is then run with output base ``page``, and the
    resulting ``page.txt`` is read and printed.
    """
    # Imported here because this cell's import list does not include it
    import subprocess

    # Use the parameter, not whatever global happens to be named `image`
    urlretrieve(imageUrl, 'page.jpg')
    # run() drains both pipes while waiting; wait() with PIPE can deadlock
    # once the child fills a pipe buffer
    subprocess.run(['tesseract', 'page.jpg', 'page'],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE)
    with open('page.txt', 'r') as f:
        print(f.read())


# Create a new Selenium driver
driver = wd.Chrome()
# URLs of page images already sent to OCR, in the order they were found
imageList = []
try:
    driver.get(
        'https://www.amazon.com/Death-Ivan-Ilyich-Nikolayevich-Tolstoy/dp/1427027277'
    )
    time.sleep(2)
    # Click on the button to preview the book
    driver.find_element(By.ID, 'imgBlkFront').click()
    # Wait for the page to load
    time.sleep(5)

    # get_attribute may return None when there is no style attribute;
    # treat that the same as "arrow not clickable"
    while 'pointer' in (driver.find_element(
            By.ID, 'sitbReaderRightPageTurner').get_attribute('style') or ''):
        # While the right arrow is available to click, turn pages
        driver.find_element(By.ID, 'sitbReaderRightPageTurner').click()
        time.sleep(2)
        # Get any new pages loaded (multiple pages can be loaded at once,
        # but duplicate pages are not added to the list).
        # find_elements (plural) returns a list of every match; find_element
        # returns a single element that cannot be measured or iterated.
        pages = driver.find_elements(By.XPATH,
                                     '//div[@class=\'pageImage\']/div/img')
        if not pages:
            print("No pages found")
        for page in pages:
            image = page.get_attribute('src')
            print('Found image: {}'.format(image))
            if image not in imageList:
                imageList.append(image)
                getImageText(image)
finally:
    # Always close the browser, even if a lookup or click fails midway
    driver.quit()

### Tesseract training


In [None]:
from PIL import Image
import subprocess
import os

# Steps to take before running:
# Set TESSDATA_PREFIX to correct directory
# Put image and box files together in the same directory
# Label each corresponding file with the same filenames

CLEANED_DIR = 'cleaned'
BOX_DIR = 'box'
EXP_DIR = 'exp'


class TesseractTrainer():
    """Run the Tesseract box-file training tools for one language and font.

    Every path used here (CLEANED_DIR, BOX_DIR, EXP_DIR and ``images``) is
    relative to the current working directory. ``runAll`` changes into
    ``directory`` before doing anything else.
    """

    def __init__(self, languageName, fontName, directory='data'):
        """Store the language name, font name and working directory."""
        self.languageName = languageName
        self.fontName = fontName
        self.directory = directory

    def runAll(self):
        """Change into ``self.directory`` and run every training step in order.

        ``cleanImages`` is not part of this sequence, so CLEANED_DIR must
        already contain the .tiff files that match the box files.
        """
        os.chdir(self.directory)
        self.createDirectories()
        self.createFontProperties()
        prefixes = self.renameFiles()
        self.createTrainingFiles(prefixes)
        self.extractUnicode()
        self.runShapeClustering()
        self.runMfTraining()
        self.runCnTraining()
        self.createTessData()

    def createDirectories(self):
        """Create CLEANED_DIR and EXP_DIR if they do not exist yet."""
        for path in (CLEANED_DIR, EXP_DIR):
            os.makedirs(path, exist_ok=True)

    def createFontProperties(self):
        """Write ``<fontName> 0 0 0 0 0`` to EXP_DIR/font_properties."""
        with open(os.path.join(EXP_DIR, 'font_properties'), 'w') as f:
            # The f prefix belongs outside the quotes; inside them the
            # literal text "f{self.fontName}" would be written instead.
            f.write(f'{self.fontName} 0 0 0 0 0')

    def cleanImages(self):
        """Threshold every .jpg/.jpeg/.png in ``images`` into CLEANED_DIR.

        Values below 250 become 0 and all others 255. Each result is saved
        as ``<original stem>.tiff``. Extensions are matched case-insensitively.
        """
        images_dir = 'images'
        print("CLEANING IMAGES...")
        for fileName in os.listdir(images_dir):
            root, ext = os.path.splitext(fileName)
            if ext.lower() in ('.jpg', '.jpeg', '.png'):
                with Image.open(os.path.join(images_dir, fileName)) as image:
                    # Set a threshold value for the image, and save
                    cleaned = image.point(lambda x: 0 if x < 250 else 255)
                    cleaned.save(os.path.join(CLEANED_DIR, f'{root}.tiff'))

    # Looks for box files, uses the box filename to find the corresponding
    # .tiff file. Copies each pair into EXP_DIR under the
    # "<language>.<font>.exp<N>" filename
    def renameFiles(self):
        """Copy each box/tiff pair into EXP_DIR and return the new prefixes.

        Box files are processed in sorted order so the ``exp<N>`` numbering
        is the same on every run. A pair that cannot be copied is reported
        and left out of the returned list.
        """
        # Local import: this cell's import list does not include shutil
        import shutil

        file_prefixes = []
        boxFiles = sorted(f for f in os.listdir(BOX_DIR) if f.endswith('.box'))
        for i, boxFile in enumerate(boxFiles):
            root, _ = os.path.splitext(boxFile)
            prefix = f'{self.languageName}.{self.fontName}.exp{i}'
            try:
                # Copy the tiff first: the box file is known to exist, so a
                # failure here means there is no image to train on.
                shutil.copyfile(os.path.join(CLEANED_DIR, f'{root}.tiff'),
                                os.path.join(EXP_DIR, f'{prefix}.tiff'))
                shutil.copyfile(os.path.join(BOX_DIR, f'{root}.box'),
                                os.path.join(EXP_DIR, f'{prefix}.box'))
            except OSError as error:
                print(f'Skipping {root}: {error}')
                continue
            file_prefixes.append(prefix)

        return file_prefixes

    # Creates a training file for a single tiff/box pair
    def createTrainingFiles(self, prefixes):
        """Run ``tesseract ... box.train`` in EXP_DIR for every prefix.

        The tool's stderr is printed. If it contains "Empty page!!" the
        command is run again with ``-psm 7``.
        """
        print("CREATING TRAINING DATA...")
        for prefix in prefixes:
            # cwd= runs the tool inside EXP_DIR without changing this
            # process's working directory, so an exception cannot strand it.
            result = subprocess.run(
                ["tesseract", prefix + ".tiff", prefix, "nobatch", "box.train"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                cwd=EXP_DIR)
            returnValue = result.stderr.decode("utf-8", errors="replace")
            print(prefix)
            print(returnValue)
            if "Empty page!!" in returnValue:
                subprocess.call([
                    "tesseract", "-psm", "7", prefix + ".tiff", prefix,
                    "nobatch", "box.train"
                ],
                                cwd=EXP_DIR)

    def extractUnicode(self):
        """Run ``unicharset_extractor`` over every .box file in EXP_DIR."""
        print("EXTRACTING UNICODE...")
        extractCommand = ['unicharset_extractor'] + sorted(
            f for f in os.listdir(EXP_DIR) if f.endswith('.box'))
        self._runInExpDir(extractCommand)

    def runShapeClustering(self):
        """Run ``shapeclustering`` over the .tr files in EXP_DIR."""
        print("RUNNING SHAPE CLUSTERING...")
        # shapeclustering -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        shapeCommand = [
            'shapeclustering', '-F', 'font_properties', '-U', 'unicharset'
        ]
        self._runInExpDir(shapeCommand + self.getTrainingFileList())

    def runMfTraining(self):
        """Run ``mftraining`` over the .tr files in EXP_DIR."""
        # mftraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        print("RUNNING MF CLUSTERING...")
        mfCommand = ['mftraining', '-F', 'font_properties', '-U', 'unicharset']
        self._runInExpDir(mfCommand + self.getTrainingFileList())

    def runCnTraining(self):
        """Run ``cntraining`` over the .tr files in EXP_DIR."""
        # cntraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        print('RUNNING CN TRAINING...')
        cnCommand = ['cntraining', '-F', 'font_properties', '-U', 'unicharset']
        self._runInExpDir(cnCommand + self.getTrainingFileList())

    def createTessData(self):
        """Prefix the generated files with the language, then combine them.

        Renames unicharset, shapetable, inttemp, normproto and pffmtable in
        EXP_DIR to ``<language>.<name>`` and runs
        ``combine_tessdata <language>.`` there.
        """
        print("CREATING TESS DATA...")
        # Rename all files and run combine_tessdata <language>.
        for name in ('unicharset', 'shapetable', 'inttemp', 'normproto',
                     'pffmtable'):
            os.rename(os.path.join(EXP_DIR, name),
                      os.path.join(EXP_DIR, f'{self.languageName}.{name}'))

        # mv captcha.traineddata $TESSDATA_PREFIX/captcha.traineddata
        self._runInExpDir(['combine_tessdata', self.languageName + '.'])

    # Retrieve a list of created training files
    def getTrainingFileList(self):
        """Return the sorted names of the .tr files in EXP_DIR."""
        return sorted(f for f in os.listdir(EXP_DIR) if f.endswith('.tr'))

    # Runs one external tool inside EXP_DIR and waits for it to finish
    def _runInExpDir(self, command):
        """Run ``command`` with EXP_DIR as its cwd; return its exit code."""
        return subprocess.call(command, cwd=EXP_DIR)


# Train the 'captcha' language on the 'captchaFont' font, using the
# trainer's default working directory
trainer = TesseractTrainer(languageName='captcha', fontName='captchaFont')
trainer.runAll()

### Reading CAPTCHAs and submitting solutions


In [None]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps


def cleanImage(imagePath: str):
    """Threshold the image at imagePath, pad it, and overwrite the file.

    Values below 143 become 0 and all others 255. A 20-pixel white border is
    then added on every side and the result is saved back to imagePath.
    """

    def toBlackOrWhite(value):
        return 0 if value < 143 else 255

    thresholded = Image.open(imagePath).point(toBlackOrWhite)
    ImageOps.expand(thresholded, border=20, fill='white').save(imagePath)


html = urlopen('http://www.pythonscraping.com/humans-only')
bs = BeautifulSoup(html, 'html.parser')

# Collects previously filled form values - page 243
imageLocation = bs.find('img', {'title': 'Image CAPTCHA'})['src']
formBuildId = bs.find('input', {'name': 'form_build_id'})['value']
captchaSid = bs.find('input', {'name': 'captcha_sid'})['value']
captchaToken = bs.find('input', {'name': 'captcha_token'})['value']

# Download the CAPTCHA image, clean it up, and OCR it with the tesseract
# command, which is given 'captcha' as its output base
captchaUrl = 'http://pythonscraping.com' + imageLocation
urlretrieve(captchaUrl, 'captcha.jpg')
cleanImage('captcha.jpg')
p = subprocess.Popen(['tesseract', 'captcha.jpg', 'captcha'],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE)
# communicate() drains both pipes while waiting; wait() with PIPE can
# deadlock once the child fills a pipe buffer
p.communicate()

# Clear any whitespace characters. split() with no argument drops every kind
# of whitespace (tabs, carriage returns, form feeds), not just ' ' and '\n'.
with open('captcha.txt', 'r') as f:
    captchaResponse = ''.join(f.read().split())
print('Captcha solution attempt: ' + captchaResponse)

# Stays None unless the form is submitted and the reply has a messages div
messages = None
if len(captchaResponse) == 5:
    params = {
        'captcha_token': captchaToken,
        'captcha_sid': captchaSid,
        'form_id': 'comment_node_page_form',
        'form_build_id': formBuildId,
        'captcha_response': captchaResponse,
        'name': 'Ryan Mitchell',
        'subject': 'I come to seek the Grail',
        'comment_body[und][0][value]': '...and I am definitely not a bot'
    }

    # Only post when params exists; posting unconditionally raised a
    # NameError whenever the guess was not 5 characters long
    r = requests.post('http://www.pythonscraping.com/comment/reply/10',
                      data=params)

    responseObj = BeautifulSoup(r.text, 'html.parser')
    messages = responseObj.find('div', {'class': 'messages'})

if messages is not None:
    print(messages.get_text())
else:
    print('There was a problem reading the CAPTCHA correctly!')