## Read the pdf file using PyPDFLoader and apply Underwoods list of ocr corrections

In [8]:
# Save start time

print ('Import libraries')
# import libraries
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain.document_loaders import PyPDFLoader
import pandas as pd
import re
import os
import csv
import json
from tqdm import tqdm  # Import tqdm for progress tracking
import time

print ('Start time')
start_time = time.time()

####### PDF loader #############
# load pdf
# Move to pdf folder
os.chdir(r'.\literature pdf')

# load text from pdf
pdf_file = '1903 Thirty seasons in Scandinavia.pdf'
book_title = pdf_file[:-4]
print (f'Start processing "{book_title}".\nWait a few seconds.')
loader = PyPDFLoader(pdf_file)
# Load PDF pages
pdf_pages = list(tqdm(loader.load(), desc="Loading PDF pages", colour='yellow'))

# Add one to get a reference back to the right page in pdf file
def get_pdf_page_no(pageNo):
    return pageNo + 1

# Extract the name of the source
def get_page_source(pageSource):
    return pageSource.replace('.pdf', '').split('\\')[-1]

print ('Preprocess text')
# preprocess text
def preproces_text(text_string):
    # Store text data in variable
    # Remove newlines, replace apostrophe s with s, remove quotation marks
    text_string_replace = text_string.replace(r' \xad\n', '').replace(r'\xad\n', '') \
        .replace(r'\xad', '').replace(r'\xad ', '') \
        .replace(r'-\n', '').replace('—', ' ').replace(r'\n', ' ') \
        .replace("'s", "s").replace('"', ' ').replace("'", " ") \
        .replace('.', ' . ').replace(' .', ' . ').replace(',', ' ,') \
        .replace('!', ' !').replace('?', ' ?')

    # scrub for multiple signs
    # but keep full stops, commas, exclamation, and question marks
    text_string_re = re.sub(r':|;|\(|\)|\||\+|\"|‘|’|“|”|\’|…|\-|–|—|\$|&|\*|>|<|\/|\[|\]|»|«', '', text_string_replace)

    # Remove numbers and words containing numbers
    text_string_no_numbers = re.sub(r'\b\w*\d\w*\b', '', text_string_re)
    # Remove multiple whitespaces
    text_string_sub_additional_white_space = re.sub(r'\s+', ' ', text_string_no_numbers)

    return text_string_sub_additional_white_space

# Extract page info
def extract_page_data(pdf_pages):
    # prepare empty lists
    content_list = []
    semi_clean_text_list = []
    page_no_list = []
    source_list = []

    for i in tqdm(pdf_pages, desc="Extracting and cleaning data", colour='yellow'):
        # take page number
        page = i

        # return the text content, preproces text, and metadata (source and page no.)
        pageContent = page.page_content
        semi_clean_text = preproces_text(pageContent)
        pageNo = page.metadata['page']
        pageSource = page.metadata['source']

        # append to lists
        content_list.append(pageContent)
        semi_clean_text_list.append(semi_clean_text)
        page_no_list.append(get_pdf_page_no(pageNo))
        source_list.append(get_page_source(pageSource))

    # output data
    return content_list, semi_clean_text_list, page_no_list, source_list

# return lists of content, page numbers, and source
C, SC, P, S = extract_page_data(pdf_pages)

# send lists to a dataframe
df = pd.DataFrame({'content': C, 'source': S, 'element_no': P, 'preprocessed_content': SC})


#######################

# go back to main folder
os.chdir('..')


print('Starting ocr correction - loading correction rules')
# Read the correction rules from the file
with open('Underwoods_CorrectionRules.txt', 'r', encoding='utf-8') as f:
    CorrectionRules_string = f.read()

# Process the correction rules
CorrectionRules_list = CorrectionRules_string.split('\n')
list_of_CorrectionRules_list = [i.split('\t') for i in CorrectionRules_list]
better_list_of_CorrectionRules = [i[0:2] for i in list_of_CorrectionRules_list]

# Add additional pairs
observations_to_add = [["Bruffels", 'Brussels'], ["fix", 'six'], ['Elsinore', 'Helsingør']]
better_list_of_CorrectionRules.extend(observations_to_add)

# Build correction dictionary
correction_dict = dict(better_list_of_CorrectionRules)



print('Ocr correction')

def ocr_post_correction(text):
    # Prepare for correction
    ### Keep track of the conditions of the words (capital, title, upper,lower)
    tokens = text.split()

    # Track the original case of each word
    original_case = [i for i in tokens]
    clean_text_list = [i.lower() for i in tokens]

    # Initialize the list to store corrected words and a dictionary to track corrections
    new_word_list = []
    corrections_tracker = {}

    # Iterate through the clean_text_list word list
    for index, word in enumerate(clean_text_list):
        # Check if the word has a correction in the dictionary
        if word in correction_dict:
            # If it does, append the corrected word to the new list and track the correction
            corrected_word = correction_dict[word]
            # Restore the original case of the word
            if original_case[index].istitle():
                corrected_word = corrected_word.capitalize()
            elif original_case[index].isupper():
                corrected_word = corrected_word.upper()
            new_word_list.append(corrected_word)
            corrections_tracker[word] = corrected_word

            # Extract context around the corrected word
            start_index = max(0, index - 10)
            end_index = min(len(clean_text_list), index + 11)
            context = clean_text_list[start_index:end_index]

            # Store the context with the correction
            corrections_tracker[word] = {
                'corrected_word': corrected_word,
                'context': ' '.join(context)
            }
        else:
            # If it doesn't, append the original word
            new_word_list.append(original_case[index])

    # Send data back to text
    new_corrected_text = ' '.join(new_word_list)

    return corrections_tracker, new_corrected_text

# Use the function and make two new columns to add to the dataframe
df['corrections_tracker'], df['ocr_corrected_text'] = zip(*tqdm(df['preprocessed_content'].apply(ocr_post_correction), desc="Applying OCR post-correction", colour='yellow'))

print ('Saving data')
# Save the df as a csv file
os.chdir(r'.\literature csv')
df.to_csv(f'{book_title}.csv', index=False)


# Save txt versions
os.chdir('..')
os.chdir(r'.\literature txt')

# Save text version without spell correction
text = ' '.join(df['preprocessed_content'].to_list())
with open(f'{book_title} without ocr correction.txt', 'w', encoding='utf-8') as f:
    f.write(text)

# Save text version with spell correction
# Join the corrected words back into a string
ocr_corrected_text = ' '.join(df['ocr_corrected_text'].to_list())
with open(f'{book_title} with ocr correction.txt', 'w', encoding='utf-8') as f:
    f.write(ocr_corrected_text)


########
# Save ocr corrections
print ('Save ocr corrections')
os.chdir('..')
os.chdir(r'.\ocr_correction_tracker')
obs_corrections_tracker = []
for i,j in enumerate(df['corrections_tracker'].to_list()):
    if j:
        observation = f"Index no.: {str(i)}. OCR observation: {str(j)}"
        observation = observation.replace('{', '').replace('}', '')
        obs_corrections_tracker.append(observation)

ocr_observations = '\n\n'.join(obs_corrections_tracker)

with open(f'{book_title} ocr corrections.txt', 'w', encoding='utf-8') as f:
    f.write(ocr_observations)

##############



# change to main folder
os.chdir('..')

print ('Script done.')
# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Print the elapsed time
print(f"Time taken to run the script: {elapsed_time:.6f} seconds")

Import libraries
Start time
Start processing "1903 Thirty seasons in Scandinavia".
Wait a few seconds.


Loading PDF pages: 100%|[33m██████████[0m| 337/337 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 337/337 [00:00<00:00, 8227.62it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 337/337 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 6.405390 seconds





In [9]:
df

Unnamed: 0,content,source,element_no,preprocessed_content,corrections_tracker,ocr_corrected_text
0,Digitaliseret af | Digitised by\nForfatter(e) ...,1903 Thirty seasons in Scandinavia,1,Digitaliseret af Digitised by Forfatter e Auth...,{},Digitaliseret af Digitised by Forfatter e Auth...
1,"ThirtÖmws\n■\n,N\n^(ANDINAVH/ w\nytJ,■",1903 Thirty seasons in Scandinavia,2,"ThirtÖmws ■ ,N ^ ANDINAVH w ytJ ,■",{},"ThirtÖmws ■ ,N ^ ANDINAVH w ytJ ,■"
2,Det Kgl. Bibliotek\n130025471602,1903 Thirty seasons in Scandinavia,3,Det Kgl . Bibliotek 130025471602,{},Det Kgl . Bibliotek 130025471602
3,,1903 Thirty seasons in Scandinavia,4,,{},
4,,1903 Thirty seasons in Scandinavia,5,,{},
...,...,...,...,...,...,...
332,,1903 Thirty seasons in Scandinavia,333,,{},
333,,1903 Thirty seasons in Scandinavia,334,,{},
334,,1903 Thirty seasons in Scandinavia,335,,{},
335,,1903 Thirty seasons in Scandinavia,336,,{},


In [4]:
print ('Import libraries')
# import libraries
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain.document_loaders import PyPDFLoader
import pandas as pd
import re
import os
import csv
import json
from tqdm import tqdm  # Import tqdm for progress tracking
import time

Import libraries


In [5]:
path_to_folder = r'C:\Users\lakj\Documents\GitHub\nordic travel literature\literature pdf'
no_of_files = len(os.listdir(path_to_folder))
files = os.listdir(path_to_folder)

In [6]:

def input_is_folder(pdf_file):

    
    print ('Start time')
    start_time = time.time()
    
    ####### PDF loader #############
    # load pdf
    # Move to pdf folder
    os.chdir(r'.\literature pdf')
    
    # load text from pdf
    #pdf_file = 'Scenes of travel in Norway J C Phythian.pdf'
    book_title = pdf_file[:-4]
    print (f'Start processing "{book_title}".\nWait a few seconds.')
    loader = PyPDFLoader(pdf_file)
    # Load PDF pages
    pdf_pages = list(tqdm(loader.load(), desc="Loading PDF pages", colour='yellow'))
    
    # Add one to get a reference back to the right page in pdf file
    def get_pdf_page_no(pageNo):
        return pageNo + 1
    
    # Extract the name of the source
    def get_page_source(pageSource):
        return pageSource.replace('.pdf', '').split('\\')[-1]
    
    print ('Preprocess text')
    # preprocess text
    def preproces_text(text_string):
        # Store text data in variable
        # Remove newlines, replace apostrophe s with s, remove quotation marks
        text_string_replace = text_string.replace(r' \xad\n', '').replace(r'\xad\n', '') \
            .replace(r'\xad', '').replace(r'\xad ', '') \
            .replace(r'-\n', '').replace('—', ' ').replace(r'\n', ' ') \
            .replace("'s", "s").replace('"', ' ').replace("'", " ") \
            .replace('.', ' . ').replace(' .', ' . ').replace(',', ' ,') \
            .replace('!', ' !').replace('?', ' ?')
    
        # scrub for multiple signs
        # but keep full stops, commas, exclamation, and question marks
        text_string_re = re.sub(r':|;|\(|\)|\||\+|\"|‘|’|“|”|\’|…|\-|–|—|\$|&|\*|>|<|\/|\[|\]|»|«', '', text_string_replace)
    
        # Remove numbers and words containing numbers
        text_string_no_numbers = re.sub(r'\b\w*\d\w*\b', '', text_string_re)
        # Remove multiple whitespaces
        text_string_sub_additional_white_space = re.sub(r'\s+', ' ', text_string_no_numbers)
    
        return text_string_sub_additional_white_space
    
    # Extract page info
    def extract_page_data(pdf_pages):
        # prepare empty lists
        content_list = []
        semi_clean_text_list = []
        page_no_list = []
        source_list = []
    
        for i in tqdm(pdf_pages, desc="Extracting and cleaning data", colour='yellow'):
            # take page number
            page = i
    
            # return the text content, preproces text, and metadata (source and page no.)
            pageContent = page.page_content
            semi_clean_text = preproces_text(pageContent)
            pageNo = page.metadata['page']
            pageSource = page.metadata['source']
    
            # append to lists
            content_list.append(pageContent)
            semi_clean_text_list.append(semi_clean_text)
            page_no_list.append(get_pdf_page_no(pageNo))
            source_list.append(get_page_source(pageSource))
    
        # output data
        return content_list, semi_clean_text_list, page_no_list, source_list
    
    # return lists of content, page numbers, and source
    C, SC, P, S = extract_page_data(pdf_pages)
    
    # send lists to a dataframe
    df = pd.DataFrame({'content': C, 'source': S, 'element_no': P, 'preprocessed_content': SC})
    
    
    #######################
    
    # go back to main folder
    os.chdir('..')
    
    
    print('Starting ocr correction - loading correction rules')
    # Read the correction rules from the file
    with open('Underwoods_CorrectionRules.txt', 'r', encoding='utf-8') as f:
        CorrectionRules_string = f.read()
    
    # Process the correction rules
    CorrectionRules_list = CorrectionRules_string.split('\n')
    list_of_CorrectionRules_list = [i.split('\t') for i in CorrectionRules_list]
    better_list_of_CorrectionRules = [i[0:2] for i in list_of_CorrectionRules_list]
    
    # Add additional pairs
    observations_to_add = [["Bruffels", 'Brussels'], ["fix", 'six'], ['Elsinore', 'Helsingør']]
    better_list_of_CorrectionRules.extend(observations_to_add)
    
    # Build correction dictionary
    correction_dict = dict(better_list_of_CorrectionRules)
    
    
    
    print('Ocr correction')
    
    def ocr_post_correction(text):
        # Prepare for correction
        ### Keep track of the conditions of the words (capital, title, upper,lower)
        tokens = text.split()
    
        # Track the original case of each word
        original_case = [i for i in tokens]
        clean_text_list = [i.lower() for i in tokens]
    
        # Initialize the list to store corrected words and a dictionary to track corrections
        new_word_list = []
        corrections_tracker = {}
    
        # Iterate through the clean_text_list word list
        for index, word in enumerate(clean_text_list):
            # Check if the word has a correction in the dictionary
            if word in correction_dict:
                # If it does, append the corrected word to the new list and track the correction
                corrected_word = correction_dict[word]
                # Restore the original case of the word
                if original_case[index].istitle():
                    corrected_word = corrected_word.capitalize()
                elif original_case[index].isupper():
                    corrected_word = corrected_word.upper()
                new_word_list.append(corrected_word)
                corrections_tracker[word] = corrected_word
    
                # Extract context around the corrected word
                start_index = max(0, index - 10)
                end_index = min(len(clean_text_list), index + 11)
                context = clean_text_list[start_index:end_index]
    
                # Store the context with the correction
                corrections_tracker[word] = {
                    'corrected_word': corrected_word,
                    'context': ' '.join(context)
                }
            else:
                # If it doesn't, append the original word
                new_word_list.append(original_case[index])
    
        # Send data back to text
        new_corrected_text = ' '.join(new_word_list)
    
        return corrections_tracker, new_corrected_text
    
    # Use the function and make two new columns to add to the dataframe
    df['corrections_tracker'], df['ocr_corrected_text'] = zip(*tqdm(df['preprocessed_content'].apply(ocr_post_correction), desc="Applying OCR post-correction", colour='yellow'))
    
    print ('Saving data')
    # Save the df as a csv file
    os.chdir(r'.\literature csv')
    df.to_csv(f'{book_title}.csv', index=False)
    
    
    # Save txt versions
    os.chdir('..')
    os.chdir(r'.\literature txt')
    
    # Save text version without spell correction
    text = ' '.join(df['preprocessed_content'].to_list())
    with open(f'{book_title} without ocr correction.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    
    # Save text version with spell correction
    # Join the corrected words back into a string
    ocr_corrected_text = ' '.join(df['ocr_corrected_text'].to_list())
    with open(f'{book_title} with ocr correction.txt', 'w', encoding='utf-8') as f:
        f.write(ocr_corrected_text)
    
    
    ########
    # Save ocr corrections
    print ('Save ocr corrections')
    os.chdir('..')
    os.chdir(r'.\ocr_correction_tracker')
    obs_corrections_tracker = []
    for i,j in enumerate(df['corrections_tracker'].to_list()):
        if j:
            observation = f"Index no.: {str(i)}. OCR observation: {str(j)}"
            observation = observation.replace('{', '').replace('}', '')
            obs_corrections_tracker.append(observation)
    
    ocr_observations = '\n\n'.join(obs_corrections_tracker)
    
    with open(f'{book_title} ocr corrections.txt', 'w', encoding='utf-8') as f:
        f.write(ocr_observations)
    
    ##############
    
    
    
    # change to main folder
    os.chdir('..')
    
    print ('Script done.')
    # Record the end time
    end_time = time.time()
    
    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    
    # Print the elapsed time
    print(f"Time taken to run the script: {elapsed_time:.6f} seconds")

In [7]:
[input_is_folder(pdf_file) for pdf_file in files]

Start time
Start processing "1772 Travels through Holland Flanders Germany Denmark Sweden Lapland Russia the Ukraine and Poland in the years 1768 1769 and 1770 Vol 1".
Wait a few seconds.


Loading PDF pages: 100%|[33m██████████[0m| 202/202 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 202/202 [00:00<00:00, 4191.32it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 202/202 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 2.426439 seconds
Start time
Start processing "1807 A Tour round the Baltic".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 466/466 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 466/466 [00:00<00:00, 11490.16it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 466/466 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 76.484557 seconds
Start time
Start processing "1811 Travels in Iceland".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 562/562 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 562/562 [00:00<00:00, 10666.49it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 562/562 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 9.571139 seconds
Start time
Start processing "1813 Travels through Norway and Lapland during the Years 1806 1807 and 1808".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 933/933 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 933/933 [00:00<00:00, 3681.20it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 933/933 [00:00<?, ?it/s]

Saving data





Save ocr corrections
Script done.
Time taken to run the script: 20.421648 seconds
Start time
Start processing "1834 Excursions in the north of Europe through parts of Russia Finland Sweden Denmark and Norway".
Wait a few seconds.


Loading PDF pages: 100%|[33m██████████[0m| 424/424 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 424/424 [00:00<00:00, 10530.40it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 424/424 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 10.702873 seconds
Start time
Start processing "1839 Handbook for travellers in Denmark Norway Sweden and Russia".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 316/316 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 316/316 [00:00<00:00, 3151.15it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 316/316 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 12.863294 seconds
Start time
Start processing "1841 Visit to Northern Europe vol 1".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 408/408 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 408/408 [00:00<00:00, 7701.89it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 408/408 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 71.081715 seconds
Start time
Start processing "1842 Travels in Iceland".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 97/97 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 97/97 [00:00<00:00, 1337.37it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 97/97 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 8.438377 seconds
Start time
Start processing "1856 The Danes and the Swedes".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 433/433 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 433/433 [00:00<00:00, 5331.64it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 433/433 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 8.866470 seconds
Start time
Start processing "1857 The book of the Baltic being the North of Europe Steam Companys route to Denmark Sweden and Russia Norway Prussia and the Hanseatic Ports".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 169/169 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 169/169 [00:00<00:00, 4472.44it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 169/169 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 4.279276 seconds
Start time
Start processing "1860 A Residence in Jutland the Danish Isles and Copenhagen vol I".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 429/429 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 429/429 [00:00<00:00, 6042.83it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 429/429 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 15.038571 seconds
Start time
Start processing "1860 A Residence in Jutland the Danish Isles and Copenhagen vol II".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 407/407 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 407/407 [00:00<00:00, 5567.02it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 407/407 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 15.001379 seconds
Start time
Start processing "1861 Seasons with the sea horses".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 306/306 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 306/306 [00:00<00:00, 15908.19it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 306/306 [00:00<?, ?it/s]


Saving data
Save ocr corrections
Script done.
Time taken to run the script: 4.725705 seconds
Start time
Start processing "1862 A tour in Northern Europe".
Wait a few seconds.


Loading PDF pages: 100%|[33m██████████[0m| 415/415 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 415/415 [00:00<00:00, 6013.72it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 415/415 [00:00<00:00, 26541.78it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 7.314599 seconds
Start time
Start processing "1871 A handbook for travellers in Denmark Norway and Sweden".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 572/572 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 572/572 [00:00<00:00, 6430.02it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 572/572 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 27.017711 seconds
Start time
Start processing "1872 The Rob Roy on the Baltic".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 302/302 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 302/302 [00:00<00:00, 14998.46it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 302/302 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 6.779924 seconds
Start time
Start processing "1875 An American in Iceland".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 349/349 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 349/349 [00:00<00:00, 7197.00it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 349/349 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 7.989728 seconds
Start time
Start processing "1877 Scenes of travel in Norway".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 271/271 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 271/271 [00:00<00:00, 8672.66it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 271/271 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 4.338245 seconds
Start time
Start processing "1879 A holiday in Iceland".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 123/123 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 123/123 [00:00<00:00, 6769.80it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 123/123 [00:00<?, ?it/s]


Saving data
Save ocr corrections
Script done.
Time taken to run the script: 2.446751 seconds
Start time
Start processing "1880 Five weeks in Iceland".
Wait a few seconds.


Loading PDF pages: 100%|[33m██████████[0m| 199/199 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 199/199 [00:00<00:00, 10969.75it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 199/199 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 3.404396 seconds
Start time
Start processing "1885 In the Northmans land".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 376/376 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 376/376 [00:00<00:00, 5448.33it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 376/376 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 8.555696 seconds
Start time
Start processing "1886 The midnight sun the tsar and the nihilist".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 388/388 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 388/388 [00:00<00:00, 9588.28it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 388/388 [00:00<?, ?it/s]
Skipping broken line b'143f   143f   10300': Odd-length string
Skipping broken line b'1440   1440   10301': Odd-length string
Skipping broken line b'1441   1441   10302': Odd-length string
Skipping broken line b'1442   1442   10303': Odd-length string
Skipping broken line b'1443   1443   10304': Odd-length string
Skipping broken line b'1444   1444   10305': Odd-length string
Skipping broken line b'1445   1445   10306': Odd-length string
Skipping broken line b'1446   1446   10307': Odd-length string
Skipping broken line b'1447   1447   10308': Odd-length string
Skipping broken line b'1448   1448   10309': Odd-length string
Skipping broken line b'1449   1449   1030a': Odd-length string
Skipping broken line b'144a   144a   1030b': Odd-length string
Skipping broken line b'144b   144b   1030c': Odd-length string
Skipping broken line b'144c   144c   1030d': Odd-length string
Skipping broken line b'144d   144d   

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 6.717661 seconds
Start time
Start processing "1892 The boy travellers in nothern Europe".
Wait a few seconds.


Skipping broken line b'1483   1483   1d321': Odd-length string
Skipping broken line b'1484   1484   1d322': Odd-length string
Skipping broken line b'1485   1485   1d323': Odd-length string
Skipping broken line b'1486   1486   1d324': Odd-length string
Skipping broken line b'1487   1487   1d325': Odd-length string
Skipping broken line b'1488   1488   1d326': Odd-length string
Skipping broken line b'1489   1489   1d327': Odd-length string
Skipping broken line b'148a   148a   1d328': Odd-length string
Skipping broken line b'148b   148b   1d329': Odd-length string
Skipping broken line b'148c   148c   1d32a': Odd-length string
Skipping broken line b'148d   148d   1d32b': Odd-length string
Skipping broken line b'148e   148e   1d32c': Odd-length string
Skipping broken line b'148f   148f   1d32d': Odd-length string
Skipping broken line b'1490   1490   1d32e': Odd-length string
Skipping broken line b'1491   1491   1d32f': Odd-length string
Skipping broken line b'1492   1492   1d330': Odd-length

Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 1/1 [00:00<?, ?it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 1/1 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 0.753872 seconds
Start time
Start processing "1893 Sweden and the Swedes".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 766/766 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 766/766 [00:00<00:00, 5712.91it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 766/766 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 124.419775 seconds
Start time
Start processing "1895 Angling travels in Norway".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 320/320 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 320/320 [00:00<00:00, 15807.80it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 320/320 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 6.467244 seconds
Start time
Start processing "1897 Wild Norway".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 434/434 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 434/434 [00:00<00:00, 5344.51it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 434/434 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 10.146729 seconds
Start time
Start processing "1898 Through Finland in Carts".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 432/432 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 432/432 [00:00<00:00, 5916.98it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 432/432 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 77.553542 seconds
Start time
Start processing "1903 Thirty seasons in Scandinavia".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 337/337 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 337/337 [00:00<00:00, 5391.12it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 337/337 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 6.880169 seconds
Start time
Start processing "1909 Peeps at many Lands Finland".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 107/107 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 107/107 [00:00<00:00, 13253.91it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 107/107 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 1.513091 seconds
Start time
Start processing "1911 Two visits to Denmark 1872 1874".
Wait a few seconds.



Loading PDF pages: 100%|[33m██████████[0m| 401/401 [00:00<?, ?it/s]


Preprocess text


Extracting and cleaning data: 100%|[33m██████████[0m| 401/401 [00:00<00:00, 6599.32it/s]


Starting ocr correction - loading correction rules
Ocr correction


Applying OCR post-correction: 100%|[33m██████████[0m| 401/401 [00:00<?, ?it/s]

Saving data
Save ocr corrections
Script done.
Time taken to run the script: 8.652174 seconds





[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]