In [7]:
from collections import OrderedDict

class BookIndexer:
    """Build a word -> page-number index from a list of page text files.

    Each entry of ``page_files`` is treated as one page; pages are numbered
    from 1 in the order given. Words are lower-cased and stripped of
    non-alphabetic characters before indexing, and words listed in
    ``exclude_file`` are left out. The result is written to ``index_file``
    as ``word : p1,p2,...`` lines below a two-line header.
    """

    def __init__(self, page_files, exclude_file, index_file):
        """Load the exclusion list and start ``index_file`` with the header.

        Args:
            page_files: Iterable of page file paths, in page order.
            exclude_file: Path of a whitespace-separated list of words to
                leave out of the index.
            index_file: Path of the index file to write. It is truncated
                and given a fresh header as soon as the indexer is built.
        """
        self.page_files = page_files
        self.exclude_file = exclude_file
        self.index_file = index_file
        self.index = {}
        self.excludes = self.read_excludes()
        self.add_header(index_file)

    def add_header(self, index_file):
        """Truncate ``index_file`` and write the two-line header to it."""
        with open(index_file, "w") as f:
            f.write("Word : Page Numbers\n-------------------\n")

    def read_excludes(self):
        """Return the set of excluded words read from ``self.exclude_file``.

        Every entry is stored both as written and lower-cased. Indexed words
        are lower-cased before the exclusion check, so without the
        lower-cased form an entry such as ``The`` would never match.
        """
        with open(self.exclude_file) as f:
            entries = f.read().split()
        excludes = set(entries)
        excludes.update(entry.lower() for entry in entries)
        return excludes

    @staticmethod
    def _page_encoding():
        """Return the encoding used to read page files.

        ``mbcs`` (the Windows ANSI code page) is kept where it exists. The
        codec is only registered on Windows, so other platforms fall back to
        UTF-8 instead of failing with ``LookupError``.
        """
        import codecs

        try:
            codecs.lookup("mbcs")
        except LookupError:
            return "utf-8"
        return "mbcs"

    def read_page(self, page_file):
        """Return the distinct raw tokens of ``page_file`` not excluded.

        Tokens are whitespace-separated and returned unnormalised (original
        case and punctuation); ``get_filtered_word`` normalises them later.
        """
        with open(page_file, encoding=self._page_encoding()) as f:
            words = f.read().split()

        return {word for word in words if word not in self.excludes}

    def index_pages(self):
        """Index every page and write the complete index to the index file.

        ``self.index`` is updated in place, mapping each word to the set of
        page numbers it occurs on. The file is rewritten from the header
        down, so calling this method again does not duplicate entries.
        """
        for page_num, page_file in enumerate(self.page_files, start=1):
            for raw_word in self.read_page(page_file):
                word = self.get_filtered_word(raw_word)
                if word:
                    self.index.setdefault(word, set()).add(page_num)

        # Start from a fresh header so repeated calls leave exactly one copy
        # of each entry in the file.
        self.add_header(self.index_file)
        with open(self.index_file, 'a') as f:
            for word in sorted(self.index):
                pages = ','.join(map(str, sorted(self.index[word])))
                f.write(f"{word} : {pages}\n")

    def get_filtered_word(self, word):
        """Return the normalised, indexable form of ``word``, or ``""``.

        The word is lower-cased and reduced to its alphabetic characters.
        An empty string is returned when nothing alphabetic remains, when
        the result is an excluded word, or when the letters were not
        contiguous in the token (e.g. ``don't``, ``e.g.``).
        """
        word = word.lower()
        letters = "".join(ch for ch in word if ch.isalpha())
        if not letters or letters in self.excludes:
            return ""
        # Surrounding punctuation ("fox,", "(zebra)") is stripped, but tokens
        # with non-letters between the letters are rejected.
        if letters not in word:
            return ""
        return letters
    
# Pages to index, in reading order: a page's number is its 1-based position
# in this list.
page_files = ['Page1.txt', 'Page2.txt', 'Page3.txt']
# Word list to leave out of the index, and the file the index is written to.
exclude_file, index_file = 'exclude-words.txt', 'index.txt'

# Constructing the indexer reads the exclusion list and writes the header to
# the index file; index_pages() then scans the pages and writes the entries.
book_indexer = BookIndexer(
    page_files=page_files,
    exclude_file=exclude_file,
    index_file=index_file,
)
book_indexer.index_pages()