In [34]:
from file_processor import FileProcessor
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import docx
import string
import re

In [37]:
class DocumentProcessor(FileProcessor):
    """
    FileProcessor specialization that turns a text or Word document
    into a sorted list of its distinct, non-stop-word tokens.
    """

    def process_file(self, file):
        """
        file: string (path / name of the file to process)

        returns sorted list of the unique non-stop words found in the file
        """
        content = self.get_content(file)

        return self.get_word_array(content)


    def get_word_array(self, content):
        """
        content: string (content of the file)

        returns words array (unique, lower-cased, stop words removed, sorted)
        """
        # get stop words as a set
        en_stops = set(stopwords.words('english'))

        # remove ASCII punctuation from content
        # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
        # NOTE(review): punctuation is deleted, not replaced by a space, so
        # "well-known" becomes "wellknown" -- confirm this is intended.
        content = content.translate(str.maketrans('', '', string.punctuation))
        # string.punctuation is ASCII only; strip the common typographic
        # dashes and quotes separately
        content = re.sub(r'(–|—|’|“|”)', '', content)

        # lower case
        content = content.lower()

        # tokenize content, de-duplicate, and subtract the stop words
        words_set = set(word_tokenize(content)) - en_stops

        # sorted() accepts any iterable and already returns a list
        return sorted(words_set)


    def get_content(self, file):
        """
        file: string (path / name of the file)

        Picks a reader from the file extension (case-insensitive).

        returns file content as a string ('' for unsupported file types)
        """
        # Match on the real suffix: a substring test such as '.doc' in file
        # would send a name like 'my.docs.txt' to the Word reader.
        name = file.lower()

        if name.endswith(('.doc', '.docx')):
            return self.read_doc(file)

        elif name.endswith('.txt'):
            return self.read_txt(file)

        # ... other file formats
        else:
            print(file, ' unknown file') # should not happen for this assignment
            return '' # return string so the application will not fail


    def read_doc(self, file):
        """
        reads doc/docx files

        return file content as a string (paragraphs joined by a single space);
        returns '' if the file cannot be opened
        """
        # https://www.pythonprogramming.in/how-to-read-data-from-docx-file-in-python.html
        try:
            doc = docx.Document(file)  # Creating word reader object.

            # collect all paragraph texts first, then join exactly once
            return ' '.join(para.text for para in doc.paragraphs)

        except IOError:
            print('There was an error opening the file!')
            return ""


    def read_txt(self, file):
        """
        reads txt files

        returns file content as a string
        """
        # read() returns the text as-is; readlines() + '\n'.join() would
        # double every line break because each line keeps its own '\n'
        with open(file) as f:
            return f.read()

In [38]:
# Build the processor, then kick off the run; start() is not defined on
# DocumentProcessor itself, so it comes from the FileProcessor base class.
processor = DocumentProcessor()
processor.start()

Processing:  week_10_document1.docx
Processing:  random_text.txt
Processing:  how_rubber_goods_are_made.txt
Processing:  52256-0.txt
Processing:  pg43994.txt
Processing:  most_boring_part2.txt
Processing:  blind_text.txt
Processing:  pg14895.txt
Processing:  53031-0.txt
Processing:  58108-0.txt
Processing:  pg12814.txt
Processing:  smiley_the_bunny.txt
Processing:  most_boring_ever.txt
Processing:  dr_yawn.txt
Processing:  week_10_document2.docx
Saving File
Process Finished
