In [3]:
import PyPDF2
from tika import parser
from lxml import etree
import spacy
from spacy.lang.en import English
import bs4
import os
import pickle
import codecs

In [4]:
from nltk.corpus.reader.plaintext import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

# Load the small English spaCy pipeline once at import time; the module-level
# ``nlp`` object is what the preprocessor calls to tokenize and tag text.
nlp = spacy.load("en_core_web_sm")

# Default regex for selecting raw documents in a corpus: a directory name that
# does not start with a dot, a slash, then a file name ending in ``.txt``.
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
# Default category regex handed to the categorized reader as ``cat_pattern``.
# Its single group captures the leading run of word characters, whitespace and
# hyphens of a fileid (presumably the directory name -- confirm against NLTK).
CAT_PATTERN = r'([\w\s\d-]+).*'

In [5]:
class TextCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Categorized corpus reader over raw text files.

    Documents are selected by a fileid pattern and can be addressed either
    by explicit fileids or by category.
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8', **kwargs):
        """
        Set up both NLTK reader bases.

        ``root``, ``fileids`` and ``encoding`` go to ``CorpusReader``; the
        keyword arguments (as a dict) go to ``CategorizedCorpusReader``.
        When no ``cat_*`` keyword is supplied, ``cat_pattern`` defaults to
        ``CAT_PATTERN``.
        """
        has_category_option = any(name.startswith('cat_') for name in kwargs)
        if not has_category_option:
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader expects the kwargs dict itself, not **kwargs.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

    def resolve(self, fileids, categories=None):
        """
        Turn a fileids/categories selection into fileids.

        Returns ``fileids`` unchanged when no categories are given, otherwise
        the fileids belonging to ``categories``. Raises ``ValueError`` when
        both selectors are supplied.
        """
        if categories is None:
            return fileids

        if fileids is not None:
            raise ValueError("Specify fileids or categories, not both")

        return self.fileids(categories)

    def docs(self, fileids=None, categories=None):
        """
        Lazily yield the full text of each selected document.

        Only one document is read per iteration step, and each file is
        opened with the encoding the reader associates with it.
        """
        selected = self.resolve(fileids, categories)
        located = self.abspaths(selected, include_encoding=True)

        for doc_path, doc_encoding in located:
            with codecs.open(doc_path, mode='r', encoding=doc_encoding) as handle:
                yield handle.read()

    def sizes(self, fileids=None, categories=None):
        """
        Lazily yield the on-disk size in bytes of each selected file.

        Only the sizes are yielded (no fileids); useful for spotting
        unusually large files in the corpus.
        """
        selected = self.resolve(fileids, categories)

        # map() keeps the evaluation lazy, one path at a time.
        yield from map(os.path.getsize, self.abspaths(selected))


In [6]:
import os

class Preprocessor(object):
    """
    Wrap a ``TextCorpusReader`` and convert each source document into a
    pickled structure of tokenized, part-of-speech tagged sentences.

    Attributes:
        root: directory that fileids are joined to when reading source files.
        corpus: the wrapped corpus reader (must provide ``resolve``,
            ``fileids``, ``abspath`` and ``root``).
        target: directory under which the pickles are written.
    """
    def __init__(self, root, corpus, target=None, **kwargs):
        """
        Store the source root, the corpus reader and the output directory.

        Extra keyword arguments are accepted and ignored.
        """
        # Keep the root on the instance: ``process`` needs it to locate the
        # source files and must not depend on a module-level ``root`` global.
        self.root = root
        self.corpus = corpus
        self.target = target

    def fileids(self, fileids=None):
        """
        Return the given fileids as resolved by the corpus, or every fileid
        in the corpus when none (or an empty selection) is given.
        """
        fileids = self.corpus.resolve(fileids)
        if fileids:
            return fileids
        return self.corpus.fileids()

    def abspath(self, fileid):
        """
        Return the output path for ``fileid``: the same directory layout as
        in the corpus, re-rooted at ``self.target``, with the file extension
        replaced by ``.pickle``.
        """
        # Find the directory, relative to the corpus root.
        parent = os.path.relpath(
            os.path.dirname(self.corpus.abspath(fileid)), self.corpus.root
        )

        # Swap the original extension for the pickle extension.
        name, _ext = os.path.splitext(os.path.basename(fileid))
        basename = name + '.pickle'

        # Return the path to the file relative to the target.
        return os.path.normpath(os.path.join(self.target, parent, basename))

    def create_doc_struc(self, root, fileid):
        """
        Extract the text of ``fileid`` (located under ``root``) with Tika,
        run it through the spaCy pipeline and yield a single item: a list of
        sentences, each a list of ``(token_text, pos_tag)`` tuples.

        Stop words and non-alphabetic tokens are dropped. A file with no
        extractable text yields an empty list.
        """
        source_path = os.path.join(root, fileid)
        file_data = parser.from_file(source_path)

        # Tika can return no 'content' (missing key or None) when a file has
        # no extractable text; treat that as an empty document rather than
        # crashing inside spaCy.
        content_text = file_data.get('content') or ''
        doc = nlp(content_text)

        yield [
            [
                (token.text, token.pos_)
                for token in sent
                if token.is_alpha and not token.is_stop
            ]
            for sent in doc.sents
        ]

    def process(self, fileid):
        """
        Preprocess a single file and write the result as a pickle.

        The output location is computed with ``abspath``; its parent
        directory is created if needed. The document structure produced by
        ``create_doc_struc`` is materialized into a list and pickled there.

        Returns the path of the written pickle. Raises ``ValueError`` when
        the output's parent path exists but is not a directory.
        """
        # Compute the outpath to write the file to.
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        # Make sure the directory exists
        if not os.path.exists(parent):
            os.makedirs(parent)

        # Make sure that the parent is a directory and not a file
        if not os.path.isdir(parent):
            raise ValueError(
                "Please supply a directory to write preprocessed data to."
            )

        # Build the structure to pickle, reading from the root supplied at
        # construction time.
        document = list(self.create_doc_struc(self.root, fileid))

        # Open and serialize the pickle to disk
        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        # Return the target fileid
        return target

    def transform(self, fileids=None):
        """
        Lazily preprocess the selected fileids (all of them by default),
        yielding the path of each pickle as it is written.
        """
        # Make the target directory if it doesn't already exist
        if not os.path.exists(self.target):
            os.makedirs(self.target)

        # Resolve the fileids to start processing
        for fileid in self.fileids(fileids):
            yield self.process(fileid)

In [7]:
# Default regex for selecting preprocessed documents: same shape as the raw
# document pattern (non-dot directory name, slash, file name) but matching
# the ``.pickle`` extension.
PKL_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d-]+\.pickle'

class TextPickledCorpusReader(TextCorpusReader):
    """
    Corpus reader over pickled, preprocessed documents.

    Each pickle is expected to hold a list of paragraphs, each paragraph a
    list of sentences, each sentence a list of tagged tokens whose first
    element is the token text. Every accessor takes either ``fileids`` or
    ``categories`` (not both) to select documents.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up both NLTK reader bases for a directory of pickles.

        When no ``cat_*`` keyword is supplied, ``cat_pattern`` defaults to
        ``CAT_PATTERN``.
        """
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader expects the kwargs dict itself, not **kwargs.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def docs(self, fileids=None, categories=None):
        """
        Lazily yield each selected document, unpickling one file at a time.
        """
        # Forward the caller's categories so category selection takes effect.
        fileids = self.resolve(fileids, categories)

        # Load one pickled document into memory at a time.
        for path in self.abspaths(fileids):
            with open(path, 'rb') as f:
                # NOTE: pickle.load can execute arbitrary code; only point
                # this reader at pickles produced by a trusted preprocessor.
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Lazily yield every paragraph of the selected documents.
        """
        for doc in self.docs(fileids, categories):
            for para in doc:
                yield para

    def sents(self, fileids=None, categories=None):
        """
        Lazily yield every sentence of the selected documents.
        """
        for para in self.paras(fileids, categories):
            for sent in para:
                yield sent

    def tagged(self, fileids=None, categories=None):
        """
        Lazily yield every tagged token of the selected documents.
        """
        for sent in self.sents(fileids, categories):
            for tagged_token in sent:
                yield tagged_token

    def words(self, fileids=None, categories=None):
        """
        Lazily yield the text (first element) of every tagged token.
        """
        for tagged in self.tagged(fileids, categories):
            yield tagged[0]
