# DS 5001 Week 2 Lab: Text into Data: Class for Importing a Text

## Define Class

In [263]:
import pandas as pd

class TexImporter:
    """Turn a plain-text source file into OHCO-indexed pandas tables.

    The methods are meant to be called in order (each returns ``self`` so
    they can be chained): ``import_source`` -> ``clip_lines`` ->
    ``chunk_chapters`` -> ``split_paragraphs`` -> ``split_sentences`` ->
    ``split_tokens`` -> ``extract_vocab``.

    Attributes created along the way:
        epub:   list of raw lines read from the source file.
        lines:  DataFrame of stripped lines (``line_str``), indexed by ``line_num``.
        chaps:  one ``line_str`` per chapter.
        paras:  one ``para_str`` per paragraph.
        sents:  one ``sent_str`` per sentence.
        tokens: one ``token_str`` (and later ``term_str``) per token.
        vocab:  term counts (``term_str``, ``n``), indexed by ``term_id``.
    """

    def __init__(self, src_file, OHCO, data_in='./', data_out='./',
                 src_encoding='utf-8-sig'):
        """Store the configuration; no file is touched here.

        Args:
            src_file: Name of the source text file inside ``data_in``.
            OHCO: Sequence of index level names, coarsest first
                (chapter, paragraph, sentence, token).
            data_in: Directory containing ``src_file``.
            data_out: Directory that ``export`` writes into.
            src_encoding: Encoding used to read ``src_file``.
        """
        self.src_file = src_file
        # Keep our own copy so the methods do not depend on a global named
        # OHCO; a list is required because slices of it are used both as
        # groupby keys and as index names.
        self.OHCO = list(OHCO)
        self.data_in = data_in
        self.data_out = data_out
        self.src_encoding = src_encoding

    def import_source(self):
        """Read the source file into ``self.lines``, one stripped line per row."""
        src_path = "{}/{}".format(self.data_in, self.src_file)
        # Context manager so the file handle is closed once the lines are read.
        with open(src_path, 'r', encoding=self.src_encoding) as src:
            self.epub = src.readlines()
        self.lines = pd.DataFrame(self.epub, columns=['line_str'])
        self.lines.index.name = 'line_num'
        self.lines['line_str'] = self.lines['line_str'].str.strip()
        return self

    def clip_lines(self, start_line_pat, end_line_pat):
        """Keep only the lines between the start and end marker lines.

        Args:
            start_line_pat: Regex matched (from line start) against each line;
                the first matching line marks the beginning.
            end_line_pat: Regex whose first matching line marks the end.

        Raises:
            IndexError: If either pattern matches no line.
        """
        line_str = self.lines['line_str']
        start_line_num = self.lines.loc[line_str.str.match(start_line_pat)].index[0]
        end_line_num = self.lines.loc[line_str.str.match(end_line_pat)].index[0]
        # NOTE(review): .loc slices include both end labels, so `- 2` also
        # drops the line directly above the end marker. Kept as in the
        # original -- confirm this is intended.
        # copy() so later column assignments do not act on a view.
        self.lines = self.lines.loc[start_line_num + 1:end_line_num - 2].copy()
        return self

    def chunk_chapters(self, chap_title_pat):
        """Assign chapter numbers to lines and build ``self.chaps``.

        Lines before the first chapter heading and the heading lines
        themselves are removed from ``self.lines``.

        Args:
            chap_title_pat: Regex (matched case-insensitively from line start)
                that identifies a chapter heading line.
        """
        chap_col = self.OHCO[0]
        is_heading = self.lines['line_str'].str.match(chap_title_pat, case=False)
        # Running count of headings seen so far is the chapter number;
        # 0 means "before the first heading".
        chap_num = is_heading.cumsum()
        keep = (chap_num > 0) & ~is_heading
        self.lines = self.lines.loc[keep].copy()
        self.lines[chap_col] = chap_num.loc[keep].astype('int')
        # One big newline-joined string per chapter.
        self.chaps = self.lines.groupby(self.OHCO[:1])['line_str']\
            .apply(lambda chap: '\n'.join(chap)).to_frame()
        self.chaps['line_str'] = self.chaps['line_str'].str.strip()
        return self

    def split_paragraphs(self, para_pat=r'\n\n+'):
        """Split each chapter string into paragraphs (``self.paras``).

        Args:
            para_pat: Regex that separates paragraphs.
        """
        # dropna() removes the padding that expand=True adds to shorter rows,
        # regardless of whether stack() itself drops missing values.
        paras = self.chaps['line_str'].str.split(para_pat, expand=True)\
            .stack().dropna().to_frame('para_str')
        paras.index.names = self.OHCO[:2]
        # Literal newline with regex=False: the un-flagged r'\n' form is only
        # treated as a regex by older pandas versions.
        paras['para_str'] = paras['para_str']\
            .str.replace('\n', ' ', regex=False).str.strip()
        self.paras = paras[~paras['para_str'].str.match(r'^\s*$')]  # Remove empty paragraphs
        return self

    def split_sentences(self, sent_pat=r'[.?!;:]+'):
        """Split each paragraph into sentences (``self.sents``).

        Args:
            sent_pat: Regex that separates sentences.
        """
        sents = self.paras['para_str'].str.split(sent_pat, expand=True)\
            .stack().dropna().to_frame('sent_str')
        sents.index.names = self.OHCO[:3]
        # Strip like chapters and paragraphs are, so sentences do not carry
        # the space that followed the previous sentence's punctuation.
        sents['sent_str'] = sents['sent_str'].str.strip()
        self.sents = sents[sents['sent_str'] != '']  # Remove empty sentences
        return self

    def split_tokens(self, token_pat=r"[\s',-]+"):
        """Split each sentence into tokens (``self.tokens``).

        Args:
            token_pat: Regex that separates tokens.
        """
        tokens = self.sents['sent_str'].str.split(token_pat, expand=True)\
            .stack().dropna().to_frame('token_str')
        tokens.index.names = self.OHCO[:4]
        # A separator at the very start or end of a sentence yields an empty
        # string; drop those so they are not counted as tokens. copy() because
        # extract_vocab adds a column to this frame.
        self.tokens = tokens[tokens['token_str'] != ''].copy()
        return self

    def extract_vocab(self):
        """Add ``term_str`` to ``self.tokens`` and build ``self.vocab``.

        ``term_str`` is the token with all non-word characters removed,
        lower-cased. ``self.vocab`` has one row per distinct term with its
        count in column ``n``.
        """
        self.tokens['term_str'] = self.tokens['token_str']\
            .str.replace(r'\W+', '', regex=True).str.lower()
        # rename_axis + reset_index(name=...) fixes the column names directly
        # instead of relying on what reset_index calls the index column.
        self.vocab = self.tokens['term_str'].value_counts()\
            .rename_axis('term_str').reset_index(name='n')
        self.vocab.index.name = 'term_id'
        return self

    def gather(self, ohco_level=1):
        """Return token strings re-joined with spaces at a given OHCO level.

        Args:
            ohco_level: How many leading OHCO levels to group by
                (1 = chapter, 2 = paragraph, ...).

        Returns:
            DataFrame with a single ``content`` column.
        """
        return self.tokens.groupby(self.OHCO[:ohco_level])['token_str']\
            .apply(lambda toks: ' '.join(toks))\
            .to_frame('content')

    def export(self, prefix='foo'):
        """Write tokens and vocab to ``<data_out>/<prefix>-{TOKENS,VOCAB}.csv``.

        Args:
            prefix: File name prefix for both CSV files.
        """
        # Directory first, then prefix (the arguments used to be swapped).
        self.tokens.to_csv('{}/{}-TOKENS.csv'.format(self.data_out, prefix))
        self.vocab.to_csv('{}/{}-VOCAB.csv'.format(self.data_out, prefix))

## Test

In [252]:
# Input and output directories
data_in, data_out = "./data_in", "./data_out"

# Source file, and the file we will create
epub_file, csv_file = "pg105.txt", "austen-persuasion.csv"

# Index level names, from chapter down to token
OHCO = [
    "chap_num",
    "para_num",
    "sent_num",
    "token_num",
]

# Regexes for the start/end marker lines and for chapter heading lines
start_line_pat = r"\*\*\*\s*START OF (THE|THIS) PROJECT"
end_line_pat = r"\*\*\*\s*END OF (THE|THIS) PROJECT"
chap_title_pat = r"^\s*(chapter|letter)\s+(\d+)"

In [253]:
# Build the importer; nothing is read from disk until import_source() is called.
ti = TexImporter(
    epub_file,
    OHCO,
    data_in=data_in,
    data_out=data_out,
)

In [254]:
# Read the source file into ti.lines, one stripped line per row.
ti.import_source();

In [255]:
# Trim ti.lines to the span between the lines matching the start and end patterns.
ti.clip_lines(start_line_pat, end_line_pat);

In [239]:
# ti.lines.sample(10)

In [257]:
# Number the lines by chapter heading and join each chapter's lines into one string (ti.chaps).
ti.chunk_chapters(chap_title_pat);

In [241]:
# ti.chaps

In [258]:
# Split each chapter string into paragraphs (ti.paras).
ti.split_paragraphs();

In [243]:
# ti.paras

In [259]:
# Split each paragraph into sentences (ti.sents).
ti.split_sentences();

In [245]:
# ti.sents

In [260]:
# Split each sentence into tokens (ti.tokens).
ti.split_tokens();

In [247]:
# ti.tokens

In [261]:
# Add a normalized term_str column to ti.tokens and build the term count table (ti.vocab).
ti.extract_vocab();

In [262]:
# ti.vocab

## All at once

In [267]:
# Same pipeline as the step-by-step cells, chained: every step returns the importer.
ti2 = (
    TexImporter(epub_file, OHCO, data_in=data_in, data_out=data_out)
    .import_source()
    .clip_lines(start_line_pat, end_line_pat)
    .chunk_chapters(chap_title_pat)
    .split_paragraphs()
    .split_sentences()
    .split_tokens()
    .extract_vocab()
)