# Metadata
```
Class:   DS 5001
Module:  02 Lab
Topic:   A Class for Importing a Text
Author:  R.C. Alvarado
Purpose: Create a class to wrap our functions and variables relating to parsing a raw text.
```

# Set Up

## Config 

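We put everything we know about our text and its processing requirements in a configuration dictionary. This dictionary has to be structured in a predictable way, because the class below looks up specific keys in it. Ideally, its structure would be defined by a formal schema (written in a language such as XML) so that it could be validated automatically.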

In [279]:
# Configuration consumed by TextImporter: where the source text lives, how to
# trim boilerplate, and how to cut the text into OHCO levels.
config = {
    # Path of the raw text file to read.
    'src_file': "data_in/pg105.txt",
    # Regexes identifying the boilerplate boundary lines. They are applied with
    # `Series.str.match`, i.e. anchored at the start of each line.
    'cruft': {
        'start_line_pat': r"\*\*\*\s*START OF (THE|THIS) PROJECT",
        'end_line_pat': 'End of the Project Gutenberg EBook'
    }, 
    # OHCO levels, listed from the coarsest to the finest. The key order matters:
    # the levels are processed in the order given here.
    # 'type' selects the parsing method and must be 'milestone' or 'delimitter'
    # (this exact spelling is what TextImporter.parse_tokens compares against).
    'ohco': {
        'chapter': {
            # Header line that opens a new chapter (matched case-insensitively).
            'pat': r"^\s*(chapter|letter)\s+(\d+)",
            'type': 'milestone'
        },
        'paragraph': {
            # One or more blank lines separate paragraphs.
            'pat': r"\n\n+",
            'type': 'delimitter'
            
        },
        'sentence': {
            # Runs of sentence-final punctuation separate sentences.
            'pat': r"[.?!;:]+",
            'type': 'delimitter'
        },
        'token': {
            # Runs of whitespace, apostrophes, commas and hyphens separate tokens.
            'pat': r"[\s',-]+",
            'type': 'delimitter'
        }
    }
}

## Class



In [9]:
import pandas as pd

class TextImporter():
    """Import a raw text file and parse it into a table of tokens.

    The importer is driven by a configuration dictionary with three keys:

    * ``src_file`` -- path of the text file to read;
    * ``cruft`` -- dict with ``start_line_pat`` and ``end_line_pat``, regexes
      marking the boilerplate boundary lines;
    * ``ohco`` -- ordered dict of levels, each with a ``pat`` regex and a
      ``type`` of ``'milestone'`` or ``'delimitter'``.

    Typical use: ``TextImporter(config).import_source().parse_tokens()``,
    after which ``.tokens`` holds one row per token.
    """

    # Class-level defaults; the instance attributes are set once the
    # corresponding step has actually run.
    src_imported:bool = False
    src_clipped:bool = False

    def __init__(self, config):
        """Store the configuration and derive the ordered list of OHCO levels."""
        self.config:dict = config # Ideally, validate against a schema
        self._validate_config()
        self.OHCO:list = list(self.config['ohco'].keys())

    def _validate_config(self):
        """Print a notice if the top-level config keys are not exactly the expected ones.

        This is advisory only: nothing is raised and processing continues.
        """
        config_keys_ideal = {'src_file','cruft','ohco'}
        config_keys_real = set(self.config.keys())
        if config_keys_real != config_keys_ideal:
            print("Config not valid")

    def import_source(self, strip:bool = True):
        """Convert a raw text file into a dataframe of lines.

        Reads ``config['src_file']`` into ``self.src_df`` (one row per line,
        column ``line_str``, index ``line_id``) and then clips cruft lines.

        Args:
            strip: when true, strip leading/trailing whitespace from each line.

        Returns:
            self, so that calls can be chained.
        """
        src_file = self.config['src_file']
        # Use a context manager so the file handle is always closed.
        with open(src_file, 'r') as src:
            lines = src.readlines()
        self.src_df = pd.DataFrame({'line_str': lines})
        self.src_df.index.name = 'line_id'
        if strip:
            self.src_df['line_str'] = self.src_df['line_str'].str.strip()
        self.src_imported = True
        self._clip_lines()
        return self

    def _clip_lines(self):
        """Remove cruft lines from beginning and/or end of file.

        Everything up to and including the first line matching
        ``start_line_pat`` is dropped. The first line matching
        ``end_line_pat`` is dropped together with the line before it and
        everything after it. If a pattern matches no line, nothing is
        clipped on that side.
        """
        start_pat = self.config['cruft']['start_line_pat']
        end_pat = self.config['cruft']['end_line_pat']
        lines = self.src_df['line_str']
        start_hits = lines[lines.str.match(start_pat, na=False)].index
        end_hits = lines[lines.str.match(end_pat, na=False)].index
        # A bound of None leaves that side of the label slice open.
        first = start_hits[0] + 1 if len(start_hits) else None
        last = end_hits[0] - 2 if len(end_hits) else None
        self.src_df = self.src_df.loc[first:last]
        self.src_clipped = True

    def parse_tokens(self):
        """Convert lines to tokens with arbitrary OHCO.

        Walks the OHCO levels in order and applies the method selected by
        each level's ``type``. The result is stored in ``self.tokens``.

        Returns:
            self when parsing ran; None (after printing a notice) when
            ``import_source`` has not been called yet.

        Raises:
            ValueError: if a level's ``type`` is neither ``'milestone'`` nor
                ``'delimitter'``.
        """
        if not self.src_imported:
            print("Source not imported. Please run .import_source()")
            return None
        tokens = self.src_df.copy()
        for i, level in enumerate(self.OHCO):
            level_type = self.config['ohco'][level]['type']
            if level_type == 'milestone':
                tokens = self._group_by_milestone(tokens, i)
            elif level_type == 'delimitter':
                tokens = self._split_by_delimitter(tokens, i)
            else:
                raise ValueError(
                    f"No method for level {level!r} of type {level_type!r}")
        self.tokens = tokens
        return self

    def _group_by_milestone(self, df, ohco_level,
                           src_col='line_str',
                           tmp_col='div_idx',
                           id_suffix='_id',
                           case=False):
        """Group and chunk text by milestone, such as chapter headers.

        Lines matching the level's pattern open a new division. Lines before
        the first milestone and the milestone lines themselves are dropped;
        the remaining lines of each division are joined with newlines.

        Args:
            df: frame holding the lines in column ``src_col``.
            ohco_level: position of the level in ``self.OHCO``.
            src_col: name of the column containing the line text.
            tmp_col: unused; kept for backward compatibility.
            id_suffix: suffix appended to the level name to name the index.
            case: whether the milestone pattern is case sensitive.

        Returns:
            Frame with one row per division, a ``<level>_str`` column and an
            index named ``<level><id_suffix>``.

        Raises:
            ValueError: if no line matches the milestone pattern.
        """
        div_name = self.OHCO[ohco_level]
        div_pat = self.config['ohco'][div_name]['pat']
        is_marker = df[src_col].str.match(div_pat, case=case, na=False)
        if not is_marker.any():
            raise ValueError(
                f"No lines match the {div_name!r} milestone pattern")
        # Running count of milestones seen so far: 0 before the first one,
        # then 1, 2, ... for the lines belonging to each division.
        div_num = is_marker.cumsum()
        keep = (div_num > 0) & ~is_marker
        # Work on a copy so the caller's frame is left untouched.
        body = df.loc[keep].copy()
        body[div_name] = div_num[keep].astype('int')
        joined = body.groupby(self.OHCO[:ohco_level+1])[src_col]\
            .apply(lambda x: '\n'.join(x)) # Make big string
        out = joined.str.strip().to_frame('{}_str'.format(div_name))
        out.index.name = "{}{}".format(div_name, id_suffix)
        return out

    def _split_by_delimitter(self, df, ohco_level,
                            src_col_suffix='_str',
                            join_pat='\n',
                            id_suffix='_num',
                            case=False):
        """Split and chunk text by a delimiter, for paragraphs, sentences, and tokens.

        Splits the string column of the previous OHCO level on this level's
        pattern, producing one row per piece. Pieces that are empty or
        whitespace-only are dropped; piece numbers keep their original
        positions, so gaps may appear.

        Args:
            df: frame produced by the previous OHCO level.
            ohco_level: position of the level in ``self.OHCO``. Should be at
                least 1, because the source column is taken from the level
                before it.
            src_col_suffix: suffix of the source column name.
            join_pat: substring replaced by a space inside each piece.
            id_suffix: suffix appended to the level name for the new index level.
            case: unused; kept for backward compatibility.

        Returns:
            Frame with a ``<level>_str`` column and one more index level than ``df``.
        """
        # Read the instance's own configuration, not a module-level global.
        div_name = self.OHCO[ohco_level]
        div_pat = self.config['ohco'][div_name]['pat']
        src_div_name = self.OHCO[ohco_level-1]
        src_col = f"{src_div_name}{src_col_suffix}"
        # expand=True pads short rows with missing values; drop them
        # explicitly rather than relying on stack() to do it.
        parts = df[src_col].str.split(div_pat, expand=True).stack().dropna()
        parts = parts.str.replace(join_pat, ' ')
        parts = parts[~parts.str.match(r'^\s*$')]
        out = parts.to_frame(f'{div_name}_str')
        out.index.names = list(df.index.names) + [div_name + id_suffix]
        return out

    def gather_tokens(self, level=0, collapse=False):
        """Gather tokens into strings for arbitrary OHCO level.

        Args:
            level: OHCO level to gather at, between 0 and ``len(OHCO) - 2``.
            collapse: unused; kept for backward compatibility.

        Returns:
            Frame with one ``<level>_str`` row per division, holding that
            division's tokens joined by single spaces. Returns None, after
            printing a notice, when ``level`` is out of range.
        """
        max_level = len(self.OHCO) - 2
        if not 0 <= level <= max_level:
            print(f"Level {level} too high. Try between 0 and {max_level}")
            return None
        level_name = self.OHCO[level]
        # The token column is named after the last OHCO level.
        token_col = f"{self.OHCO[-1]}_str"
        idx = list(self.tokens.index.names[:level+1])
        return self.tokens.groupby(idx)[token_col]\
            .apply(lambda x: ' '.join(x)).to_frame(f'{level_name}_str')

## Test 1

In [309]:
# Build the importer from the config above, read and clip the source file,
# then parse the lines into tokens (each step returns the importer itself).
TI = TextImporter(config).import_source().parse_tokens()

In [310]:
# Display the parsed token table: one row per token, indexed by the OHCO levels.
TI.tokens

chapter_id,paragraph_num,sentence_num,token_num,token_str
1,0,0,0,Sir
1,0,0,1,Walter
1,0,0,2,Elliot
1,0,0,3,of
1,0,0,4,Kellynch
...,...,...,...,...
24,11,6,34,in
24,11,6,35,its
24,11,6,36,national
24,11,6,37,importance


In [305]:
# Scratch cell: a throwaway value; it is not used by the importer code in this notebook.
foo = 'bar'

In [306]:
# Scratch cell: quick check of f-string interpolation using `foo`.
print(f"Hey {foo}")

Hey bar


In [307]:
# IPython help syntax (not plain Python): shows the signature and docstring of gather_tokens.
TI.gather_tokens?