In [65]:
import numpy as np
import pandas as pd
import re

from pathlib import Path

In [66]:
# The training CSV is expected in a `data` folder one level above the current working directory.
training_path = Path().resolve().parent / 'data' / 'WikiLarge_Train.csv'

In [67]:
# Read the CSV at `training_path` into a DataFrame.
df = pd.read_csv(training_path)

In [68]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [88]:
def remove_lrb_rrb_tags(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Removes -LRB-...-RRB- tags along with all text between from 'original_text' column.

    Each match covers one preceding space, the opening -LRB- tag, the shortest
    run of text up to the next -RRB- tag, and that closing tag. The passed
    DataFrame is not modified; a new DataFrame is returned.

    Expected dataframe columns:
        original_text: str

    :param dataframe: the DataFrame holding the text to clean
    :return pd.DataFrame: a new DataFrame with the cleaned 'original_text' column
    """
    cleaned_text = dataframe['original_text'].str.replace(r' -LRB-.*?-RRB-', '', regex=True)
    # assign() returns a new frame, so the caller's DataFrame is never changed in place.
    return dataframe.assign(original_text=cleaned_text)

def test_remove_lrb_rrb_tags() -> None:
    """Checks that a tagged span is dropped and untagged text is left as is."""
    raw_frame = pd.DataFrame(
        {'original_text': ['Do not touch', 'Replace -LRB-xxx2123;;;;;!!!-RRB- me']}
    )
    expected_frame = pd.DataFrame({'original_text': ['Do not touch', 'Replace me']})
    actual_text = remove_lrb_rrb_tags(raw_frame).original_text
    assert actual_text.equals(expected_frame.original_text)

test_remove_lrb_rrb_tags()

In [92]:
def replace_html_char_codes_with_char(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Replaces space-separated HTML char codes with their respective characters.

    The codes are matched in their spaced-out form (e.g. '& ndash ;' rather
    than '&ndash;') and are treated as literal text, not as regular expressions.
    The passed DataFrame is not modified; a new DataFrame is returned.

    Expected dataframe columns:
        original_text: str

    :param dataframe: the DataFrame holding the text to clean
    :return pd.DataFrame: a new DataFrame with the cleaned 'original_text' column
    """
    # TODO: Find/compile CSV of all codes and dynamically import and replace all
    characters = {  # Code: Character
        '&ndash;': '–',
        '&mdash;': '—',
    }
    cleaned_text = dataframe['original_text']
    for code, character in characters.items():
        # Build the spaced-out form that is searched for: '&ndash;' -> '& ndash ;'
        spaced_code = f"{code[0]} {code[1:-1]} {code[-1]}"
        # regex=False keeps the match literal regardless of the pandas version's default.
        cleaned_text = cleaned_text.str.replace(spaced_code, character, regex=False)
    # assign() returns a new frame, so the caller's DataFrame is never changed in place.
    return dataframe.assign(original_text=cleaned_text)

def test_replace_html_char_codes_with_char() -> None:
    """Checks that spaced-out char codes are replaced and other text is left as is."""
    raw_frame = pd.DataFrame(
        {'original_text': ['Do not touch', 'Replace & ndash ; me', 'Replace & mdash ; me']}
    )
    expected_frame = pd.DataFrame(
        {'original_text': ['Do not touch', 'Replace – me', 'Replace — me']}
    )
    # Call the function under its current name; `replace_char_codes_with_char`
    # is not defined in this notebook and raises NameError on a fresh kernel.
    actual_text = replace_html_char_codes_with_char(raw_frame).original_text
    assert actual_text.equals(expected_frame.original_text)

test_replace_html_char_codes_with_char()

In [82]:
def clean_dataset(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Runs every registered cleaning step, in order, over the passed DataFrame.

    Expected dataframe columns:
        original_text: str
        label: int (0 or 1)

    :param dataframe: the DataFrame to clean
    :return pd.DataFrame: the DataFrame produced by the last cleaning step
    """
    # NOTE(review): replace_html_char_codes_with_char is defined in this notebook
    # but is not registered here — confirm whether that is intentional.
    pipeline = (
        remove_lrb_rrb_tags,
    )
    cleaned = dataframe
    for step in pipeline:
        # Each step receives the output of the previous one.
        cleaned = cleaned.pipe(step)
    return cleaned

In [83]:
# Bind the cleaned frame to its own name so later cells do not depend on
# clean_dataset having modified `df` as a side effect.
df_clean = clean_dataset(df)
df_clean.head(50)

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,Geneva is the second-most-populous city in Swi...,1
5,When Japan earned another race on the F1 sched...,1
6,This marked the first motorcycle racing event ...,1
7,Ka La Ku'oko ' a : The Kingdom of Hawai`i is o...,1
8,A scythe is an agricultural hand tool for mowi...,1
9,Toshihide Saito is a Japanese football defender .,1
