In [1]:
import numpy as np
import pandas as pd
import re

from pathlib import Path
from typing import Dict

In [2]:
# Dataset file sits in the `data` directory one level above the resolved current directory.
# NOTE(review): the name says "training" but the file is WikiLarge_Test.csv - confirm which is intended.
training_path = Path().resolve().parents[0].joinpath('data', 'WikiLarge_Test.csv')

In [3]:
# Load the CSV at `training_path` into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer=training_path)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,original_text,label
0,0,-2011.0,
1,1,-2011.0,
2,2,-2000.0,
3,3,-1997.0,
4,4,1.636,


In [5]:
def remove_lrb_rrb_tags(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Removes -LRB-...-RRB- tags along with all text between from 'original_text' column.

    A tag in the middle of a text is removed together with the single space in
    front of it. A tag at the very start of a text (which has no space in front
    of it) is removed together with the single space behind it, so the cleaned
    text does not begin with a stray space.

    The column is overwritten on the passed DataFrame itself; no copy is made.

    Expected dataframe columns:
        original_text: str

    :param pd.DataFrame dataframe: frame whose 'original_text' column is cleaned
    :return pd.DataFrame: the passed DataFrame cleaned
    """
    # First alternative: tag at the start of the text (plus one trailing space).
    # Second alternative: tag preceded by a space anywhere else.
    # The non-greedy `.*?` stops at the first -RRB-, so several tags in one text
    # are removed one by one instead of swallowing the words between them.
    tag_pattern = r'^-LRB-.*?-RRB- ?| -LRB-.*?-RRB-'
    dataframe.original_text = dataframe.original_text.str.replace(tag_pattern, '', regex=True)
    return dataframe

def test_remove_lrb_rrb_tags() -> None:
    """Checks untouched text, a tag in the middle of a text and a tag at the start of a text."""
    given = pd.DataFrame({'original_text': [
        'Do not touch',
        'Replace -LRB-xxx2123;;;;;!!!-RRB- me',
        '-LRB- born 1950 -RRB- Replace me',
    ]})
    expected = pd.DataFrame({'original_text': ['Do not touch', 'Replace me', 'Replace me']})
    pd.testing.assert_series_equal(remove_lrb_rrb_tags(given).original_text, expected.original_text)

test_remove_lrb_rrb_tags()

In [7]:
def replace_html_char_codes_with_char(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Replaces char codes (e.g. &ndash;) with their respective characters.

    Each code is replaced in two spellings: the tokenized one with a space
    after '&' and before ';' (e.g. '& ndash ;') and the compact one
    (e.g. '&ndash;'). Matching is literal, not regex based.

    The column is overwritten on the passed DataFrame itself; no copy is made.

    Expected dataframe columns:
        original_text: str

    :param pd.DataFrame dataframe: frame whose 'original_text' column is cleaned
    :return pd.DataFrame: the passed DataFrame cleaned
    """
    # TODO: Find/compile CSV of all codes and dynamically import and replace all
    # NOTE(review): the standard library's `html.entities.html5` table may already
    # cover this TODO - confirm before compiling a CSV by hand.
    characters: Dict[str, str] = { # Code: Character
        '&ndash;': '–',
        '&mdash;': '—',
    }
    for code, character in characters.items():
        tokenized_code = f"{code[0]} {code[1:-1]} {code[-1]}"
        for spelling in (tokenized_code, code):
            # regex=False is passed explicitly: the default of `regex` differs
            # between pandas major versions, and the codes are plain literals.
            dataframe.original_text = dataframe.original_text.str.replace(spelling, character, regex=False)
    return dataframe

def test_replace_html_char_codes_with_char() -> None:
    """Checks untouched text plus tokenized and compact spellings of the known codes."""
    given = pd.DataFrame({'original_text': [
        'Do not touch',
        'Replace & ndash ; me',
        'Replace & mdash ; me',
        'Replace &ndash; me',
    ]})
    expected = pd.DataFrame({'original_text': [
        'Do not touch',
        'Replace – me',
        'Replace — me',
        'Replace – me',
    ]})
    pd.testing.assert_series_equal(
        replace_html_char_codes_with_char(given).original_text,
        expected.original_text,
    )

test_replace_html_char_codes_with_char()

In [8]:
def clean_dataset(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Runs every cleaning step, in order, over the passed DataFrame.

    Expected dataframe columns:
        original_text: str
        label: int (0 or 1)

    :param pd.DataFrame dataframe: frame handed to the first cleaning step
    :return pd.DataFrame: whatever the last cleaning step returns
    """
    # Order matters only in that each step receives the previous step's result.
    pipeline = (
        remove_lrb_rrb_tags,
        replace_html_char_codes_with_char,
    )
    cleaned = dataframe
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [9]:
# The cleaning steps overwrite 'original_text' on the frame they are given and
# hand the same frame back, so rebinding `df` makes that state change explicit.
df = clean_dataset(df)
df.head(99)

Unnamed: 0,id,original_text,label
0,0,-2011,
1,1,-2011,
2,2,-2000,
3,3,-1997,
4,4,1.636,
...,...,...,...
94,94,-- and King Ahab grew annoyed at such insolence .,
95,95,"- Andrei Rublev , Russian iconographer",
96,96,'' Angels '',
97,97,'' Another One Bites the Dust '' \/O/ '' Do n'...,
