In [1]:
from lexical_benchmark import settings
from lexical_benchmark.datasets.human import childes
from lexical_benchmark.datasets.human.childes import clean
from lexical_benchmark.datasets.utils import text_cleaning

from pathlib import Path
import platform
from collections import Counter
import re
import itertools

In [2]:
# Cleaning rules for child speech, as exposed by the CHILDES `clean` module.
# They are unpacked further down and applied one after another to each transcript line.
clean_rules = clean.cleaning_child_speech_rules

In [3]:
""" Test clean rules on sample of files."""

# Iterator over the raw "Eng-NA" / "child" items.
files = childes.RawCHILDESFiles()
it = files.iter("Eng-NA", "child")

# Both output folders live under the configured clean-CHILDES location.
clean_target = settings.PATH.clean_childes / 'test'
source = settings.PATH.clean_childes / 'src'

# Create the folders (and any missing parents); already-existing folders are fine.
for out_dir in (clean_target, source):
    out_dir.mkdir(exist_ok=True, parents=True)

def piped(line: str, *fn_list) -> str:
    """Run `line` through every callable in `fn_list`, left to right.

    Each callable receives the value returned by the previous one; the first
    receives `line` itself.

    Args:
        line: the initial value fed into the pipeline.
        *fn_list: single-argument callables applied in the order given.

    Returns:
        The value produced by the last callable, or `line` unchanged when no
        callables are supplied.
    """
    current = line
    for transform in fn_list:
        current = transform(current)
    return current
        
# For every item: write the rule-cleaned text to `clean_target` (.txt) and an
# unmodified copy of the input to `source` (.raw).
# NOTE: output names are built from the file's base name only, so two inputs
# sharing a base name in different folders overwrite each other.
for item in it:
    # Read the file once; the same text feeds both outputs.
    raw_text = item.file.read_text()
    # Each line goes through the full rule pipeline independently.
    clean_lines = [piped(line, *clean_rules) for line in raw_text.splitlines()]
    (clean_target / item.file.with_suffix(".txt").name).write_text("\n".join(clean_lines))
    (source / item.file.with_suffix(".raw").name).write_text(raw_text)

In [4]:
# Error entries recorded by WordLogger, keyed by the pattern tag they relate to.
# NOTE(review): this reads the underscore-prefixed `_ERROR_LOG` attribute directly —
# confirm there is no public accessor to use instead.
error_logs = text_cleaning.WordLogger._ERROR_LOG
error_logs.keys()

dict_keys(['&+', '&-', '&~'])

In [6]:
# Export what WordLogger has collected; each key names an annotation kind or tag.
logs = text_cleaning.WordLogger.export_logs()
logs.keys()

dict_keys(['parenthesis-annotation', 'bracket-annotation', '&+', '&=', '&-', '&*'])

In [15]:
# Count recorded for "um-um" under the "&-" tag.
# NOTE(review): assumes logs["&-"] is a flat sequence of logged words — confirm.
Counter(logs["&-"])["um-um"]

30

In [9]:
# Show the error messages recorded for the "&~" tag.
error_logs['&~']

["matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 204146_205981)",
 "matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 395435_395900)",
 "matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 698802_705141)",
 "matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 1139677_1140536)",
 "matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 1156002_1157326)",
 "matches=[], counting (self.clean_pattern='&~') found: line.count(self.clean_pattern)=1 in (&~ . 1395769_1405637)"]

In [4]:
""" Normalise text testing"""
# NOTE(review): same module as the `text_cleaning` import in the first cell, bound here to a shorter alias.
from lexical_benchmark.datasets.utils import text_cleaning as txt

# Normaliser instance; called as `normalizer(s)` by `test_norm`.
normalizer = txt.TextNormalization()

def test_norm(s: str, expected_char: str):
    """Check that normalising `s` yields `expected_char` repeated `len(s)` times.

    Uses the module-level `normalizer`. Prints the normalised string on success.

    Args:
        s: input string to normalise.
        expected_char: the string each character of `s` is expected to map to.

    Raises:
        AssertionError: if the normalised output differs from the expectation;
            the message reports both the expected and the actual value.
    """
    # Build the expectation once so the check and the message cannot drift apart.
    expected = expected_char * len(s)
    normal = normalizer(s)
    assert normal == expected, f"Expected: {expected!r}, got: {normal!r}"
    print(normal)


# Accented vowel groups paired with the plain letter each one should collapse to.
for accented, base in (
    ("àáâãäå", "a"),
    ("èéêë", "e"),
    ("ìíîï", "i"),
    ("òóôõö", "o"),
    ("ùúûü", "u"),
):
    test_norm(accented, base)

aaaaaa
eeee
iiii
ooooo
uuuu


In [None]:
"""Test regexp for &+"""
import re

# Pattern under test: the literal tag "&+" followed by any run of non-whitespace characters.
tag = r"&\+"
pattern = re.compile(f'{tag}\\S*')

# Sample utterances: lines with one or several tagged tokens, plus a "&&" line
# that the pattern must leave unmatched.
texts = [
    "now, what &+col",
    "now, what &&col",
    "well ‡ what are your plans in terms of friends (.) &+t to invite ? 208701_213055",
    "yeah ‡ probably &+s Thursday after gymnastics he'll give it back „ right ?",
    "<who do you> [//] &+thi who's playing with the truck here",
    "you wanna knock on the &+win &+mi:rror",
    "they put them on their nose and they &+ba balance them on their nose",
]

# Print every sample next to the list of tokens the pattern extracts from it.
for t, m in zip(texts, map(pattern.findall, texts)):
    print(t, ":::", m)

In [1]:
"""Test Cleaning with custom rules."""

# Guard: stop unless this machine's hostname is listed in settings.PATH.KNOWN_HOSTS.
assert platform.node() in settings.PATH.KNOWN_HOSTS, "Code Running in a unknown device, you must provide custom PATH locations"

cleaner = childes.CHILDESCleaner()
nav = childes.RawCHILDESFiles()
# Only the first 100 "Eng-NA" / "child" items are processed.
it = itertools.islice(nav.iter("Eng-NA", "child"), 100)
# Output folder, relative to the current working directory.
target = Path('testing_files')
target.mkdir(parents=True, exist_ok=True)
# Custom rule selection handed to the cleaner.
# NOTE(review): presumably applied in list order — confirm in CHILDESCleaner.clean_files.
rules = [
    clean.BRACKET_REMOVER,
    clean.PAREN_REMOVER,
    clean.TEXT_NORMALISATION, 
    clean.CROCHET_REMOVER,
]

cleaner.clean_files(target, it, rules, show_progress=True)

print("Cleanup successful...")

Output()

Task Completed...


In [1]:
# Load the cleaning metadata for the "Eng-NA" / "child" data and list its keys.
# NOTE(review): presumably produced by an earlier cleaning run — confirm where it is written.
raw_childes = childes.RawCHILDESFiles()
meta = raw_childes.load_clean_meta("Eng-NA", "child")
meta.keys()

dict_keys(['@b', '@wp', 'XXX', '@c', 'YYY', '(@s:hun)', 'WWW', '-undescore', 'parenthesis-annotation', '@t', '@k', '@l', 'bracket-annotation'])

In [7]:
# Frequency table of the entries stored under '-undescore' (the key is spelled this way in `meta`).
Counter(meta['-undescore'])

Counter({'out_of': 5,
         'lots_of': 3,
         'thank_you': 2,
         'kitty_cat': 2,
         'a_lot_of': 2,
         'Darth_Vader': 2,
         'Grand_Moder': 1,
         'as_well': 1,
         'Raisin_Bran': 1,
         'Thing_One': 1,
         'Thing_Two': 1,
         'Snow_White_and_the_Seven_Dwarfs': 1,
         'At_Ats': 1})

In [12]:
# Whole words that carry a parenthesised group of word characters, e.g. "s(us)pect" or "(i)t".
match_pattern = re.compile(r'\w*\(\w+\)\w*')
# The parenthesised group on its own: the span that gets substituted.
clean_pattern = re.compile(r'\(\w+\)')
nav = childes.CleanCHILDESFiles()

# Report each line containing such a word, followed by the line after substitution.
for f_test in nav.iter_test():
    for line in f_test.read_text().splitlines():
        matches = match_pattern.findall(line)
        if not matches:
            continue
        clean_line = clean_pattern.sub("'", line)
        print(f"{matches}")
        print(f"<{line}> ==> <{clean_line}>")

['s(us)pect']
<this is red this is pink this is blue   I s(us)pect Margaret's through .> ==> <this is red this is pink this is blue   I s pect Margaret's through .>
['(i)t']
<(i)t is .> ==> < t is .>
['(i)t']
<(i)t isn't stirred up .> ==> < t isn't stirred up .>
['s(up)pose']
<I s(up)pose I could find them .> ==> <I s pose I could find them .>
['s(us)pect']
<I saw Esau Kate saw Esau Esau Kate s(us)pect we all free saw .> ==> <I saw Esau Kate saw Esau Esau Kate s pect we all free saw .>
['s(up)pose']
<do you s(up)pose I'll soon be done  > ==> <do you s pose I'll soon be done  >
['s(up)pose']
<do you s(up)pose I'll soon be done  > ==> <do you s pose I'll soon be done  >
['s(up)pose']
<do you s(up)pose I'll soon be done  > ==> <do you s pose I'll soon be done  >
['(ex)cepting']
<all busy sewing   (ex)cepting Mamma not sewing .> ==> <all busy sewing    cepting Mamma not sewing .>
['(th)em']
<you make (th)em .> ==> <you make  em .>
['(i)t']
<(i)t isn't Towser either .> ==> < t isn't Towser 