In [1]:
import re
import random

# Characters that the random augmentation helpers in this cell treat as punctuation.
# It is a plain string; the helpers test membership one character at a time with `in`
# and draw replacement characters from it with `random.choice`.
punctuation = '!()&%{}[];:\'",./?\\<>'

def remove_punc(string):
    """Strip everything except ASCII letters, digits and spaces from ``string``.

    Each run of disallowed characters becomes a single space, consecutive
    spaces are then collapsed into one, and leading/trailing spaces are removed.

    Returns the cleaned string.
    """
    only_alnum = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    single_spaced = re.sub(r'[ ]+', ' ', only_alnum)
    return single_spaced.strip()

def remove_punc_random(string, threshold = 0.3):
    """Randomly delete punctuation characters from ``string``.

    A character listed in the module-level ``punctuation`` string is dropped
    when a fresh ``random.random()`` draw is ``>= threshold``; every other
    character is kept as-is. A random number is drawn only for punctuation
    characters.

    Returns the filtered string.
    """
    return ''.join(
        ch for ch in string
        if not (ch in punctuation and random.random() >= threshold)
    )

def add_punc_random(string, threshold = 0.7):
    """Randomly append a punctuation character to whitespace-separated tokens.

    One ``random.random()`` value is drawn per token. When the draw is
    ``>= threshold`` and the token does not already end with a character from
    ``punctuation``, a random character from ``punctuation`` is appended.

    Returns the tokens re-joined with single spaces.
    """
    tokens = string.split()
    for idx, token in enumerate(tokens):
        if random.random() >= threshold:
            if token[-1] not in punctuation:
                tokens[idx] = token + random.choice(punctuation)
    return ' '.join(tokens)

def add_space_after_punc(string, threshold = 0.3):
    """Randomly detach trailing punctuation from whitespace-separated tokens.

    One ``random.random()`` value is drawn per token. When the draw is
    ``>= threshold`` and the token's last character is in ``punctuation``,
    a space is inserted between the rest of the token and that last character
    (e.g. ``'end.'`` becomes ``'end .'``).

    Returns the tokens re-joined with single spaces.
    """
    tokens = string.split()
    for idx, token in enumerate(tokens):
        if random.random() >= threshold:
            body, tail = token[:-1], token[-1]
            if tail in punctuation:
                tokens[idx] = body + ' ' + tail
    return ' '.join(tokens)

def replace_random_punc(string):
    """Swap each token's trailing punctuation for a random punctuation character.

    Every whitespace-separated token whose last character is in
    ``punctuation`` has that character replaced by ``random.choice(punctuation)``
    (which may pick the same character again). Other tokens are left alone.

    Returns the tokens re-joined with single spaces.
    """
    swapped = []
    for token in string.split():
        if token[-1] in punctuation:
            token = token[:-1] + random.choice(punctuation)
        swapped.append(token)
    return ' '.join(swapped)

def random_upper(string, threshold = 0.5):
    """Randomly upper-case whole whitespace-separated tokens.

    One ``random.random()`` value is drawn per token; the token is converted
    with ``str.upper`` when the draw is ``>= threshold``.

    Returns the tokens re-joined with single spaces.
    """
    return ' '.join(
        word.upper() if random.random() >= threshold else word
        for word in string.split()
    )

def random_title(string, threshold = 0.5):
    """Randomly title-case whole whitespace-separated tokens.

    One ``random.random()`` value is drawn per token; the token is converted
    with ``str.title`` when the draw is ``>= threshold``.

    Returns the tokens re-joined with single spaces.
    """
    return ' '.join(
        word.title() if random.random() >= threshold else word
        for word in string.split()
    )

def random_char_upper(string, threshold = 0.7):
    """Randomly upper-case individual characters of ``string``.

    One ``random.random()`` value is drawn per character (spaces and
    punctuation included); the character is passed through ``str.upper`` when
    the draw is ``>= threshold``.

    Returns the resulting string.
    """
    return ''.join(
        ch.upper() if random.random() >= threshold else ch
        for ch in string
    )

# Randomised transforms that `package` may apply one after another to the same string.
chain_funcs = [
    remove_punc_random,
    add_punc_random,
    add_space_after_punc,
    random_upper,
    random_title,
    random_char_upper,
    replace_random_punc,
]

# Transforms that `package` applies individually: the deterministic `remove_punc`
# followed by every chainable transform, in the same order.
funcs = [remove_punc] + chain_funcs

def package(string, repeat = 2, repeat_chain = 5, threshold = 0.5):
    """Build a de-duplicated list of noisy variants of ``string``.

    The output always contains the original string and its lower-, upper- and
    title-cased forms. On top of that:

    * ``repeat`` times, every function in ``funcs`` is applied to each of those
      four forms (original, lower, upper, title - in that order);
    * ``repeat_chain`` times, the functions in ``chain_funcs`` are walked in
      order starting from the original string, each one being applied only
      when ``random.random() > threshold``.

    Returns a list of unique strings; duplicates are removed through a ``set``,
    so the order of the returned list is not meaningful.
    """
    base_forms = (string, string.lower(), string.upper(), string.title())
    result = list(base_forms)

    # Single-transform variants of every base form.
    for _ in range(repeat):
        for form in base_forms:
            result.extend(func(form) for func in funcs)

    # Variants built by stacking a random subset of the chainable transforms.
    for _ in range(repeat_chain):
        chained = string
        for func in chain_funcs:
            if random.random() > threshold:
                chained = func(chained)
        result.append(chained)

    return list(set(result))

def slide(strings, n = 5):
    """Cut ``strings`` into slices of at most ``n`` items.

    Slices start at index 0 and advance by ``len(strings) - (n - 1)``, so the
    stride depends on the input length; trailing slices may hold fewer than
    ``n`` items.
    NOTE(review): this is not a conventional stride-1 sliding window - confirm
    that the length-dependent stride is intended.

    Parameters:
        strings: a sliceable sequence (e.g. a list of sentences).
        n: maximum number of items per slice.

    Returns:
        A list of slices of ``strings``. When the computed stride is zero or
        negative (``len(strings) < n``) an empty list is returned. Previously
        a stride of exactly zero (``len(strings) == n - 1``) made ``range()``
        raise ``ValueError`` while a negative stride already yielded ``[]``.
    """
    step = len(strings) - (n - 1)
    if step <= 0:
        # range() rejects a zero step; treat it like the negative-step case.
        return []
    return [strings[start: start + n] for start in range(0, len(strings), step)]

In [7]:
import cleaning
import random

In [3]:
# Input text corpora, relative to the notebook's working directory.
# files[0] is the one read for the wiki run, files[1] for the news run.
files = ['../pure-text/filtered-dumping-wiki.txt',
        '../pure-text/dumping-cleaned-news.txt']

In [22]:
# Read the first corpus and group its lines into paragraphs separated by blank lines.
with open(files[0]) as fopen:
    data = fopen.read().split('\n')

results, result = [], []
for line in data:
    if line:
        # Non-empty line: make sure it ends with a full stop, then add it to the open group.
        if line[-1] != '.':
            line = line + '.'
        result.append(line)
    elif result:
        # A blank line closes the current group; blank lines with nothing pending are ignored.
        results.append(result)
        result = []

# Flush the last group when the file does not end with a blank line.
if result:
    results.append(result)

len(results)

363578

In [23]:
from tqdm import tqdm

def loop(strings):
    """Turn paragraph groups into (noisy variant, source text) training pairs.

    For every element of ``strings`` the element is cut with ``slide``, slices
    with fewer than two items are discarded, each remaining slice is joined
    with spaces into one source text, and ``package`` generates its variants.

    Parameters:
        strings: an indexable, sized collection whose elements are sequences
            of strings.

    Returns:
        A list of ``(variant, source)`` tuples.

    Processing is best-effort: an element that raises an ordinary exception is
    skipped (pairs already produced for it are kept). Only ``Exception`` is
    caught, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a
    long run can be interrupted.
    """
    results = []
    for i in tqdm(range(len(strings))):
        try:
            windows = [w for w in slide(strings[i]) if len(w) > 1]
            for window in windows:
                source = ' '.join(window)
                results.extend((variant, source) for variant in package(source))
        except Exception:
            # Skip inputs that cannot be processed instead of aborting the whole run.
            continue
    return results

In [24]:
# Run `loop` over the wiki paragraph groups through the project's
# `cleaning.multiprocessing` helper (not shown in this notebook; presumably it
# splits `results` across worker processes and concatenates what each `loop`
# call returns - TODO confirm against the `cleaning` module).
results1 = cleaning.multiprocessing(results, loop)

100%|██████████| 22723/22723 [00:10<00:00, 2112.09it/s]
100%|██████████| 10/10 [00:00<00:00, 79739.62it/s]/s]s]
100%|██████████| 22723/22723 [00:15<00:00, 1436.86it/s]
100%|██████████| 22723/22723 [00:21<00:00, 1047.61it/s]
100%|██████████| 22723/22723 [00:22<00:00, 997.90it/s]]
100%|██████████| 22723/22723 [00:25<00:00, 875.86it/s] 
100%|██████████| 22723/22723 [00:26<00:00, 846.02it/s] 
100%|██████████| 22723/22723 [00:26<00:00, 845.28it/s]
100%|██████████| 22723/22723 [00:27<00:00, 818.32it/s] 
100%|██████████| 22723/22723 [00:39<00:00, 574.06it/s] 
100%|██████████| 22723/22723 [00:54<00:00, 413.47it/s] 
100%|██████████| 22723/22723 [01:01<00:00, 369.44it/s]
100%|██████████| 22723/22723 [01:07<00:00, 337.90it/s]
100%|██████████| 22723/22723 [01:07<00:00, 336.10it/s]
 99%|█████████▉| 22455/22723 [01:09<00:02, 107.27it/s]
100%|██████████| 22723/22723 [01:11<00:00, 319.68it/s]
100%|██████████| 22723/22723 [01:12<00:00, 315.38it/s]


In [25]:
# Number of entries produced for the wiki corpus.
len(results1)

15141820

In [26]:
# Peek at the first entries; each one displays as a (noisy variant, source text) pair.
results1[:100]

[('Hypo-ARena ? Hypo-AreNa (DahulU DikEnaLI SebAgAi " IAlaH SEbUAh StaDiUM seRBA GuNa Di KLAGEnfUrt/ AustRiA " IA MerupaKan Stadium pAsuKan AustRia KarNTen\' StaDium LAMA DiKeNali SebagAi WoRTheRSEESTAdIOn % DibInA Pada 1960 dAn MeMPunyAi KApAsIti SebaNYak 10,900[ Ia diRoBOhkan PadA 2005 DaN DiganTikan Dengan HYpo-ArENa Yang BaRU . JUga dIkeNALi SehinggA 30 JuN 2007 Dengan naMa "WorTHerseestadion"\'',
  'Hypo-Arena. Hypo-Arena (dahulu dikenali sebagai ) ialah sebuah stadium serba guna di Klagenfurt, Austria. Ia merupakan stadium pasukan Austria Karnten. Stadium lama dikenali sebagai Wortherseestadion, dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900. Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru, juga dikenali sehingga 30 Jun 2007 dengan nama "Wortherseestadion".'),
 ('HyPo-AreNa. Hypo-AreNA (dAhulu dIKeNAli SEbAgAI ) iALah sEbuAh stadiUM SeRba gUNA di KlAgEnfUrT, AusTrIa. IA meRupakAN STAdiuM paSuKan AUstria KarnTEn. Stadium LamA dIkEnaLI seBaGaI Worthersee

In [27]:
import json

# Persist the wiki entries to results-wiki.json in the working directory.
with open('results-wiki.json', 'w') as fopen:
    json.dump(results1, fopen)

In [31]:
# Read the second corpus in full and split it on newlines; the line count is displayed.
with open(files[1]) as fopen:
    data = fopen.read().split('\n')
    
len(data)

3656919

In [32]:
# Group the lines held in `data` into paragraphs separated by blank lines.
results, result = [], []
for line in data:
    if line:
        # Non-empty line: make sure it ends with a full stop, then add it to the open group.
        if line[-1] != '.':
            line = line + '.'
        result.append(line)
    elif result:
        # A blank line closes the current group; blank lines with nothing pending are ignored.
        results.append(result)
        result = []

# Flush the last group when the data does not end with a blank line.
if result:
    results.append(result)

In [33]:
# Number of paragraph groups built from `data`.
len(results)

173012

In [38]:
# Keep a random subset of 70,000 paragraph groups.
# NOTE(review): no random seed is set in the visible cells, so the selection
# differs between runs, and the full list is overwritten by the sample.
results = random.sample(results, 70000)

In [39]:
# Run `loop` over the sampled paragraph groups through the project's
# `cleaning.multiprocessing` helper (not shown in this notebook - TODO confirm
# how it partitions the input and merges the per-worker results).
# This rebinds `results1`, replacing the value from the earlier run.
results1 = cleaning.multiprocessing(results, loop)

100%|██████████| 4375/4375 [00:32<00:00, 136.30it/s]
100%|█████████▉| 4368/4375 [00:32<00:00, 131.64it/s]
 97%|█████████▋| 4255/4375 [00:30<00:00, 138.66it/s]
 98%|█████████▊| 4270/4375 [00:31<00:00, 135.61it/s]
100%|██████████| 4375/4375 [00:32<00:00, 136.40it/s]
100%|██████████| 4375/4375 [00:32<00:00, 136.71it/s]
100%|██████████| 4375/4375 [00:31<00:00, 137.92it/s]
100%|██████████| 4375/4375 [00:31<00:00, 137.02it/s]
100%|██████████| 4375/4375 [00:32<00:00, 135.70it/s]
100%|██████████| 4375/4375 [00:31<00:00, 137.48it/s]
100%|██████████| 4375/4375 [00:31<00:00, 137.07it/s]
100%|██████████| 4375/4375 [00:31<00:00, 137.32it/s]
100%|██████████| 4375/4375 [00:32<00:00, 134.75it/s]
100%|██████████| 4375/4375 [00:32<00:00, 134.97it/s]
100%|██████████| 4375/4375 [00:32<00:00, 133.59it/s]
100%|██████████| 4375/4375 [00:32<00:00, 133.77it/s]


In [41]:
# Number of entries produced by the second run.
len(results1)

8564087

In [42]:
# Persist the entries from the second run to results-news.json in the working directory.
with open('results-news.json', 'w') as fopen:
    json.dump(results1, fopen)