In [1]:
import re
import random

# Characters the augmentation helpers in this notebook treat as punctuation.
punctuation = '!()&%{}[];:\'",./?\\<>'

def remove_punc(string):
    """Strip everything except ASCII letters, digits and single spaces.

    Each run of characters outside ``A-Z``, ``a-z``, ``0-9`` and space (this
    includes non-ASCII letters) is replaced by one space; runs of spaces are
    then collapsed and the ends trimmed.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text.
    """
    # Turn every run of disallowed characters into a single space.
    spaced = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    # The substitution can leave adjacent spaces behind; squeeze them.
    collapsed = re.sub(r'[ ]+', ' ', spaced)
    return collapsed.strip()

def remove_punc_random(string, threshold = 0.3):
    """Randomly drop punctuation characters from ``string``.

    A character found in the module-level ``punctuation`` string is removed
    when a fresh ``random.random()`` draw is ``>= threshold``. All other
    characters are always kept.

    Args:
        string: Text to process.
        threshold: Draws at or above this value remove the character, so a
            lower threshold removes more punctuation.

    Returns:
        The text with a random subset of its punctuation removed.
    """
    kept = [
        char
        for char in string
        # A random draw is consumed only for punctuation characters.
        if not (char in punctuation and random.random() >= threshold)
    ]
    return ''.join(kept)

def add_punc_random(string, threshold = 0.7):
    """Randomly append a punctuation character to whitespace-separated tokens.

    For each token one ``random.random()`` draw is made. When the draw is
    ``>= threshold`` and the token does not already end with a character from
    the module-level ``punctuation`` string, a random punctuation character is
    appended. Tokens are re-joined with single spaces, so the original
    whitespace layout is not preserved.

    Args:
        string: Text to process.
        threshold: Draws at or above this value make a token eligible.

    Returns:
        The re-joined text.
    """
    decorated = []
    for token in string.split():
        if random.random() >= threshold and token[-1] not in punctuation:
            token += random.choice(punctuation)
        decorated.append(token)
    return ' '.join(decorated)

def add_space_after_punc(string, threshold = 0.3):
    """Randomly detach a token's trailing punctuation with a space.

    For each whitespace-separated token one ``random.random()`` draw is made.
    When the draw is ``>= threshold`` and the token ends with a character from
    the module-level ``punctuation`` string, a space is inserted between the
    rest of the token and that final character (``"hi!"`` -> ``"hi !"``).
    Despite the function name, the space goes *before* the trailing
    punctuation. Tokens are re-joined with single spaces.

    Args:
        string: Text to process.
        threshold: Draws at or above this value make a token eligible.

    Returns:
        The re-joined text.
    """
    words = []
    for word in string.split():
        if random.random() >= threshold and word[-1] in punctuation:
            head, tail = word[:-1], word[-1]
            word = head + ' ' + tail
        words.append(word)
    return ' '.join(words)

def replace_random_punc(string):
    """Swap each token's trailing punctuation for a random punctuation character.

    Every whitespace-separated token whose last character is in the
    module-level ``punctuation`` string has that character replaced by
    ``random.choice(punctuation)`` (which may pick the same character again).
    Other tokens are left alone. Tokens are re-joined with single spaces.

    Args:
        string: Text to process.

    Returns:
        The re-joined text.
    """
    swapped = [
        word[:-1] + random.choice(punctuation) if word[-1] in punctuation else word
        for word in string.split()
    ]
    return ' '.join(swapped)

def random_upper(string, threshold = 0.5):
    """Upper-case a random subset of whitespace-separated tokens.

    One ``random.random()`` draw is made per token; the token is upper-cased
    when the draw is ``>= threshold``. Tokens are re-joined with single spaces.

    Args:
        string: Text to process.
        threshold: Draws at or above this value upper-case the token.

    Returns:
        The re-joined text.
    """
    return ' '.join(
        word.upper() if random.random() >= threshold else word
        for word in string.split()
    )

def random_title(string, threshold = 0.5):
    """Title-case a random subset of whitespace-separated tokens.

    One ``random.random()`` draw is made per token; ``str.title()`` is applied
    to the token when the draw is ``>= threshold``. Tokens are re-joined with
    single spaces.

    Args:
        string: Text to process.
        threshold: Draws at or above this value title-case the token.

    Returns:
        The re-joined text.
    """
    return ' '.join(
        word.title() if random.random() >= threshold else word
        for word in string.split()
    )

def random_char_upper(string, threshold = 0.7):
    """Upper-case a random subset of individual characters.

    One ``random.random()`` draw is made per character (whitespace included);
    the character is upper-cased when the draw is ``>= threshold``. Unlike the
    token-based helpers, the original whitespace is preserved.

    Args:
        string: Text to process.
        threshold: Draws at or above this value upper-case the character.

    Returns:
        The transformed text.
    """
    return ''.join(
        char.upper() if random.random() >= threshold else char
        for char in string
    )

# Every single-step augmentation; package() applies each of these on its own.
funcs = [
    remove_punc,
    remove_punc_random,
    add_punc_random,
    add_space_after_punc,
    random_upper,
    random_title,
    random_char_upper,
    replace_random_punc,
]

# Steps package() may chain together: the same functions in the same order,
# minus remove_punc.
chain_funcs = [func for func in funcs if func is not remove_punc]

def package(string, repeat = 2, repeat_chain = 5, threshold = 0.5):
    """Generate a de-duplicated list of noisy variants of ``string``.

    The result starts with four baseline variants (the input as given, lower,
    upper and title case). Then, ``repeat`` times, every function in the
    module-level ``funcs`` list is applied to each of those four variants.
    Finally, ``repeat_chain`` times, the input is passed through the
    module-level ``chain_funcs`` in order, where each step is applied only
    when ``random.random() > threshold``.

    Args:
        string: Source text.
        repeat: Number of passes of the single-step augmentations.
        repeat_chain: Number of chained augmentations to generate.
        threshold: A chain step is applied when the random draw exceeds this
            value.

    Returns:
        A list of unique strings in first-seen order, so the unmodified
        ``string`` is always the first element.
    """
    variants = [string, string.lower(), string.upper(), string.title()]
    result = list(variants)

    for _ in range(repeat):
        # Same call order as four separate loops: all funcs on the raw text,
        # then on the lower-, upper- and title-cased text.
        for variant in variants:
            result.extend(func(variant) for func in funcs)

    for _ in range(repeat_chain):
        chained = string
        for func in chain_funcs:
            if random.random() > threshold:
                chained = func(chained)
        result.append(chained)

    # De-duplicate while keeping first-seen order. list(set(...)) would return
    # the variants in an arbitrary, hash-seed-dependent order, which makes the
    # output non-reproducible even when `random` is seeded.
    seen = set()
    unique = []
    for item in result:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique

In [2]:
from tqdm import tqdm

def loop(strings):
    """Augment every string and pair each variant with its source.

    Args:
        strings: Indexable, sized collection of source strings.

    Returns:
        A flat list of ``(variant, original)`` tuples, one per element of
        ``package(original)``, in input order. Progress is reported via tqdm.
    """
    pairs = []
    for idx in tqdm(range(len(strings))):
        source = strings[idx]
        pairs.extend((variant, source) for variant in package(source))
    return pairs

In [6]:
import cleaning
import random

In [7]:
# Plain-text corpora read by the cells below: index 0 is the wiki dump,
# index 1 the news dump.
files = ['../pure-text/filtered-dumping-wiki.txt',
        '../pure-text/dumping-cleaned-news.txt']

In [16]:
# Read the wiki corpus as one record per line, dropping empty lines.
with open(files[0]) as fopen:
    data = [line for line in fopen.read().split('\n') if line]

# Keep only lines that are at least 15 characters long.
data = [line for line in data if len(line) >= 15]

len(data)

1992725

In [17]:
# Down-sample to 700,000 lines. No seed is set in this notebook, so the
# selection differs between runs.
data = random.sample(data, 700000)

In [18]:
# NOTE(review): cleaning.multiprocessing is a project helper not shown here.
# The 16 progress bars of 43750 items each (16 * 43750 = 700000) suggest it
# splits `data` into 16 chunks and runs `loop` on each — confirm against the
# cleaning module.
results1 = cleaning.multiprocessing(data, loop)

100%|██████████| 43750/43750 [00:36<00:00, 1208.31it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1200.26it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1208.62it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1199.90it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1197.28it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1200.30it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1193.82it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1198.89it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1189.47it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1193.01it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1191.08it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1188.14it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1183.98it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1184.67it/s]
100%|██████████| 43750/43750 [00:36<00:00, 1188.06it/s]
100%|██████████| 43750/43750 [00:37<00:00, 1178.91it/s]


In [19]:
# Size of the wiki result; presumably the number of (variant, original) pairs
# emitted by loop() across all chunks — confirm against cleaning.multiprocessing.
len(results1)

37908415

In [20]:
import json

# Persist the wiki result. Note that json.dump writes any tuples as JSON
# arrays, so they come back as lists when the file is loaded.
with open('results-wiki-single.json', 'w') as fopen:
    json.dump(results1, fopen)

In [21]:
# Read the news corpus as one record per line, dropping empty lines.
# NOTE(review): unlike the wiki corpus, no minimum-length filter (>= 15
# characters) is applied here — confirm this is intentional.
with open(files[1]) as fopen:
    data = [line for line in fopen.read().split('\n') if line]

In [22]:
# Number of non-empty lines in the news corpus.
len(data)

3483907

In [23]:
# Drop the reference to the wiki result (already written to disk) so it can be
# garbage-collected before the news run.
del results1

In [24]:
# Down-sample the news corpus to 700,000 lines. No seed is set in this
# notebook, so the selection differs between runs.
data = random.sample(data, 700000)

In [25]:
# Run the augmentation over the news sample; `results1` is reused for the
# news result after the wiki result was deleted.
results1 = cleaning.multiprocessing(data, loop)

 99%|█████████▉| 43351/43750 [00:42<00:00, 1044.61it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1026.36it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1025.76it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1028.30it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1027.74it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1022.46it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1023.04it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1020.42it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1022.49it/s]
 99%|█████████▉| 43286/43750 [00:42<00:00, 1029.86it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1019.28it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1026.34it/s]
100%|██████████| 43750/43750 [00:43<00:00, 1016.05it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1021.46it/s]
100%|██████████| 43750/43750 [00:42<00:00, 1020.75it/s]
100%|██████████| 43750/43750 [00:43<00:00, 1015.97it/s]


In [26]:
# Size of the news result; presumably the number of (variant, original) pairs
# emitted by loop() — confirm against cleaning.multiprocessing.
len(results1)

38280627

In [27]:
# Persist the news result. Note that json.dump writes any tuples as JSON
# arrays, so they come back as lists when the file is loaded.
with open('results-news-single.json', 'w') as fopen:
    json.dump(results1, fopen)