In [1]:
# !wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
# !bunzip2 sentences.tar.bz2
# !tar xvf sentences.tar

In [2]:
import pandas as pd

df = pd.read_csv('sentences.csv', sep = '\t', header = None)
df.head()

Unnamed: 0,0,1,2
0,1,cmn,我們試試看！
1,2,cmn,我该去睡觉了。
2,3,cmn,你在干什麼啊？
3,4,cmn,這是什麼啊？
4,5,cmn,今天是６月１８号，也是Muiriel的生日！


In [3]:
eng = df.loc[df[1] == 'eng']
eng = eng.iloc[:,-1].tolist()

In [4]:
df[1].unique()

array(['cmn', 'deu', 'rus', 'fra', 'eng', 'jpn', 'spa', 'ita', 'kor',
       'vie', 'nld', 'epo', 'por', 'tur', 'heb', 'hun', 'ell', 'ind',
       'ara', 'arz', 'fin', 'bul', 'yue', 'swe', 'ukr', 'bel', 'que',
       'ces', 'swh', 'nno', 'wuu', 'nob', 'zsm', 'est', 'kat', 'pol',
       'lat', 'urd', 'sqi', 'isl', 'fry', 'afr', 'ron', 'fao', 'san',
       'bre', 'tat', 'yid', 'uig', 'uzb', 'srp', 'qya', 'dan', 'pes', nan,
       'slk', 'eus', 'cycl', 'acm', 'tgl', 'lvs', 'kaz', 'hye', 'hin',
       'lit', 'ben', 'cat', 'bos', 'hrv', 'tha', 'orv', 'cha', 'mon',
       'lzh', 'scn', 'gle', 'mkd', 'slv', 'frm', 'glg', 'vol', 'ain',
       'jbo', 'toki', 'ina', 'nds', 'mal', 'tlh', 'roh', 'ltz', 'oss',
       'ido', 'gla', 'mlt', 'sco', 'ast', 'jav', 'oci', 'ile', 'ota',
       'xal', 'tel', 'sjn', 'nov', 'khm', 'tpi', 'ang', 'aze', 'tgk',
       'tuk', 'chv', 'hsb', 'dsb', 'bod', 'sme', 'cym', 'mri', 'ksh',
       'kur', 'ewe', 'kab', 'ber', 'tpw', 'udm', 'lld', 'pms', 'lad',
       'grn',

In [5]:
ms = df.loc[df[1] == 'zlm']
ms = ms.iloc[:,-1].tolist()
len(ms)

98

In [6]:
ind = df.loc[df[1] == 'ind']
ind = ind.iloc[:,-1].tolist()
len(ind)

12515

In [7]:
others = df.loc[~df[1].isin(['zlm', 'ind', 'eng'])]
others = others.iloc[:,-1].tolist()
len(others)

6816517

In [8]:
import re
import cleaning
from tqdm import tqdm

def preprocessing(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    
    chars = ',.()!:\'"/;=-'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(
        u'[0-9!@#$%^&*()_\-+{}|\~`\'";:?/.>,<]',
        ' ',
        string,
        flags = re.UNICODE,
    )
    string = re.sub(r'[ ]+', ' ', string).strip()
    
    return string.lower()

def loop(strings):
    for i in tqdm(range(len(strings))):
        strings[i] = preprocessing(strings[i])
    return strings

In [9]:
eng = cleaning.multiprocessing(eng, loop)

100%|██████████| 80573/80573 [00:01<00:00, 74237.04it/s]
 97%|█████████▋| 77984/80573 [00:01<00:00, 70602.29it/s]
100%|██████████| 80573/80573 [00:01<00:00, 67629.92it/s]
  0%|          | 0/4 [00:00<?, ?it/s]0:00, 69817.19it/s]
100%|██████████| 4/4 [00:00<00:00, 10936.91it/s]
100%|██████████| 80573/80573 [00:01<00:00, 73532.33it/s]
100%|██████████| 80573/80573 [00:01<00:00, 70092.20it/s]
 98%|█████████▊| 78614/80573 [00:01<00:00, 65517.94it/s]
100%|██████████| 80573/80573 [00:01<00:00, 64470.88it/s]
100%|██████████| 80573/80573 [00:01<00:00, 68718.48it/s]
 95%|█████████▍| 76367/80573 [00:01<00:00, 69520.98it/s]
100%|██████████| 80573/80573 [00:01<00:00, 74932.94it/s]
100%|██████████| 80573/80573 [00:01<00:00, 68346.33it/s]

100%|██████████| 80573/80573 [00:01<00:00, 64975.34it/s]
100%|██████████| 80573/80573 [00:01<00:00, 65209.12it/s]
100%|██████████| 80573/80573 [00:01<00:00, 62820.06it/s]


In [10]:
ms = cleaning.multiprocessing(ms, loop)

100%|██████████| 6/6 [00:00<00:00, 3296.11it/s]

100%|██████████| 6/6 [00:00<00:00, 3147.70it/s]


100%|██████████| 6/6 [00:00<00:00, 4368.31it/s]




  0%|          | 0/2 [00:00<?, ?it/s]





100%|██████████| 2/2 [00:00<00:00, 11491.24it/s]


In [11]:
ind = cleaning.multiprocessing(ind, loop)

100%|██████████| 782/782 [00:00<00:00, 59036.43it/s]
100%|██████████| 782/782 [00:00<00:00, 48154.47it/s]

100%|██████████| 782/782 [00:00<00:00, 56839.89it/s]

100%|██████████| 782/782 [00:00<00:00, 55925.96it/s]
  0%|          | 0/3 [00:00<?, ?it/s], 52451.44it/s]

100%|██████████| 782/782 [00:00<00:00, 53525.67it/s]
100%|██████████| 782/782 [00:00<00:00, 40678.97it/s]


100%|██████████| 782/782 [00:00<00:00, 44519.11it/s]


100%|██████████| 782/782 [00:00<00:00, 26638.94it/s]



In [12]:
others = cleaning.multiprocessing(others, loop)

100%|██████████| 426032/426032 [00:05<00:00, 82248.14it/s] 
100%|██████████| 5/5 [00:00<00:00, 23912.79it/s]67.35it/s]
100%|██████████| 426032/426032 [00:05<00:00, 71134.40it/s]
100%|██████████| 426032/426032 [00:06<00:00, 69098.88it/s]
100%|██████████| 426032/426032 [00:06<00:00, 70319.38it/s]
100%|██████████| 426032/426032 [00:06<00:00, 69293.78it/s]
100%|██████████| 426032/426032 [00:05<00:00, 71253.13it/s]
100%|██████████| 426032/426032 [00:06<00:00, 70523.62it/s]
100%|██████████| 426032/426032 [00:05<00:00, 71551.52it/s]
100%|██████████| 426032/426032 [00:05<00:00, 74259.51it/s]
100%|██████████| 426032/426032 [00:05<00:00, 72331.63it/s]
100%|██████████| 426032/426032 [00:05<00:00, 73642.58it/s]
100%|██████████| 426032/426032 [00:05<00:00, 72656.56it/s]
100%|██████████| 426032/426032 [00:05<00:00, 71790.40it/s]
100%|██████████| 426032/426032 [00:05<00:00, 72639.01it/s]
100%|██████████| 426032/426032 [00:05<00:00, 74095.97it/s]
100%|██████████| 426032/426032 [00:06<00:00, 70851.76it

In [13]:
import json

with open('eng.json', 'w') as fopen:
    json.dump(eng, fopen)

In [14]:
with open('ms.json', 'w') as fopen:
    json.dump(ms, fopen)

In [15]:
with open('ind.json', 'w') as fopen:
    json.dump(ind, fopen)

In [16]:
with open('others.json', 'w') as fopen:
    json.dump(others, fopen)

In [17]:
import json

with open('ind.json') as fopen:
    ind = json.load(fopen)
    
ind

['ibu sedang masak di dapur',
 'mary tidak punya teman bicara tapi dia tidak merasa kesepian',
 'mary adalah gadis yang enak diajak bicara',
 'mary memutuskan untuk tidak akan menemuinya lagi',
 'apakah mary sudah mulai',
 'mary sangat suka susu',
 'mary belum mulai',
 'mary gagal lagi meski begitu dia masih muda',
 'mary selalu membantu pacarnya mengerjakan tugas',
 'pesawat ini mampu menampung orang penumpang sekali terbang',
 'harga tanah di jepang mahal sekali',
 'dilihat dari pesawat pulau itu terlihat indah sekali',
 'pulau itu ada di jepang sebelah selatan',
 'pulau itu ada di jepang sebelah barat',
 'pulau itu tertutup es dan salju selama musim dingin',
 'pulau itu ditemukan oleh siapa',
 'pulau itu dihantam topan',
 'pulau itu mudah dijangkau dengan perahu',
 'pulau itu bagaikan surga bagi anak anak',
 'setiap jengkal pulau telah dijelajahi',
 'pulau itu terletak dua mil dari pantai',
 'pulau itu terletak satu mil dari pantai',
 'pulau itu hangat sepanjang tahun',
 'pulau itu 