In [1]:
import json
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer

def clean_text(string):
    """Lowercase `string` and blank out digits and common ASCII punctuation.

    Every digit and every character in the punctuation class below is
    replaced by a single space; runs of spaces are then collapsed to one and
    leading/trailing whitespace is stripped.  Characters outside the class
    (letters, brackets, '=', backslash, non-ASCII text, ...) are left as-is.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        The lowercased, punctuation-free, whitespace-normalised text.
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # sequences "\-" and "\~", which emit SyntaxWarning on recent Pythons.
    # The character class matched is exactly the same as before.
    lowered = re.sub(
        r'[0-9!@#$%^&*()_\-+{}|~`\'";:?/.>,<]',
        ' ',
        string.lower(),
        flags=re.UNICODE,
    )
    # `lowered` is already lowercase, so no second .lower() is needed.
    return re.sub(r'[ ]+', ' ', lowered).strip()

def simple_textcleaning(string):
    """Reduce `string` to lowercase ASCII letters separated by single spaces.

    Any run of characters other than A-Z, a-z or the space character becomes
    one space; the text is then lowercased, repeated spaces are collapsed and
    the ends are trimmed.
    """
    letters_only = re.compile('[^A-Za-z ]+').sub(' ', string)
    lowered = letters_only.lower()
    single_spaced = re.compile(r'[ ]+').sub(' ', lowered)
    return single_spaced.strip()

In [2]:
# Load the v4 dataset: a JSON object with parallel 'text' and 'label' lists.
with open('language-detection-data-v4.json', 'r') as dataset_file:
    loaded = json.load(dataset_file)

# Normalise every text with clean_text; labels are taken over unchanged.
sentences = list(map(clean_text, loaded['text']))
langs = loaded['label']

In [3]:
# Positions of each label within `langs` (and therefore within `sentences`).
ind_ids = [pos for pos, lang in enumerate(langs) if lang == 'ind']
zlm_ids = [pos for pos, lang in enumerate(langs) if lang == 'zlm']
other_ids = [pos for pos, lang in enumerate(langs) if lang == 'OTHER']
eng_ids = [pos for pos, lang in enumerate(langs) if lang == 'eng']

# Only the OTHER and English rows are pulled out as sentence lists here.
other_sentences = [sentences[pos] for pos in other_ids]
eng_sentences = [sentences[pos] for pos in eng_ids]

In [4]:
# Word lists and stop words used to filter tokens in later cells.
# Every file is treated as one entry per line; empty lines are dropped.
with open('malay-text.txt') as fopen:
    # Materialised as a list, like `indons` below: a bare filter() object is
    # a one-shot iterator and would be silently empty for any second consumer.
    malays = list(filter(None, fopen.read().split('\n')))

with open('indon-text.txt') as fopen:
    indons = list(filter(None, fopen.read().split('\n')))

with open('00-indonesian-wordlist.txt', encoding='ISO-8859-1') as fopen:
    another_indons = list(filter(None, fopen.read().split('\n')))

# Entries of two characters or fewer are skipped; the rest go through
# simple_textcleaning.
another_indons = [simple_textcleaning(s) for s in another_indons if len(s) > 2]

# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere unless this file exists at the same location.
with open('/home/husein/Malaya/stop-word-kerulnet') as fopen:
    stopwords = set(filter(None, fopen.read().split('\n')))

# Sets give O(1) membership tests in the per-token filtering loops.
ind_set = set(indons)
zlm_set = set(malays)

In [5]:
# %%time
# for no, i in enumerate(ind_ids):
#     if (no+1) % 10000 == 0:
#         print('indon %d'%(no + 1))
#     sentences[i] = ' '.join(w for w in sentences[i].split() if w in ind_set and w not in stopwords)
    
# for no, i in enumerate(zlm_ids):
#     if (no+1) % 10000 == 0:
#         print('malay %d'%(no + 1))
#     sentences[i] = ' '.join(w for w in sentences[i].split() if w in zlm_set and w not in stopwords)

In [6]:
# zlm_sentences = list(filter(None, set([sentences[i] for i in zlm_ids])))
# ind_sentences = list(filter(None, set([sentences[i] for i in ind_ids])))

In [7]:
# Restart the corpus from just the OTHER and English rows, with one label per
# sentence in the same order.
sentences = [*other_sentences, *eng_sentences]
langs = ['OTHER' for _ in other_sentences] + ['eng' for _ in eng_sentences]

In [8]:
def _first_wiki_lines(path, limit=60000):
    """Return cleaned versions of the non-empty lines among the first
    `limit` newline-separated lines of the file at `path`.

    Each kept line is passed through simple_textcleaning; a cleaned result
    may itself be empty and is not filtered out here.
    """
    with open(path) as wiki_file:
        head = wiki_file.read().split('\n')[:limit]
    return [simple_textcleaning(line) for line in head if line]


id_wiki = _first_wiki_lines('wiki-id.txt')
ms_wiki = _first_wiki_lines('wiki-ms.txt')

In [9]:
%%time
for no, i in enumerate(range(len(id_wiki))):
    if (no+1) % 10000 == 0:
        print('indon %d'%(no + 1))
    id_wiki[i] = ' '.join(w for w in id_wiki[i].split() if w in ind_set and w not in stopwords)
    
for no, i in enumerate(range(len(ms_wiki))):
    if (no+1) % 10000 == 0:
        print('malay %d'%(no + 1))
    ms_wiki[i] = ' '.join(w for w in ms_wiki[i].split() if w in zlm_set and w not in stopwords)

indon 10000
indon 20000
indon 30000
indon 40000
indon 50000
indon 60000
malay 10000
malay 20000
malay 30000
malay 40000
malay 50000
malay 60000
CPU times: user 6.71 s, sys: 36 ms, total: 6.75 s
Wall time: 6.74 s


In [10]:
# Drop duplicate and empty lines.  dict.fromkeys keeps the first occurrence of
# each line in its original position, so the result is the same on every run;
# iterating a set() of strings can yield a different order per interpreter
# session, which made the exported dataset's row order non-reproducible.
id_wiki = [line for line in dict.fromkeys(id_wiki) if line]
ms_wiki = [line for line in dict.fromkeys(ms_wiki) if line]

In [11]:
# Append the Indonesian rows, then the Malay rows, keeping `sentences` and
# `langs` aligned.  Both lists are extended in place, so re-running this cell
# appends the same rows again.
for corpus, label in ((id_wiki, 'ind'), (ms_wiki, 'zlm')):
    sentences.extend(corpus)
    langs.extend([label] * len(corpus))

In [13]:
import numpy as np

# Display the distinct labels and the number of rows carrying each one.
np.unique(
    langs,
    return_counts=True,
)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([46910, 50000, 57327, 53692]))

In [14]:
# Character n-gram (3 to 5, within word boundaries) bag-of-words vocabulary,
# capped at 700000 features, learned from the assembled corpus.
bow_chars = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=700000,
)
bow_chars.fit(sentences)

# Remove the fitted `stop_words_` attribute before the vectorizer is pickled.
del bow_chars.stop_words_

In [15]:
# Persist the rebuilt corpus as a JSON object with parallel 'text' and
# 'label' lists.
with open('language-detection-data-v5.json', 'w') as dataset_file:
    dataset = {'text': sentences, 'label': langs}
    dataset_file.write(json.dumps(dataset))

In [16]:
# Serialise the fitted vectorizer to disk (binary mode is required by pickle).
with open('language-detection-vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(bow_chars, vectorizer_file)