In [1]:
import itertools

def _pad_sequence(
    sequence,
    n,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    sequence = iter(sequence)
    if pad_left:
        sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)
    if pad_right:
        sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))
    return sequence

def ngrams(
    sequence,
    n: int,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    sequence = _pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    history = []
    while n > 1:
        try:
            next_item = next(sequence)
        except StopIteration:
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]

In [2]:
list(ngrams(['saya', 'suka', 'makan', 'ayam'], 2))

[('saya', 'suka'), ('suka', 'makan'), ('makan', 'ayam')]

In [3]:
from glob import glob

In [4]:
languages = glob('*-words.json')
languages

['kelantan-words.json',
 'negeri-sembilan-words.json',
 'melaka-words.json',
 'johor-words.json',
 'pahang-words.json',
 'kedah-words.json',
 'perak-words.json',
 'terengganu-words.json',
 'english-words.json',
 'sabah-words.json',
 'sarawak-words.json']

In [5]:
import json

In [6]:
with open(languages[0]) as fopen:
    lang = set(json.load(fopen))
    
longest = 0
for l in lang:
    ls = len(l.split())
    if ls > longest:
        print(l)
        longest = ls
        
longest

wok lor
mahkamah tinggi ayoh kob
lorong tua sebutan bunyi dihidung
jjughuh baik jjughuh budok tu baik budak itu


8

In [7]:
with open('social-language.json') as fopen:
    malay = json.load(fopen)['malay']
    
len(malay)

7230902

In [8]:
words = []

languages = glob('*-words.json')
for language in languages:
    if 'english' in language:
        continue
    label = language.replace('-words.json', '')
    
    with open(language) as fopen:
        lang = json.load(fopen)
    words.append(lang)
    
len(words)

10

In [9]:
set.intersection(*map(set,words))

set()

In [10]:
with open('malays_word.json') as fopen:
    malays = set(json.load(fopen))

In [11]:
languages_dict = {}
longest = 0
languages = glob('*-words.json')
for language in languages:
    if 'english' in language:
        continue
    print(language)
    label = language.replace('-words.json', '')
    
    with open(language) as fopen:
        lang = set(json.load(fopen))
    
    print(len(lang))
    lang = lang - malays
    print(len(lang))

    
    for l in lang:
        ls = len(l.split())
        if ls > longest:
            print(l)
            longest = ls
            
    languages_dict[label] = lang

kelantan-words.json
503
420
wok lor
mahkamah tinggi ayoh kob
lorong tua sebutan bunyi dihidung
jjughuh baik jjughuh budok tu baik budak itu
negeri-sembilan-words.json
875
801
melaka-words.json
375
273
johor-words.json
215
171
pahang-words.json
294
201
kedah-words.json
2049
2001
perak-words.json
150
121
terengganu-words.json
351
283
sabah-words.json
124
64
sarawak-words.json
209
165


In [12]:
from tqdm import tqdm

results = {}

for s in tqdm(malay):
    splitted = s.split()
    ngs = splitted[:]
    for i in range(2, longest):
        ngs.extend([' '.join(n) for n in ngrams(splitted, i)])
    
    found = False
    for k, v in languages_dict.items():
        r = set(ngs) & v
        if len(r):
            # print(s, k, r)
            found = True
            if k in results:
                results[k].append(s)
            else:
                results[k] = [s]
            break
    
    if not found:
        if 'malay' in results:
            results['malay'].append(s)
        else:
            results['malay'] = [s]

100%|██████████| 7230902/7230902 [09:52<00:00, 12193.86it/s]


In [13]:
for k, v in results.items():
    print(k, len(v))

malay 7153125
kedah 36271
johor 2575
melaka 10393
terengganu 4839
sarawak 7401
negeri-sembilan 7717
kelantan 2438
perak 1296
pahang 3596
sabah 1251


In [14]:
from random import sample

results['malay'] = sample(results['malay'],100000)

In [15]:
with open('sublanguages.json', 'w') as fopen:
    json.dump(results, fopen)

In [16]:
# with open('sublanguages-malay.json', 'w') as fopen:
#     json.dump(malays, fopen)

In [17]:
import boto3

bucketName = 'malaya-dataset'
Key = 'sublanguages.json'
outPutname = "language-detection/sublanguages.json"

s3 = boto3.client('s3')
s3.upload_file(Key,bucketName,outPutname)