In [1]:
%%time
import malaya
import fasttext

CPU times: user 3.95 s, sys: 741 ms, total: 4.69 s
Wall time: 4.1 s


## List the languages that can be detected

In [2]:
malaya.language_detection.label()

{0: 'eng', 1: 'ind', 2: 'malay', 3: 'manglish', 4: 'other', 5: 'rojak'}

In [3]:
# Sample inputs for the detectors used in this notebook, one per
# language / register being probed. Each literal stays on a single line so
# the exact text fed to the models is easy to see.

# Formal / standard-language samples.
english_text = 'i totally love it man'
malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'
indon_text = 'menjabat saleh perombakan menjabat periode komisi energi fraksi partai pengurus partai periode periode partai terpilih periode menjabat komisi perdagangan investasi persatuan periode'

# Informal social-media samples.
socialmedia_malay_text = 'nti aku tengok dulu tiket dari kl pukul berapa ada nahh'
socialmedia_indon_text = 'saking kangen papanya pas vc anakku nangis'

# Samples for the 'rojak' and 'manglish' labels
# (presumably code-mixed text -- confirm against the Malaya docs).
rojak_text = 'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k'
manglish_text = 'power lah even shopback come to edmw riao'

# A sample in a language that is not one of the named labels.
chinese_text = '今天是６月１８号，也是Muiriel的生日！'

## Load Fast-text model

Make sure fast-text is already installed; if not, simply run:

```bash
pip install fasttext
```

In this example, I am going to compare Malaya's model against the pretrained fastText language-identification model from Facebook: https://fasttext.cc/docs/en/language-identification.html

Simply download the pretrained model:

```bash
wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz
```

In [4]:
# Baseline for comparison: Facebook's pretrained language-identification
# model, loaded from the `lid.176.ftz` file in the working directory
# (the file fetched by the wget command shown earlier in this notebook).
model = fasttext.load_model('lid.176.ftz') 




In [5]:
# Malaya's own fast-text language detector. Not to be confused with `model`,
# which is Facebook's pretrained fastText baseline.
fast_text = malaya.language_detection.fasttext()




In [6]:
# With get_proba=True the result is a dict mapping every label to a score.
# NOTE(review): the recorded top score is marginally above 1.0 --
# presumably floating-point rounding; confirm.
fast_text.predict(indon_text,get_proba=True)

{'eng': 0.0,
 'ind': 1.0000107288360596,
 'malay': 0.0,
 'manglish': 0.0,
 'other': 0.0,
 'rojak': 0.0}

In [7]:
# Same input through the Facebook baseline. As the recorded output shows, it
# returns a (labels, probabilities) pair with labels of the form
# '__label__<code>'.
model.predict(indon_text)

(('__label__id',), array([0.40272361]))

In [8]:
fast_text.predict(malay_text,get_proba=True)

{'eng': 0.0,
 'ind': 0.0,
 'malay': 0.9999417066574097,
 'manglish': 0.0,
 'other': 0.0,
 'rojak': 0.0}

In [9]:
model.predict(malay_text)

(('__label__ms',), array([0.57101035]))

In [10]:
fast_text.predict(socialmedia_malay_text,get_proba=True)

{'eng': 0.0,
 'ind': 0.0,
 'malay': 0.9999960660934448,
 'manglish': 0.0,
 'other': 0.0,
 'rojak': 0.0}

In [11]:
model.predict(socialmedia_malay_text)

(('__label__id',), array([0.7870034]))

In [12]:
fast_text.predict(socialmedia_indon_text,get_proba=True)

{'eng': 0.0,
 'ind': 1.0000200271606445,
 'malay': 0.0,
 'manglish': 0.0,
 'other': 0.0,
 'rojak': 0.0}

In [13]:
model.predict(socialmedia_indon_text)

(('__label__fr',), array([0.2912012]))

In [14]:
fast_text.predict(rojak_text,get_proba=True)

{'eng': 0.0,
 'ind': 0.0,
 'malay': 0.0,
 'manglish': 0.0,
 'other': 0.0,
 'rojak': 0.9999275207519531}

In [15]:
model.predict(rojak_text)

(('__label__id',), array([0.87948251]))

In [16]:
fast_text.predict(manglish_text,get_proba=True)

{'eng': 0.0,
 'ind': 0.0,
 'malay': 0.0,
 'manglish': 1.000038981437683,
 'other': 0.0,
 'rojak': 0.0}

In [17]:
model.predict(manglish_text)

(('__label__en',), array([0.89707506]))

In [18]:
# Input in a language that has no dedicated label; in the recorded run the
# only non-zero score is for 'other'.
fast_text.predict(chinese_text, get_proba=True)

{'eng': 0.0,
 'ind': 0.0,
 'malay': 0.0,
 'manglish': 0.0,
 'other': 0.5427265167236328,
 'rojak': 0.0}

In [19]:
model.predict(chinese_text)

(('__label__zh',), array([0.97311586]))

In [20]:
# Batch variant: takes a list of strings and returns one result per input
# (the recorded results appear in the same order as the inputs).
fast_text.predict_batch([indon_text,malay_text],get_proba=True)

[{'eng': 0.0,
  'ind': 1.0000107288360596,
  'malay': 0.0,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0},
 {'eng': 0.0,
  'ind': 0.0,
  'malay': 0.9999417066574097,
  'manglish': 0.0,
  'other': 0.0,
  'rojak': 0.0}]

## Load Deep learning model

The deep learning model is slightly more accurate than the fast-text model, but it is around 50MB in size, while the fast-text model is only about 15MB. You can check the accuracy comparison here: https://malaya.readthedocs.io/en/latest/Accuracy.html#language-detection

In [21]:
# Load Malaya's deep-learning language detector. Per the log output of this
# cell, loading restores TensorFlow parameters from a local checkpoint.
deep = malaya.language_detection.deep_model()




Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.




INFO:tensorflow:Restoring parameters from /Users/huseinzolkepli/Malaya/language-detection/deep/model.ckpt


In [22]:
# Called without get_proba, predict returns only the predicted label string.
deep.predict(indon_text)




'ind'

In [23]:
deep.predict(malay_text)

'malay'

In [24]:
deep.predict_batch([indon_text,malay_text])

['ind', 'malay']

In [25]:
deep.predict(socialmedia_malay_text)

'malay'

In [26]:
deep.predict(socialmedia_indon_text)

'ind'

In [27]:
# get_proba=True returns a score for every label rather than only the
# predicted label.
deep.predict(rojak_text, get_proba=True)

{'eng': 7.730631e-08,
 'ind': 0.008739273,
 'malay': 0.00026563255,
 'manglish': 3.1339885e-05,
 'other': 7.3840456e-06,
 'rojak': 0.99095637}

In [29]:
deep.predict_batch([rojak_text, malay_text])

['rojak', 'malay']