In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os
import re
import pickle
import json

# ASCII digits and common punctuation that get blanked out before vectorizing.
# Written as a raw string so that `\-` reaches the regex engine verbatim instead
# of being an invalid Python string escape (a warning today, an error later).
_NOISE_CHARS = re.compile(r'''[0-9!@#$%^&*()_\-+{}|~`'";:?/.>,<]''', flags=re.UNICODE)
# Runs of plain spaces only; tabs and newlines are deliberately left alone.
_SPACE_RUN = re.compile(r' +')


def clean_text(string):
    """Lower-case a sentence and strip ASCII digits and punctuation from it.

    Every character matched by ``_NOISE_CHARS`` is replaced with a space,
    consecutive spaces are collapsed to one, and leading/trailing whitespace
    is removed. Characters outside that set (non-ASCII letters, full-width
    digits and punctuation, brackets, ``=``, backslash) are kept.

    Args:
        string: The text to normalise.

    Returns:
        The normalised, lower-cased text.
    """
    without_noise = _NOISE_CHARS.sub(' ', string.lower())
    return _SPACE_RUN.sub(' ', without_noise).strip()

In [2]:
# Read the labelled corpus. JSON is UTF-8 by specification, so the encoding is
# pinned here rather than left to the platform default, which would mis-decode
# any non-ASCII text on systems whose default is not UTF-8.
with open('language-detection-data-v5.json', 'r', encoding='utf-8') as fopen:
    loaded = json.load(fopen)

# The file handle is no longer needed once the JSON is parsed, so the cleaning
# happens outside the `with` block.
sentences = [clean_text(text) for text in loaded['text']]
langs = loaded['label']

In [3]:
# Distinct label values in `langs` and how many times each one occurs.
np.unique(langs,return_counts=True)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([46910, 50000, 57327, 53692]))

In [4]:
# Load the vectorizer object from disk. Only `.transform` is called on it in the
# visible cells, so it is presumably already fitted -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('language-detection-vectorizer.pkl','rb') as fopen:
    bow_chars = pickle.load(fopen)

In [5]:
%%time
target = LabelEncoder().fit_transform(langs)
features = bow_chars.transform(sentences)
features.shape

CPU times: user 1min 22s, sys: 144 ms, total: 1min 22s
Wall time: 1min 22s


In [6]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that a fresh
# Restart & Run All produces the same split instead of a new random one.
train_X, test_X, train_Y, test_Y = train_test_split(
    features, target, test_size=0.2, random_state=42
)
# Drop the name of the full matrix now that both splits exist.
del features

In [7]:
from sklearn import metrics

In [8]:
# Dimensions of the training split of the feature matrix.
train_X.shape

(166343, 660726)

In [9]:
# Wrap each split, together with its labels, in XGBoost's DMatrix container.
train_d = xgb.DMatrix(data=train_X, label=train_Y)
test_d = xgb.DMatrix(data=test_X, label=test_Y)

In [10]:
# Booster settings: 4-class objective that outputs per-class probabilities,
# scored with multi-class log loss.
params_xgd = dict(
    min_child_weight=10.0,
    max_depth=7,
    objective='multi:softprob',
    max_delta_step=1.8,
    num_class=4,
    colsample_bytree=0.4,
    subsample=0.8,
    learning_rate=0.1,
    gamma=0.65,
    silent=True,
    eval_metric='mlogloss',
)

# NOTE(review): early stopping monitors test_d, the same split that the
# classification report for this model is computed on.
watchlist = [(test_d, 'validation')]

# Up to 10000 rounds; stop once the validation loss has not improved for 20
# rounds, logging every 5th round.
model = xgb.train(
    params_xgd,
    train_d,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=5,
)

[0]	validation-mlogloss:1.2413
Will train until validation-mlogloss hasn't improved in 20 rounds.
[5]	validation-mlogloss:0.761826
[10]	validation-mlogloss:0.512241
[15]	validation-mlogloss:0.364837
[20]	validation-mlogloss:0.272479
[25]	validation-mlogloss:0.213201
[30]	validation-mlogloss:0.174409
[35]	validation-mlogloss:0.14651
[40]	validation-mlogloss:0.126887
[45]	validation-mlogloss:0.11262
[50]	validation-mlogloss:0.101767
[55]	validation-mlogloss:0.093354
[60]	validation-mlogloss:0.086558
[65]	validation-mlogloss:0.080871
[70]	validation-mlogloss:0.076038
[75]	validation-mlogloss:0.071999
[80]	validation-mlogloss:0.068545
[85]	validation-mlogloss:0.065656
[90]	validation-mlogloss:0.062895
[95]	validation-mlogloss:0.060362
[100]	validation-mlogloss:0.058197
[105]	validation-mlogloss:0.056245
[110]	validation-mlogloss:0.05447
[115]	validation-mlogloss:0.05281
[120]	validation-mlogloss:0.051294
[125]	validation-mlogloss:0.049879
[130]	validation-mlogloss:0.048627
[135]	validation

In [11]:
# Predict per-class probabilities for the hold-out rows, limited to the trees
# selected by early stopping, then take the most probable class for each row.
probabilities = model.predict(xgb.DMatrix(test_X), ntree_limit=model.best_ntree_limit)
predicted = np.argmax(probabilities, axis=1)

# Class names are derived from the data (sorted unique labels, the order
# LabelEncoder assigns) instead of being hard-coded, so this report stays
# consistent with the other classification reports in this notebook.
print(metrics.classification_report(test_Y, predicted, target_names=np.unique(langs)))

              precision    recall  f1-score   support

       OTHER       0.98      0.99      0.99      9424
         eng       1.00      0.99      0.99      9972
         ind       1.00      0.99      0.99     11511
         zlm       1.00      1.00      1.00     10679

   micro avg       0.99      0.99      0.99     41586
   macro avg       0.99      0.99      0.99     41586
weighted avg       0.99      0.99      0.99     41586



In [12]:
%%time
model.predict(xgb.DMatrix(test_X[:10]),ntree_limit=model.best_ntree_limit)

CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 168 ms


array([[2.6191405e-08, 8.8543983e-10, 9.9999976e-01, 2.0418724e-07],
       [3.3177045e-13, 4.8105534e-15, 9.9999988e-01, 9.2294400e-08],
       [2.0692481e-07, 5.1412195e-12, 4.1463139e-08, 9.9999976e-01],
       [9.9652714e-01, 2.4567433e-03, 9.7234029e-04, 4.3783864e-05],
       [4.2808292e-11, 8.3007802e-11, 9.9999928e-01, 6.9759943e-07],
       [1.9842105e-04, 9.9978501e-01, 1.2235393e-05, 4.3988857e-06],
       [2.8214106e-02, 9.7169679e-01, 6.0329297e-05, 2.8760714e-05],
       [9.2271131e-07, 4.4209383e-11, 1.3530789e-07, 9.9999893e-01],
       [9.9976605e-01, 8.5635213e-05, 1.4085708e-04, 7.4979439e-06],
       [1.8502185e-05, 9.0518365e-10, 2.4503325e-07, 9.9998128e-01]],
      dtype=float32)

In [13]:
# Sample sentences; going by the variable names, one each for Chinese, English,
# Indonesian and Malay.
# NOTE(review): none of these four names is referenced by any other visible cell.
chinese_text = '今天是６月１８号，也是Muiriel的生日！'
english_text = 'i totally love it man'
indon_text = 'persemakmuran serikat terletak serikat bergabung terkenal terkenal penyulingan musik basket tingginya terletak referensi referensi studi master union'
malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'

In [14]:
# Persist the trained booster to disk.
with open('xgboost-language-detection.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [15]:
# Remove the names of the two DMatrix objects and the booster. The booster was
# pickled to disk in an earlier cell; presumably this is to release memory
# before the next model is trained.
del train_d, test_d, model

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier with default settings.
multinomial = MultinomialNB()
multinomial.fit(train_X, train_Y)

# Report on the data the model was fitted on.
nb_train_pred = multinomial.predict(train_X)
print(metrics.classification_report(train_Y, nb_train_pred, target_names=np.unique(langs)))

              precision    recall  f1-score   support

       OTHER       1.00      0.98      0.99     37486
         eng       0.99      1.00      1.00     40028
         ind       1.00      1.00      1.00     45816
         zlm       0.99      1.00      0.99     43013

   micro avg       0.99      0.99      0.99    166343
   macro avg       0.99      0.99      0.99    166343
weighted avg       0.99      0.99      0.99    166343



In [17]:
# Same report for the naive Bayes model, this time on the hold-out split.
nb_test_pred = multinomial.predict(test_X)
print(metrics.classification_report(test_Y, nb_test_pred, target_names=np.unique(langs)))

              precision    recall  f1-score   support

       OTHER       1.00      0.97      0.99      9424
         eng       0.99      1.00      0.99      9972
         ind       1.00      1.00      1.00     11511
         zlm       0.99      1.00      0.99     10679

   micro avg       0.99      0.99      0.99     41586
   macro avg       0.99      0.99      0.99     41586
weighted avg       0.99      0.99      0.99     41586



In [18]:
# Persist the fitted naive Bayes classifier to disk.
with open('multinomial-language-detection.pkl', 'wb') as model_file:
    pickle.dump(multinomial, model_file)

In [19]:
# Remove the name of the naive Bayes model; it was pickled to disk in the
# previous cell.
del multinomial

In [20]:
from sklearn.linear_model import SGDClassifier

In [21]:
# Linear classifier trained with SGD (modified Huber loss, elastic-net penalty).
# The seed is fixed because SGDClassifier shuffles the training data by
# default, so an unseeded fit yields different weights on every run.
sgd = SGDClassifier(loss='modified_huber', penalty='elasticnet', random_state=42)
sgd.fit(train_X, train_Y)

# Report on the data the model was fitted on.
print(metrics.classification_report(train_Y, sgd.predict(train_X), target_names=np.unique(langs)))



              precision    recall  f1-score   support

       OTHER       0.98      1.00      0.99     37486
         eng       1.00      0.99      0.99     40028
         ind       1.00      0.99      1.00     45816
         zlm       1.00      1.00      1.00     43013

   micro avg       0.99      0.99      0.99    166343
   macro avg       0.99      0.99      0.99    166343
weighted avg       0.99      0.99      0.99    166343



In [22]:
# Same report for the SGD model, this time on the hold-out split.
sgd_test_pred = sgd.predict(test_X)
print(metrics.classification_report(test_Y, sgd_test_pred, target_names=np.unique(langs)))

              precision    recall  f1-score   support

       OTHER       0.97      0.99      0.98      9424
         eng       0.99      0.99      0.99      9972
         ind       1.00      0.99      0.99     11511
         zlm       1.00      1.00      1.00     10679

   micro avg       0.99      0.99      0.99     41586
   macro avg       0.99      0.99      0.99     41586
weighted avg       0.99      0.99      0.99     41586



In [24]:
# Spot-check on one raw sentence. It is passed through clean_text first because
# the training sentences were cleaned before being vectorized; feeding raw text
# (capital letters, digits) would not match what the model was fitted on.
spot_check_text = 'Prakiraan Cuaca dan Tinggi Gelombang tgl 29 Des 2018 di wilayah Posko Terpadu Penanggulangan Bencana Tsunami Banten'
sgd.predict(bow_chars.transform([clean_text(spot_check_text)]))

array([0])

In [23]:
# Persist the fitted SGD classifier to disk.
with open('sgd-language-detection.pkl', 'wb') as model_file:
    pickle.dump(sgd, model_file)