In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import os
import pickle
import json
import re



In [2]:
with open('language-detection-data-v3.json','r') as fopen:
    loaded = json.load(fopen)
    sentences = loaded['text']
    langs = loaded['label']

In [3]:
target = LabelEncoder().fit_transform(langs)
bow_chars = CountVectorizer(ngram_range=(1, 5), analyzer='char').fit(sentences)
features = bow_chars.transform(sentences)

In [4]:
train_X, test_X, train_Y, test_Y = train_test_split(features, target, test_size = 0.2)
del features

In [5]:
from sklearn import metrics

In [7]:
train_X.shape

(205973, 1604515)

In [8]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 4,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent':True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:1.25139
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.810296
[10]	validation-mlogloss:0.580655
[15]	validation-mlogloss:0.436803
[20]	validation-mlogloss:0.343211
[25]	validation-mlogloss:0.280427
[30]	validation-mlogloss:0.237192
[35]	validation-mlogloss:0.205339
[40]	validation-mlogloss:0.181497
[45]	validation-mlogloss:0.164338
[50]	validation-mlogloss:0.149914
[55]	validation-mlogloss:0.138163
[60]	validation-mlogloss:0.12843
[65]	validation-mlogloss:0.120969
[70]	validation-mlogloss:0.113923
[75]	validation-mlogloss:0.107904
[80]	validation-mlogloss:0.102263
[85]	validation-mlogloss:0.097207
[90]	validation-mlogloss:0.093166
[95]	validation-mlogloss:0.089176
[100]	validation-mlogloss:0.085699
[105]	validation-mlogloss:0.08254
[110]	validation-mlogloss:0.079482
[115]	validation-mlogloss:0.077007
[120]	validation-mlogloss:0.074483
[125]	validation-mlogloss:0.072032
[130]	validation-mlogloss:0.069583
[135]	valida

[1165]	validation-mlogloss:0.010408
[1170]	validation-mlogloss:0.010399
[1175]	validation-mlogloss:0.010379
[1180]	validation-mlogloss:0.010362
[1185]	validation-mlogloss:0.010341
[1190]	validation-mlogloss:0.01033
[1195]	validation-mlogloss:0.010304
[1200]	validation-mlogloss:0.010291
[1205]	validation-mlogloss:0.010273
[1210]	validation-mlogloss:0.010256
[1215]	validation-mlogloss:0.010244
[1220]	validation-mlogloss:0.010219
[1225]	validation-mlogloss:0.010206
[1230]	validation-mlogloss:0.010195
[1235]	validation-mlogloss:0.010181
[1240]	validation-mlogloss:0.01016
[1245]	validation-mlogloss:0.010144
[1250]	validation-mlogloss:0.010134
[1255]	validation-mlogloss:0.010116
[1260]	validation-mlogloss:0.010111
[1265]	validation-mlogloss:0.010098
[1270]	validation-mlogloss:0.010085
[1275]	validation-mlogloss:0.010077
[1280]	validation-mlogloss:0.010074
[1285]	validation-mlogloss:0.010062
[1290]	validation-mlogloss:0.010049
[1295]	validation-mlogloss:0.010041
[1300]	validation-mlogloss:0.0

[2310]	validation-mlogloss:0.009336
[2315]	validation-mlogloss:0.009334
[2320]	validation-mlogloss:0.009333
[2325]	validation-mlogloss:0.009331
[2330]	validation-mlogloss:0.009333
[2335]	validation-mlogloss:0.009332
[2340]	validation-mlogloss:0.009331
[2345]	validation-mlogloss:0.009332
[2350]	validation-mlogloss:0.009331
[2355]	validation-mlogloss:0.009332
[2360]	validation-mlogloss:0.009327
[2365]	validation-mlogloss:0.009327
[2370]	validation-mlogloss:0.009325
[2375]	validation-mlogloss:0.009324
[2380]	validation-mlogloss:0.009324
[2385]	validation-mlogloss:0.009323
[2390]	validation-mlogloss:0.009323
[2395]	validation-mlogloss:0.009323
[2400]	validation-mlogloss:0.009321
[2405]	validation-mlogloss:0.009315
[2410]	validation-mlogloss:0.009315
[2415]	validation-mlogloss:0.009318
[2420]	validation-mlogloss:0.009318
[2425]	validation-mlogloss:0.009314
[2430]	validation-mlogloss:0.009312
[2435]	validation-mlogloss:0.009308
[2440]	validation-mlogloss:0.009306
[2445]	validation-mlogloss:0

[3455]	validation-mlogloss:0.009109
[3460]	validation-mlogloss:0.009109
[3465]	validation-mlogloss:0.009109
[3470]	validation-mlogloss:0.00911
[3475]	validation-mlogloss:0.009108
[3480]	validation-mlogloss:0.009107
[3485]	validation-mlogloss:0.009106
[3490]	validation-mlogloss:0.009107
[3495]	validation-mlogloss:0.009105
[3500]	validation-mlogloss:0.009105
[3505]	validation-mlogloss:0.009105
[3510]	validation-mlogloss:0.009104
[3515]	validation-mlogloss:0.009103
[3520]	validation-mlogloss:0.009102
[3525]	validation-mlogloss:0.009101
[3530]	validation-mlogloss:0.009101
[3535]	validation-mlogloss:0.009097
[3540]	validation-mlogloss:0.009097
[3545]	validation-mlogloss:0.009097
[3550]	validation-mlogloss:0.009096
[3555]	validation-mlogloss:0.009096
[3560]	validation-mlogloss:0.009094
[3565]	validation-mlogloss:0.009092
[3570]	validation-mlogloss:0.009091
[3575]	validation-mlogloss:0.009092
[3580]	validation-mlogloss:0.009089
[3585]	validation-mlogloss:0.009087
[3590]	validation-mlogloss:0.

In [9]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['OTHER', 'eng', 'ind', 'zlm']))

             precision    recall  f1-score   support

      OTHER       1.00      1.00      1.00     15913
        eng       1.00      1.00      1.00     16014
        ind       1.00      1.00      1.00     11163
        zlm       1.00      0.99      1.00      8404

avg / total       1.00      1.00      1.00     51494



In [10]:
%%time
model.predict(xgb.DMatrix(test_X[:10]),ntree_limit=model.best_ntree_limit)

CPU times: user 472 ms, sys: 12 ms, total: 484 ms
Wall time: 471 ms


array([[5.6092135e-06, 9.9998343e-01, 1.0311571e-05, 6.8173944e-07],
       [9.9997509e-01, 7.8620596e-06, 1.3869825e-05, 3.2090541e-06],
       [4.3909775e-05, 9.9994111e-01, 2.7721569e-06, 1.2177001e-05],
       [9.9996281e-01, 1.1751860e-06, 2.6583111e-05, 9.4354264e-06],
       [8.1835212e-03, 9.9168515e-01, 5.1244333e-05, 8.0156526e-05],
       [3.4427110e-08, 3.7077743e-09, 8.6153857e-04, 9.9913836e-01],
       [9.9961674e-01, 3.2068553e-04, 5.8512782e-05, 4.0465679e-06],
       [2.9904972e-05, 9.9991071e-01, 3.2941181e-05, 2.6488804e-05],
       [9.9999285e-01, 2.7119893e-07, 4.9593837e-06, 1.9269939e-06],
       [2.1033115e-07, 9.9999583e-01, 2.6217469e-06, 1.3550665e-06]],
      dtype=float32)

In [26]:
chinese_text = '今天是６月１８号，也是Muiriel的生日！'
english_text = 'i totally love it man'
indon_text = 'berbicara dalam bahasa Indonesia membutuhkan teknologi yang baik untuk bekerja dengan baik, tetapi teknologi yang sulit didapat'
malay_text = 'beliau berkata program Inisitif Peduli Rakyat (IPR) yang diperkenalkan oleh kerajaan negeri Selangor lebih besar sumbangannya'

In [11]:
with open('xgboost-language-detection.pkl','wb') as fopen:
    pickle.dump(model,fopen)

In [12]:
with open('language-detection-vectorizer.pkl','wb') as fopen:
    pickle.dump(bow_chars,fopen)

In [13]:
del train_d, test_d, model

In [14]:
from sklearn.naive_bayes import MultinomialNB
multinomial = MultinomialNB().fit(train_X, train_Y)
print(metrics.classification_report(train_Y, multinomial.predict(train_X), target_names = np.unique(langs)))

             precision    recall  f1-score   support

      OTHER       1.00      0.99      1.00     64087
        eng       0.98      1.00      0.99     63986
        ind       0.97      0.99      0.98     43873
        zlm       0.99      0.93      0.96     34027

avg / total       0.98      0.98      0.98    205973



In [15]:
print(metrics.classification_report(test_Y, multinomial.predict(test_X), target_names = np.unique(langs)))

             precision    recall  f1-score   support

      OTHER       1.00      0.99      0.99     15913
        eng       0.97      1.00      0.99     16014
        ind       0.97      0.99      0.98     11163
        zlm       0.99      0.93      0.96      8404

avg / total       0.98      0.98      0.98     51494



In [16]:
with open('multinomial-language-detection.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)

In [17]:
del multinomial

In [18]:
from sklearn.linear_model import SGDClassifier

In [19]:
sgd = SGDClassifier(loss='modified_huber',penalty='elasticnet').fit(train_X, train_Y)
print(metrics.classification_report(train_Y, sgd.predict(train_X), target_names = np.unique(langs)))



             precision    recall  f1-score   support

      OTHER       0.99      1.00      1.00     64087
        eng       1.00      1.00      1.00     63986
        ind       0.99      1.00      0.99     43873
        zlm       1.00      0.98      0.99     34027

avg / total       0.99      0.99      0.99    205973



In [20]:
print(metrics.classification_report(test_Y, sgd.predict(test_X), target_names = np.unique(langs)))

             precision    recall  f1-score   support

      OTHER       0.99      1.00      1.00     15913
        eng       1.00      1.00      1.00     16014
        ind       0.99      0.99      0.99     11163
        zlm       0.99      0.97      0.98      8404

avg / total       0.99      0.99      0.99     51494



In [21]:
%%time
sgd.predict_proba(test_X[:10])

CPU times: user 44 ms, sys: 4 ms, total: 48 ms
Wall time: 42.3 ms


array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]])

In [None]:
with open('sgd-language-detection.pkl','wb') as fopen:
    pickle.dump(sgd,fopen)

In [29]:
sgd.predict(bow_chars.transform(['saya suka makan nasi']))

array([2])