In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import os
import pickle
import json
import re



In [2]:
with open('language_dict.json','r') as fopen:
    languages = json.load(fopen)

#### You can get the dataset from [here](https://tatoeba.org/eng/downloads)

In [3]:
lang = pd.read_csv('sentences.csv',sep='\t')
lang = lang.dropna()
lang.head()

Unnamed: 0,1,cmn,我們試試看！
0,2,cmn,我该去睡觉了。
1,3,cmn,你在干什麼啊？
2,4,cmn,這是什麼啊？
3,5,cmn,今天是６月１８号，也是Muiriel的生日！
4,6,cmn,生日快乐，Muiriel！


In [4]:
selected_langs = ['zlm','eng','ind']
lang.loc[~lang.cmn.isin(selected_langs),'cmn'] = 'OTHER'
selected_langs.append('OTHER')
sentences, langs = [], []
for i in selected_langs:
    filtered = lang.loc[lang.cmn == i]
    sentences += filtered.iloc[:80000,-1].tolist()
    langs += filtered.iloc[:80000,1].tolist()

In [5]:
del lang

In [6]:
np.unique(langs,return_counts=True)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([80000, 80000, 11808,    91]))

In [7]:
for file in ['negative','positive']:
    with open(file,'r') as fopen:
        bm = (' '.join(fopen.read().split('\n'))).split()
        new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] 
        sentences += new_langs
        langs += ['zlm'] * len(new_langs)

In [8]:
def simple_textcleaning_language_detection(string):
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = filter(None, string.split())
    string = [y.strip() for y in string if len(y) > 1]
    return ' '.join(string).lower()

In [9]:
bm = ''
for i in [i for i in os.listdir(os.getcwd()) if i.find('isu')>=0][:20]:
    with open(i,'r') as fopen:
        isu = json.load(fopen)
    bm += ' '.join([simple_textcleaning_language_detection(i['summary']) for i in isu if i['language']=='id'])
bm = bm.split()
new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] 
sentences += new_langs
langs += ['zlm'] * len(new_langs)

In [10]:
np.unique(langs,return_counts=True)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([80000, 80000, 11808, 73106]))

In [11]:
target = LabelEncoder().fit_transform(langs)
bow_chars = CountVectorizer(ngram_range=(2, 4), analyzer='char').fit(sentences)
vectors = bow_chars.transform(sentences)
vectors.shape

(244914, 783541)

In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)
del vectors

In [13]:
from sklearn import metrics

In [14]:
train_d = xgb.DMatrix(train_X, train_Y)
test_d = xgb.DMatrix(test_X, test_Y)
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': 4,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent':True,
    'eval_metric': 'mlogloss'
}
model = xgb.train(params_xgd, train_d, 10000, evals=[(test_d, 'validation')], 
                  early_stopping_rounds=100, verbose_eval=5)

[0]	validation-mlogloss:1.25802
Will train until validation-mlogloss hasn't improved in 100 rounds.
[5]	validation-mlogloss:0.797518
[10]	validation-mlogloss:0.559224
[15]	validation-mlogloss:0.411225
[20]	validation-mlogloss:0.314492
[25]	validation-mlogloss:0.250854
[30]	validation-mlogloss:0.207083
[35]	validation-mlogloss:0.175508
[40]	validation-mlogloss:0.151243
[45]	validation-mlogloss:0.133398
[50]	validation-mlogloss:0.119393
[55]	validation-mlogloss:0.108089
[60]	validation-mlogloss:0.098856
[65]	validation-mlogloss:0.090922
[70]	validation-mlogloss:0.084587
[75]	validation-mlogloss:0.079036
[80]	validation-mlogloss:0.073985
[85]	validation-mlogloss:0.069712
[90]	validation-mlogloss:0.065984
[95]	validation-mlogloss:0.062644
[100]	validation-mlogloss:0.059571
[105]	validation-mlogloss:0.056827
[110]	validation-mlogloss:0.054316
[115]	validation-mlogloss:0.051973
[120]	validation-mlogloss:0.049912
[125]	validation-mlogloss:0.04793
[130]	validation-mlogloss:0.046164
[135]	valid

[1165]	validation-mlogloss:0.014672
[1170]	validation-mlogloss:0.01467
[1175]	validation-mlogloss:0.014673
[1180]	validation-mlogloss:0.014663
[1185]	validation-mlogloss:0.014663
[1190]	validation-mlogloss:0.014656
[1195]	validation-mlogloss:0.014649
[1200]	validation-mlogloss:0.01464
[1205]	validation-mlogloss:0.014642
[1210]	validation-mlogloss:0.014632
[1215]	validation-mlogloss:0.014625
[1220]	validation-mlogloss:0.014623
[1225]	validation-mlogloss:0.01461
[1230]	validation-mlogloss:0.014602
[1235]	validation-mlogloss:0.014594
[1240]	validation-mlogloss:0.014596
[1245]	validation-mlogloss:0.014593
[1250]	validation-mlogloss:0.014595
[1255]	validation-mlogloss:0.014596
[1260]	validation-mlogloss:0.014595
[1265]	validation-mlogloss:0.014597
[1270]	validation-mlogloss:0.014601
[1275]	validation-mlogloss:0.014591
[1280]	validation-mlogloss:0.014583
[1285]	validation-mlogloss:0.014584
[1290]	validation-mlogloss:0.014589
[1295]	validation-mlogloss:0.014581
[1300]	validation-mlogloss:0.01

In [17]:
predicted = np.argmax(model.predict(xgb.DMatrix(test_X),ntree_limit=model.best_ntree_limit),axis=1)
print(metrics.classification_report(test_Y, predicted, target_names = ['OTHER', 'eng', 'ind', 'zlm']))

             precision    recall  f1-score   support

      OTHER       1.00      1.00      1.00     16045
        eng       1.00      1.00      1.00     16101
        ind       0.99      0.96      0.98      2378
        zlm       0.99      1.00      1.00     14459

avg / total       1.00      1.00      1.00     48983



In [21]:
%%time
model.predict(xgb.DMatrix(test_X[:10]),ntree_limit=model.best_ntree_limit)

CPU times: user 224 ms, sys: 0 ns, total: 224 ms
Wall time: 212 ms


array([[9.99844074e-01, 5.22631963e-05, 8.06017197e-05, 2.30375899e-05],
       [1.56041992e-07, 9.99998927e-01, 5.57112294e-08, 8.07711103e-07],
       [4.69850347e-05, 1.31818715e-05, 9.95305300e-01, 4.63460851e-03],
       [9.99860406e-01, 5.04074887e-05, 7.77397945e-05, 1.13682099e-05],
       [2.00512186e-05, 9.99937177e-01, 1.08563409e-05, 3.18840721e-05],
       [4.35217735e-05, 9.99954700e-01, 2.60621089e-08, 1.70737201e-06],
       [2.88806890e-09, 1.00000000e+00, 6.19760021e-10, 1.40123824e-08],
       [4.86070462e-07, 6.17368460e-06, 8.47673917e-04, 9.99145627e-01],
       [4.92698973e-07, 9.99999404e-01, 7.50260121e-09, 7.90154004e-08],
       [3.07888217e-06, 9.99996901e-01, 2.61773336e-09, 7.28362437e-08]],
      dtype=float32)

In [18]:
with open('xgboost-language-detection.pkl','wb') as fopen:
    pickle.dump(model,fopen)

In [19]:
with open('bow-xgboost-language-detection.pkl','wb') as fopen:
    pickle.dump(bow_chars,fopen)