In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import os
import pickle
import json
import re



In [2]:
with open('language_dict.json','r') as fopen:
    languages = json.load(fopen)

#### You can get the dataset from [here](https://tatoeba.org/eng/downloads)

In [3]:
lang = pd.read_csv('sentences.csv',sep='\t')
lang = lang.dropna()
lang.head()

Unnamed: 0,1,cmn,我們試試看！
0,2,cmn,我该去睡觉了。
1,3,cmn,你在干什麼啊？
2,4,cmn,這是什麼啊？
3,5,cmn,今天是６月１８号，也是Muiriel的生日！
4,6,cmn,生日快乐，Muiriel！


In [4]:
selected_langs = ['zlm','eng','ind']
lang.loc[~lang.cmn.isin(selected_langs),'cmn'] = 'OTHER'
selected_langs.append('OTHER')
sentences, langs = [], []
for i in selected_langs:
    filtered = lang.loc[lang.cmn == i]
    sentences += filtered.iloc[:100000,-1].tolist()
    langs += filtered.iloc[:100000,1].tolist()

In [5]:
del lang

In [6]:
np.unique(langs,return_counts=True)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([100000, 100000,  11808,     91]))

In [7]:
for file in ['negative','positive']:
    with open(file,'r') as fopen:
        bm = (' '.join(fopen.read().split('\n'))).split()
        new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] 
        sentences += new_langs
        langs += ['zlm'] * len(new_langs)

In [8]:
def simple_textcleaning_language_detection(string):
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = filter(None, string.split())
    string = [y.strip() for y in string if len(y) > 1]
    return ' '.join(string).lower()

In [9]:
bm = ''
for i in [i for i in os.listdir(os.getcwd()) if i.find('isu')>=0]:
    with open(i,'r') as fopen:
        isu = json.load(fopen)
    bm += ' '.join([simple_textcleaning_language_detection(i['summary']) for i in isu if i['language']=='id'])
bm = bm.split()
new_langs = [' '.join(bm[i:i+4]) for i in range(0, len(bm), 4)] 
sentences += new_langs
langs += ['zlm'] * len(new_langs)

In [10]:
np.unique(langs,return_counts=True)

(array(['OTHER', 'eng', 'ind', 'zlm'], dtype='<U5'),
 array([100000, 100000,  11808,  94281]))

In [11]:
target = LabelEncoder().fit_transform(langs)
bow_chars = CountVectorizer(ngram_range=(2, 4), analyzer='char').fit(sentences)
vectors = bow_chars.transform(sentences)
vectors.shape

(306089, 934458)

In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2)
del vectors

In [13]:
from sklearn import metrics

In [14]:
multinomial = MultinomialNB().fit(train_X, train_Y)
print(metrics.classification_report(train_Y, multinomial.predict(train_X), target_names = np.unique(langs)))

             precision    recall  f1-score   support

      OTHER       1.00      0.99      0.99     79852
        eng       0.98      1.00      0.99     80065
        ind       0.95      0.52      0.67      9458
        zlm       0.94      0.99      0.97     75496

avg / total       0.97      0.97      0.97    244871



In [15]:
print(metrics.classification_report(test_Y, multinomial.predict(test_X), target_names = np.unique(langs)))

             precision    recall  f1-score   support

      OTHER       1.00      0.98      0.99     20148
        eng       0.98      1.00      0.99     19935
        ind       0.91      0.49      0.64      2350
        zlm       0.94      0.99      0.97     18785

avg / total       0.97      0.97      0.97     61218



In [16]:
with open('multinomial-language-detection.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)

In [17]:
with open('bow-language-detection.pkl','wb') as fopen:
    pickle.dump(bow_chars,fopen)