# 朴素贝叶斯完成语种检测

In [1]:
# Read the data.
# The context manager closes the file even if readlines() raises, which the
# previous open()/close() pair did not guarantee.
with open('data.csv') as in_f:
    lines = in_f.readlines()

# Each stripped line is sliced positionally: everything except the last three
# characters is the text, and the last two characters are the language code
# (i.e. one separator character sits between them -- presumably a comma,
# given the .csv extension; confirm against the file).
dataset = [(row[:-3], row[-2:]) for row in (line.strip() for line in lines)]

In [2]:
# Peek at the first five (text, language code) tuples built by the parsing step.
dataset[:5]

[('1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme',
  'nl'),
 ('1 mill\xc3\xb3n de afectados ante las inundaciones en sri lanka unicef est\xc3\xa1 distribuyendo ayuda de emergencia srilanka',
  'es'),
 ('1 mill\xc3\xb3n de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paraca\xc3\xaddas qu\xc3\xa9 har\xc3\xadas t\xc3\xba porunmillondefans',
  'es'),
 ('1 satellite galileo sottoposto ai test presso lesaestec nl galileo navigation space in inglese',
  'it'),
 ('10 der welt sind bei', 'de')]

In [3]:
from sklearn.model_selection import train_test_split
# Unzip the (text, label) pairs into two parallel tuples: x holds the texts,
# y holds the language codes.
x, y = zip(*dataset)
# Split into train and test parts; random_state=1 pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [4]:
# Number of documents held out for testing.
len(x_test)

2267

In [5]:
# Number of test labels; should match len(x_test) since both come from the same split.
len(y_test)

2267

In [6]:
# Number of documents available for training.
len(x_train)

6799

In [10]:
# Show the training labels. Written as a call so the cell runs on Python 3 as
# well as Python 2 (with a single argument both print the same text), matching
# the print(...) style used in the final cell.
print(y_train)

['it', 'de', 'de', 'es', 'it', 'es', 'de', 'de', 'en', 'es', 'it', 'it', 'it', 'nl', 'es', 'it', 'de', 'es', 'it', 'en', 'fr', 'de', 'it', 'it', 'de', 'de', 'fr', 'en', 'en', 'nl', 'es', 'it', 'nl', 'de', 'fr', 'fr', 'de', 'en', 'it', 'fr', 'de', 'en', 'en', 'fr', 'nl', 'nl', 'de', 'fr', 'fr', 'fr', 'es', 'fr', 'fr', 'es', 'de', 'it', 'es', 'it', 'es', 'de', 'nl', 'en', 'es', 'it', 'fr', 'it', 'nl', 'en', 'fr', 'de', 'de', 'fr', 'es', 'it', 'de', 'de', 'es', 'de', 'es', 'de', 'it', 'fr', 'de', 'fr', 'fr', 'it', 'es', 'es', 'en', 'fr', 'es', 'en', 'en', 'es', 'en', 'es', 'fr', 'nl', 'fr', 'de', 'en', 'nl', 'it', 'en', 'de', 'en', 'it', 'fr', 'fr', 'de', 'de', 'it', 'fr', 'nl', 'es', 'en', 'it', 'it', 'it', 'it', 'it', 'fr', 'es', 'nl', 'en', 'es', 'de', 'es', 'en', 'it', 'nl', 'de', 'en', 'es', 'fr', 'nl', 'it', 'de', 'it', 'de', 'nl', 'it', 'de', 'it', 'it', 'it', 'en', 'es', 'nl', 'es', 'en', 'en', 'es', 'nl', 'fr', 'fr', 'it', 'fr', 'es', 'nl', 'fr', 'es', 'it', 'en', 'fr', 'it', 'fr

In [7]:
import re

# Compiled once instead of on every call: remove_noise runs once per document
# when used as the vectorizer's preprocessor. Raw strings keep the regex
# escapes (\S, \w, ...) from being read as invalid Python string escapes.
_NOISE_PATTERN = re.compile("|".join([
    r"http\S+",  # URLs: "http" followed by a run of non-whitespace characters
    r"\@\w+",    # @mentions
    r"\#\w+",    # #hashtags
]))


def remove_noise(document):
    """Remove URLs, @mentions and #hashtags from a piece of text.

    Args:
        document: The raw text to clean.

    Returns:
        The text with every match of the noise pattern deleted and leading /
        trailing whitespace stripped. Whitespace left behind in the middle of
        the text by a removed token is kept as-is.
    """
    return _NOISE_PATTERN.sub("", document).strip()

In [8]:
# Sanity check: the sample contains an @mention, a #hashtag and a URL, all of
# which should be removed.
remove_noise("Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html")

'Trump images are now more popular than cat gifs.'

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-character-n-grams vectorizer used as the feature extractor.
# NOTE(review): per the scikit-learn docs a callable `preprocessor` overrides
# the built-in strip_accents/lowercase stage, so `lowercase=True` below most
# likely has no effect while `preprocessor=remove_noise` is set -- confirm, and
# lowercase inside remove_noise if case-folding is actually wanted.
vec = CountVectorizer(
    lowercase=True,     # lowercase the text (see NOTE above)
    analyzer='char_wb', # tokenise by character ngrams, taken within word boundaries
    ngram_range=(1,2),  # use ngrams of size 1 and 2
    max_features=1000,  # keep the most common 1000 ngrams
    preprocessor=remove_noise  # strip URLs, @mentions and #hashtags first
)
# Learn the n-gram vocabulary from the training texts only.
vec.fit(x_train)

def get_features(x):
    """Convert raw documents into n-gram count features.

    Args:
        x: An iterable of text documents.

    Returns:
        The result of ``vec.transform(x)``, where ``vec`` is the module-level
        vectorizer (it must already have been fitted).
    """
    # The result was previously computed and then discarded (no ``return``),
    # so every call evaluated to None.
    return vec.transform(x)

In [12]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
classifier.score(vec.transform(x_test), y_test)

0.9770621967357741

## 规范化，写为class

In [15]:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector(object):
    """Language classifier: n-gram counts fed into a pluggable classifier.

    Texts are cleaned of URLs, @mentions and #hashtags, turned into counts of
    the 1000 most frequent 1- and 2-grams by a ``CountVectorizer``, and passed
    to the wrapped classifier.

    NOTE(review): the vectorizer here uses CountVectorizer's default analyzer,
    whereas the exploratory cell earlier in this notebook passes
    ``analyzer='char_wb'`` -- confirm which one is intended.
    NOTE(review): per the scikit-learn docs a callable ``preprocessor``
    overrides the built-in lowercasing stage, so input is most likely not
    lowercased before tokenising -- confirm.
    """

    # Compiled once for the whole class rather than on every document. Raw
    # strings keep the regex escapes (\S, \w, ...) from being read as invalid
    # Python string escapes.
    _NOISE_PATTERN = re.compile("|".join([
        r"http\S+",  # URLs: "http" followed by a run of non-whitespace characters
        r"\@\w+",    # @mentions
        r"\#\w+",    # #hashtags
    ]))

    def __init__(self, classifier=None):
        """Create a detector.

        Args:
            classifier: Object exposing ``fit(X, y)``, ``predict(X)`` and
                ``score(X, y)``. When omitted, a fresh ``MultinomialNB`` is
                created for this instance.
        """
        # A default of ``MultinomialNB()`` in the signature is evaluated only
        # once, so every detector built without an argument would share (and
        # re-fit) the same classifier object. Build it per instance instead.
        if classifier is None:
            classifier = MultinomialNB()
        self.classifier = classifier
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            preprocessor=self._remove_noise,
        )

    def _remove_noise(self, document):
        """Return ``document`` with URLs, @mentions and #hashtags removed."""
        return self._NOISE_PATTERN.sub("", document)

    def features(self, X):
        """Transform an iterable of documents with the (fitted) vectorizer."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Fit the vectorizer on documents ``X``, then the classifier on their features and labels ``y``."""
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        """Predict the label of a single document ``x``.

        The document is wrapped in a one-element list before vectorising, so
        the return value is the classifier's prediction for a batch of one.
        """
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Return the classifier's ``score`` for documents ``X`` against labels ``y``."""
        return self.classifier.score(self.features(X), y)

In [16]:
# Load the labelled corpus. The context manager closes the file even if
# readlines() raises, which the previous open()/close() pair did not guarantee.
with open('data.csv') as in_f:
    lines = in_f.readlines()

# Each stripped line is sliced positionally: everything except the last three
# characters is the text, and the last two characters are the language code
# (one separator character in between -- presumably a comma; confirm).
dataset = [(row[:-3], row[-2:]) for row in (line.strip() for line in lines)]
x, y = zip(*dataset)
# random_state=1 pins the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Train on the training part, then try one sentence and report the score on
# the held-out part.
language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))

['en']
0.9770621967357741
