# Language Detection

## Test data

In [1]:
import pandas as pd

In [2]:
# Labelled sample sentences: each record is (text, two-letter language code).
test = pd.DataFrame(
    data=[
        ("Hello world", "en"),
        ("Чудесный новый мир", "ru"),
        ("Objects passed to the function are Series objects whose index is either the DataFrame’s index (axis=0) or the DataFrame’s columns (axis=1).", "en"),
        ("Unter dem Namen Tagfalter werden Schmetterlinge aus verschiedenen Familien, die hauptsächlich tagsüber fliegen, zusammengefasst. Die Tagfalter im engeren Sinne bildet aber nur eine Gruppe von Familien, die tatsächlich alle nahe miteinander verwandt sind.", "de"),
    ],
    columns=["Text", "Language"],
)
test

Unnamed: 0,Text,Language
0,Hello world,en
1,Чудесный новый мир,ru
2,Objects passed to the function are Series obje...,en
3,Unter dem Namen Tagfalter werden Schmetterling...,de


## Using langid.py

[langid.py](https://github.com/saffsd/langid.py) is a standalone Language Identification (LangID) tool.

In [3]:
!pip install langid
import langid



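`langid.classify` returns a `(language, score)` tuple. The score is an unnormalized log-probability estimate for the language, so it is negative and is only meaningful for comparing candidate languages on the same text.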

In [4]:
# Classify the first sample (short English text).
langid.classify(test.loc[0, 'Text'])

('en', -23.719746112823486)

In [5]:
# Classify the second sample (short Russian text).
langid.classify(test.loc[1, 'Text'])

('ru', -354.3220765590668)

## Using stop-words

[Original post](http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/)

In [6]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

In [7]:
def _calculate_languages_ratios(text):
    """
    Score the given text against every stopword list available in NLTK.

    Despite the name, the values are raw counts rather than ratios or
    probabilities: for each language the score is the number of distinct
    stopwords of that language that occur in the text, giving a dictionary
    that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique
             stopwords of that language seen in the analyzed text
    @rtype: dict
    """
    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
    #
    # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    #
    # The set of lower-cased tokens does not depend on the language, so it is
    # built once here instead of once per stopword list.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    # One entry per language known to the stopwords corpus, in fileids() order;
    # the score is the size of the overlap between the text and that list.
    return {
        language: len(words_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }

def detect_language(text):
    """
    Guess the language of a text from the stopwords it contains.

    Every language is scored by how many of its unique stopwords appear in
    the text, and the language with the highest score wins.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the best-scoring language, or 'unknown' when no
             stopword of any language was found in the text
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)

    # On ties, max() keeps the first best-scoring language in dictionary order.
    best_language = max(scores, key=scores.get)

    # A top score of zero means the text matched no stopword list at all.
    if scores[best_language] == 0:
        return 'unknown'
    return best_language

In [8]:
# Stop-word detection on the first sample (short English text).
detect_language(test.loc[0, 'Text'])

'unknown'

In [9]:
# Stop-word detection on the second sample (short Russian text).
detect_language(test.loc[1, 'Text'])

'unknown'

## Test

In [11]:
# Run both detectors over every sample and put their answers next to the
# ground-truth label. The langid.py column keeps the full classify() result.
df = (
    test
    .assign(
        m2=test['Text'].apply(langid.classify),
        m3=test['Text'].apply(detect_language),
    )
    .rename(columns={
        'm2': 'Classified by langid.py',
        'm3': 'Classified by stop-words',
    })
)

In [12]:
# Display the comparison table built in the previous cell.
df

Unnamed: 0,Text,Language,Classified by langid.py,Classified by stop-words
0,Hello world,en,"(en, -23.719746112823486)",unknown
1,Чудесный новый мир,ru,"(ru, -354.3220765590668)",unknown
2,Objects passed to the function are Series obje...,en,"(en, -291.201434135437)",english
3,Unter dem Namen Tagfalter werden Schmetterling...,de,"(de, -799.5419707298279)",german
