# Detecting Text Language by Counting Stop Words

Stop words are words that are filtered out before processing because they are mostly grammatical rather than semantic in nature, e.g. search engines remove words like 'the' and 'to'.

# 1. Tokenizing

In [1]:
import sys
from nltk.tokenize import wordpunct_tokenize 
# RE-based tokenizer which splits text on whitespace and punctuation (except for underscore)

In [2]:
# Informal English sample sentence that the following cells tokenize and classify.
text = (
    "Yo man, it's time for you to shut yo' mouth! "
    "I ain't even messin' dawg."
)

In [3]:
# Tokenize the sample sentence; the bare name on the last line displays the resulting list.
test_tokens = wordpunct_tokenize(text)
test_tokens

['Yo',
 'man',
 ',',
 'it',
 "'",
 's',
 'time',
 'for',
 'you',
 'to',
 'shut',
 'yo',
 "'",
 'mouth',
 '!',
 'I',
 'ain',
 "'",
 't',
 'even',
 'messin',
 "'",
 'dawg',
 '.']

There are other tokenizers, e.g. RegexpTokenizer (where you can supply your own regexp), WhitespaceTokenizer (similar to Python's str.split()) and BlanklineTokenizer.

# 2. Exploring NLTK's stop words corpus

NLTK comes with a corpus of stop words in various languages.

In [4]:
from nltk.corpus import stopwords
# The readme is raw text, so replace its newlines with spaces to make it readable on one line.
stopwords.readme().replace('\n', ' ') 

'Stopwords Corpus  This corpus contains lists of stop words for several languages.  These are high-frequency grammatical words which are usually ignored in text retrieval applications.  They were obtained from: http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/  The stop words for the Romanian language were obtained from: http://arlc.ro/resources/  The English list has been augmented https://github.com/nltk/nltk_data/issues/22  The German list has been corrected https://github.com/nltk/nltk_data/pull/49  A Kazakh list has been added https://github.com/nltk/nltk_data/pull/52  A Nepali list has been added https://github.com/nltk/nltk_data/pull/83  An Azerbaijani list has been added https://github.com/nltk/nltk_data/pull/100  A Greek list has been added https://github.com/nltk/nltk_data/pull/103  An Indonesian list has been added https://github.com/nltk/nltk_data/pull/112 '

In [5]:
# Most corpora consist of a set of files, each containing a piece of text.
# fileids() returns the list of identifiers for those files; for this corpus
# the output shows one identifier per language.
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

Corpus readers provide a variety of methods to read data from the corpus:

In [9]:
# raw() returns the file's contents as a single string; the output shows one stop word per line.
stopwords.raw('french')

'au\naux\navec\nce\nces\ndans\nde\ndes\ndu\nelle\nen\net\neux\nil\nje\nla\nle\nleur\nlui\nma\nmais\nme\nmême\nmes\nmoi\nmon\nne\nnos\nnotre\nnous\non\nou\npar\npas\npour\nqu\nque\nqui\nsa\nse\nses\nson\nsur\nta\nte\ntes\ntoi\nton\ntu\nun\nune\nvos\nvotre\nvous\nc\nd\nj\nl\nà\nm\nn\ns\nt\ny\nété\nétée\nétées\nétés\nétant\nétante\nétants\nétantes\nsuis\nes\nest\nsommes\nêtes\nsont\nserai\nseras\nsera\nserons\nserez\nseront\nserais\nserait\nserions\nseriez\nseraient\nétais\nétait\nétions\nétiez\nétaient\nfus\nfut\nfûmes\nfûtes\nfurent\nsois\nsoit\nsoyons\nsoyez\nsoient\nfusse\nfusses\nfût\nfussions\nfussiez\nfussent\nayant\nayante\nayantes\nayants\neu\neue\neues\neus\nai\nas\navons\navez\nont\naurai\nauras\naura\naurons\naurez\nauront\naurais\naurait\naurions\nauriez\nauraient\navais\navait\navions\naviez\navaient\neut\neûmes\neûtes\neurent\naie\naies\nait\nayons\nayez\naient\neusse\neusses\neût\neussions\neussiez\neussent\n'

In [10]:
# Same content, with newlines replaced by spaces so it is easier to read.
stopwords.raw('french').replace('\n', ' ')

'au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l à m n s t y été étée étées étés étant étante étants étantes suis es est sommes êtes sont serai seras sera serons serez seront serais serait serions seriez seraient étais était étions étiez étaient fus fut fûmes fûtes furent sois soit soyons soyez soient fusse fusses fût fussions fussiez fussent ayant ayante ayantes ayants eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent '

In [13]:
# words() returns the stop words as a list; show only the first 20 entries.
stopwords.words('english')[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

Many corpus readers also provide .sents(), which returns the text as sentences. However, in our particular case, calling it will cause an error:

In [14]:
# Demonstrate that the stopwords reader has no .sents() method. The expected
# AttributeError is caught and printed so that "Restart & Run All" can continue
# past this cell instead of stopping here.
try:
    stopwords.sents('english')
except AttributeError as err:
    print("AttributeError: {}".format(err))

AttributeError: 'WordListCorpusReader' object has no attribute 'sents'

The error occurs because the stopwords corpus reader is of type WordListCorpusReader, which holds plain word lists, so there are no sentences. The same applies to .paras().

In [16]:
# Count the English stop words (179 in the corpus version used here).
len(stopwords.words('english'))

179

In [17]:
# Use the file ID exactly as fileids() lists it ('greek', lowercase). The
# capitalized 'Greek' is not a listed file ID and presumably only resolves on a
# case-insensitive file system -- TODO confirm on the target platform.
len(stopwords.words('greek'))  # 265 Greek stop words in the corpus version used here

265

In [18]:
# Several file IDs can be passed at once; the total equals the sum of the two lists (179 + 265).
len(stopwords.words(['english', 'greek'])) # There are 444 English and Greek stop words in total

444

# 3. The classification

We loop through the stop word lists of all languages and check how many stop words of each language our test text contains. The text is then classified as being written in the language for which it contains the most stop words.

In [23]:
# Per-language scores are collected in this dict by the scoring loop.
language_ratios = dict()

# Lowercase every token so it can match the lowercase stop-word entries,
# then de-duplicate: only whether a word occurs matters, not how often.
test_words = [token.lower() for token in test_tokens]
test_words_set = set(test_words)
test_words_set

{'!',
 "'",
 ',',
 '.',
 'ain',
 'dawg',
 'even',
 'for',
 'i',
 'it',
 'man',
 'messin',
 'mouth',
 's',
 'shut',
 't',
 'time',
 'to',
 'yo',
 'you'}

In [24]:
# Score each language by the number of distinct text tokens that also appear
# in that language's stop-word list.
for language in stopwords.fileids():
    # For some languages, e.g. Russian, it would be wise to tokenize the stop
    # words by punctuation too.
    stopwords_set = set(stopwords.words(language))
    common_elements = test_words_set & stopwords_set
    language_ratios[language] = len(common_elements)  # language "score"

In [25]:
# Let's view the scores: each value is a count of matching stop words, not a ratio.
language_ratios

{'arabic': 0,
 'azerbaijani': 0,
 'danish': 3,
 'dutch': 0,
 'english': 8,
 'finnish': 0,
 'french': 2,
 'german': 1,
 'greek': 0,
 'hungarian': 1,
 'indonesian': 0,
 'italian': 1,
 'kazakh': 0,
 'nepali': 0,
 'norwegian': 3,
 'portuguese': 1,
 'romanian': 2,
 'russian': 0,
 'spanish': 1,
 'swedish': 2,
 'turkish': 0}

In [26]:
# max() iterates over the dict's keys (the language names) and ranks each one by
# the value its key function returns -- here, that language's score. The result
# is therefore the language name with the highest score.
most_rated_language = max(language_ratios, key=lambda name: language_ratios[name])
most_rated_language

'english'

In [27]:
# Show which stop words of the top-scoring language were found in the text.
test_words_set & set(stopwords.words(most_rated_language))

{'ain', 'for', 'i', 'it', 's', 't', 'to', 'you'}