This is an example of how to use the WebLangTagger.

In [1]:
# The web language tagger
from weblangtagger import WebLangTagger

# Text from estnltk
from estnltk import Text

# Regex
import regex as re

# For creating the compound token tagger
from estnltk.taggers import TokensTagger
from estnltk.taggers import CompoundTokenTagger

# The list of all the compound token tagger first level patterns
from estnltk.taggers.text_segmentation.compound_token_tagger import ALL_1ST_LEVEL_PATTERNS

INFO:utils.py:157: NumExpr defaulting to 4 threads.


WebLangTagger uses the 'compound_tokens' layer produced by CompoundTokenTagger as one of its input layers. CompoundTokenTagger is used to detect emoticons, web addresses, hashtags, usernames, email addresses and non-ending abbreviations.

In [2]:
# Creating a new pattern to detect emoticons
# This pattern, with a few modifications, is from Kristiina Toots' bachelor thesis "Netikeele metagraafia"
#
# The alternatives are assembled from adjacent raw string literals, so the
# line breaks and indentation of this cell are NOT part of the regular
# expression. A triple-quoted pattern compiled without the VERBOSE flag embeds
# them literally, which makes every alternative at a line end match only when
# it is followed by a newline and spaces. The VERBOSE flag is not an option
# either: the pattern contains unescaped '#' characters, which would start a
# comment in verbose mode.
#
# NOTE(review): the very last alternative (broken heart followed by `\)+`)
# only matches when a ')' follows the emoticon -- confirm this is intended.
emoticon_regex = (
    r"(\(:-\)+|:\)+|:o\)+|:\]+|:3+|:c\)+|:>+|=\]+|8\)+|=\)+|:\}+|:\^\)+"
    r"|:-D+|:D+|8-D+|8D+|x-D+|xD+|X-D+|XD+|=-D+|=D+|=-3+|=3+|B\^D+|>:\[+|:-\(+|:\(+"
    r"|:-c+|:c+|:-<+|:<+|:-\[+|:\[+|:\{+|;\(+|:-\|\|+|:@+|>:\(+|:'-\(+|:'\(+|:'-\)+|:'\)+"
    r"|D:<+|D8+|D;+|D=+|DX+|v\.v+|D-':+"
    r"|>:O+|:-O+|:O+|:-o+|:o+|8-0+|O_O+|o-o+|O_o+|o_O+|o_o+|O-O+|-\.-|:\*|:\^\*+"
    r"|\(\s'\}\{'\s\)|;-\)+|;\)+|\*-\)+|\*\)+|;-\]+|;\]+|;D+|;\^\)+|:-,+"
    r"|>:P+|:-P+|:P+|X-P+|x-p+|xp+|XP+|:-p+|:p+|=p+|:-b+|:b+|:d+|d:+|>:\\+|>:/+|:-/+|:-\.+|:/+"
    r"|:\\+|=/+|=\\+|:L+|=L+|:S+|>\.<+|:\|+|:-\|+|:\$+|:-X+|:X+|:-#+|:#+|O:-\)+"
    r"|0:-3+|0:-\)+|0:\)+|>:\)+|>;\)+|>:-\)+|\|;-\)+|\|-O+|%-\)+|%\)+|<3+|</3+\)+)"
)

new_emoticon_pattern = \
    { 'comment': '2.3) A pattern for capturing emoticons;',
      'example': ':)',
      'pattern_type': 'emoticon',
      '_regex_pattern_': re.compile(emoticon_regex),
      # Group 1 is the outer pair of parentheses, i.e. the whole emoticon
      '_group_': 1,
      '_priority_': (3, 0),
      # NOTE(review): given as the source text of a lambda; presumably
      # evaluated by CompoundTokenTagger -- confirm against estnltk.
      'normalized': r"lambda m: re.sub(r'\s' ,'' , m.group(1))",
    }

# Combining the built-in patterns with the new emoticon pattern.
# A new list is built instead of appending to ALL_1ST_LEVEL_PATTERNS: that list
# is imported from estnltk, so appending would change it for everything else
# that uses it and would add a duplicate pattern every time this cell is re-run.
patterns_with_emoticons = [*ALL_1ST_LEVEL_PATTERNS, new_emoticon_pattern]

# Creating a CompoundTokenTagger that uses the new pattern in addition to the old
# tag_hashtags_and_usernames=True is optional
new_compound_token_tagger = CompoundTokenTagger(patterns_1=patterns_with_emoticons,tag_hashtags_and_usernames=True)
new_compound_token_tagger

name,output layer,output attributes,input layers
CompoundTokenTagger,compound_tokens,"('type', 'normalized')","('tokens',)"

0,1
custom_abbreviations,()
ignored_words,set()
tag_numbers,True
tag_units,True
tag_email_and_www,True
tag_emoticons,True
tag_hashtags_and_usernames,True
tag_xml,True
tag_initials,True
tag_abbreviations,True


In [3]:
# Build the WebLangTagger.
# 'use_unknown_words' and 'use_missing_commas' are switched off for performance reasons.
weblang_tagger = WebLangTagger(
    use_unknown_words=False,
    use_missing_commas=False,
    use_punct_reps=True,
)
weblang_tagger

name,output layer,output attributes,input layers
WebLangTagger,weblang_tokens,"('type',)","('words', 'compound_tokens')"

0,1
use_unknown_words,False
use_emoticons,True
use_letter_reps,True
use_punct_reps,True
use_capital_letters,True
use_missing_commas,False
use_ignored_capital,True
use_no_spaces,True
use_incorrect_spaces,True
use_foreign_letters,True


In [13]:
# Example sentence that contains several web-language features
text = Text("Seee on NETIKEELE TUNNUSTE näitelause!!! 🙂 :)")

# The custom CompoundTokenTagger takes the 'tokens' layer as input,
# so that layer is created first
text.tag_layer(["tokens"])
new_compound_token_tagger.tag(text)

# WebLangTagger additionally needs the 'words' layer
text.tag_layer(["words"])

# Show what the custom compound token tagger found
text.compound_tokens

layer name,attributes,parent,enveloping,ambiguous,span count
compound_tokens,"type, normalized",,tokens,False,1

text,type,normalized
"[':', ')']",['emoticon'],:)


In [14]:
# Add the 'weblang_tokens' layer to the text
weblang_tagger.tag(text)

# Display the text together with an overview of its layers
text
Seee on NETIKEELE TUNNUSTE näitelause!!! 🙂 :)

layer name,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,9
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,8
weblang_tokens,type,,,True,5


In [15]:
# Show the web-language tokens that were detected
text.weblang_tokens

0,1
capital_letters,1
emojis,1
emoticons,1
foreign_letters,0
foreign_z_letters,0
ignored_capital,0
incorrect_spaces,0
letter_reps,1
missing_commas,0
no_spaces,0

layer name,attributes,parent,enveloping,ambiguous,span count
weblang_tokens,type,,,True,5

text,type
Seee,letter_reps
NETIKEELE TUNNUSTE,capital_letters
!!!,punct_reps
🙂,emoji
:),emoticons


In [19]:
# Per-feature counts stored in the metadata of the 'weblang_tokens' layer
text["weblang_tokens"].meta

{'punct_reps': 1,
 'letter_reps': 1,
 'no_spaces': 0,
 'capital_letters': 1,
 'foreign_letters': 0,
 'foreign_z_letters': 0,
 'ignored_capital': 0,
 'incorrect_spaces': 0,
 'emoticons': 1,
 'missing_commas': 0,
 'unknown_words': 0,
 'emojis': 1,
 'word_count': 8}