In [6]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
import re

# Regular Expressions and Word Tokenization

In [10]:
# Sample text used by every regex and tokenizer example in this section.
# Adjacent string literals are concatenated by Python into one string.
test_string = (
    "Tokugawa Ieyasu was the founder and first shogun of the Tokugawa shogunate of Japan, "
    "which ruled Japan from 1603 until the Meiji Restoration in 1868. "
    "He was one of the three 'Great Unifiers' of Japan, "
    "along with his former lord Oda Nobunaga and Toyotomi Hideyoshi."
)

In [11]:
# Character class used as the split delimiter: period, comma, question mark or exclamation mark.
# NOTE(review): a comma does not end a sentence, so splitting on this pattern also breaks the text at clause boundaries.
sentence_endings = r"[.,?!]"

This splits the text at every punctuation mark listed in `sentence_endings`. In this case the text is split at the commas and the periods; the empty string at the end of the result appears because the text ends with a period.

In [12]:
# Split the text at every delimiter in sentence_endings and print the resulting pieces.
print(re.compile(sentence_endings).split(test_string))

['Tokugawa Ieyasu was the founder and first shogun of the Tokugawa shogunate of Japan', ' which ruled Japan from 1603 until the Meiji Restoration in 1868', " He was one of the three 'Great Unifiers' of Japan", ' along with his former lord Oda Nobunaga and Toyotomi Hideyoshi', '']


In [13]:
# Find every capitalized word and print the result.
# [A-Z] matches one uppercase ASCII letter; \w+ then requires one or more word
# characters (letters, digits, underscore), so a one-letter word such as "I" is not matched.
capitalized_words = r"[A-Z]\w+"
print(re.compile(capitalized_words).findall(test_string))

['Tokugawa', 'Ieyasu', 'Tokugawa', 'Japan', 'Japan', 'Meiji', 'Restoration', 'He', 'Great', 'Unifiers', 'Japan', 'Oda', 'Nobunaga', 'Toyotomi', 'Hideyoshi']


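`re.findall` returns every capitalized word in the text. In this case these are mostly proper nouns: the names of the three unifiers, plus 'Japan', 'Meiji', 'Restoration', 'Great' and 'Unifiers'. The exception is 'He', which is capitalized only because it begins a sentence. A useful case for this pattern could be a first pass at finding proper nouns in English text.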

In [14]:
# \d+ matches each run of one or more consecutive digits.
digits = r"\d+"
print(re.compile(digits).findall(test_string))

['1603', '1868']


This extracts only the runs of digits that appear in the text. In this case, it returns only the two years that mark the beginning and the end of the Tokugawa shogunate.

## Tokenization

In [20]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize 
import nltk


In [21]:
# Download the 'punkt' data package into the local nltk_data directory.
# NOTE(review): presumably this is the data sent_tokenize/word_tokenize rely on; newer NLTK
# releases may ask for 'punkt_tab' instead -- confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\blasa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [22]:
# Split the full text into a list of sentence strings.
sent = sent_tokenize(test_string)

In [23]:
# Display the sentences found by sent_tokenize.
sent

['Tokugawa Ieyasu was the founder and first shogun of the Tokugawa shogunate of Japan, which ruled Japan from 1603 until the Meiji Restoration in 1868.',
 "He was one of the three 'Great Unifiers' of Japan, along with his former lord Oda Nobunaga and Toyotomi Hideyoshi."]

The `sent_tokenize` function breaks a string up into a list of sentences. Unlike the regex split above, it does not break at the commas.

In [26]:
# Tokenize only the second sentence (index 1; list indices start at 0).
tokenized_sent = word_tokenize(sent[1])

In [27]:
# Display the tokens; punctuation marks show up as tokens of their own.
tokenized_sent

['He',
 'was',
 'one',
 'of',
 'the',
 'three',
 "'Great",
 'Unifiers',
 "'",
 'of',
 'Japan',
 ',',
 'along',
 'with',
 'his',
 'former',
 'lord',
 'Oda',
 'Nobunaga',
 'and',
 'Toyotomi',
 'Hideyoshi',
 '.']

`word_tokenize` breaks the 2nd sentence of the test string down into word and punctuation tokens.

In [29]:
# Tokenize the whole text and wrap the result in a set so that each distinct token is kept once.
# A set has no defined order, so the displayed order carries no meaning.
unique_tokens = set(word_tokenize(test_string))

unique_tokens

Selects the unique tokens (words and punctuation marks) within a string.

## Finding Specific Tokens, Using REGEX Patterns

In [31]:
# re.search scans the string and returns a match object for the first occurrence
# of the pattern (or None when the pattern does not occur).
match = re.search('Tokugawa', test_string)

In [32]:
# Index of the first character of the match.
match.start()

0

In [33]:
# Index just past the last character of the match (the end is exclusive).
match.end()

8

`start()` and `end()` give the index positions where the searched word begins and ends in the string (the end index is exclusive).

### Regex Pattern

In [35]:
# The escaped brackets match a literal "[" and "]"; the ".*" between them is greedy,
# so a match runs from the first "[" to the last "]" on the same line.
pat1 = r"\[.*\]"

In [37]:
# Search the test string for bracketed text. test_string contains no "[" or "]",
# so the result here is None.
search = re.search(pat1, test_string)

In [38]:
# The value is None, so the notebook shows no output for this cell.
search

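Looks for any text enclosed in square brackets `[]`. The test string contains no brackets, so the search returns `None` and the cell above shows no output.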

In [42]:
# One uppercase ASCII letter (captured as group 1) followed by one or more word characters.
# NOTE(review): the name pattern2 is assigned a different pattern again in the Tweet Tokenizer section.
pattern2 = r"([A-Z])\w+"

In [43]:
# re.match only tries the pattern at the very start of the string, unlike re.search.
s = re.match(pattern2, sent[0])

In [44]:
# Display the match object (its span and the matched text).
s

<re.Match object; span=(0, 8), match='Tokugawa'>

# Tweet Tokenizer

## Pattern 1

In [47]:
from nltk.tokenize import regexp_tokenize 
from nltk.tokenize import TweetTokenizer

In [56]:
# This regex matches a hashtag: "#" followed by one or more word characters, regardless of length.
pattern1 = r"#\w+" 

In [49]:
# Three sample tweets used by the tokenizer examples in this section.
tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning',
    'Thanks @datacamp :) #nlp #python',
]

In [52]:
# Extract the hashtags from the first tweet only.
hashtags = regexp_tokenize(tweets[0], pattern1)

In [53]:
# Display the hashtags found in the first tweet.
hashtags

['#nlp', '#python']

In [54]:
 # Same extraction for the second tweet; the result is displayed, not stored.
 # NOTE(review): this cell is indented by one stray space. The recorded output shows IPython
 # accepted it, but run as a plain Python script it would be an unexpected-indent error --
 # consider removing the space.
 regexp_tokenize(tweets[1], pattern1)

['#NLP', '#learning']

The tweets in the `tweets` list are stored as individual strings. To extract the hashtags, you have to tokenize the tweets one at a time by indexing into the list (array), as in the cells above.

## Pattern 2 

In [58]:
# This regex matches any mention or hashtag: "@" or "#" followed by one or more word
# characters, regardless of length. The parentheses wrap the whole pattern in a capture group.
# NOTE(review): this overwrites the pattern2 assigned earlier in the Regex Pattern section.
pattern2 = r"([@#]\w+)"

In [59]:
# Extract the mentions and hashtags from the last tweet (index -1) and print them.
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)

['@datacamp', '#nlp', '#python']


In [60]:
# Index -1 counts from the end of the list, so this shows the last tweet.
tweets[-1] 

'Thanks @datacamp :) #nlp #python'

In [61]:
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

In [62]:
# Tokenizer instance created with no arguments, i.e. with its default settings.
tknzr = TweetTokenizer()

In [63]:
# Tokenize every tweet: the result holds one token list per tweet, in the same order as tweets.
all_tokens = list(map(tknzr.tokenize, tweets))

Creates one list of tokens for each of the tweets in `tweets`, collected together in a single outer list.

In [64]:
# Display the per-tweet token lists.
all_tokens

[['This',
  'is',
  'the',
  'best',
  '#nlp',
  'exercise',
  'ive',
  'found',
  'online',
  '!',
  '#python'],
 ['#NLP', 'is', 'super', 'fun', '!', '<3', '#learning'],
 ['Thanks', '@datacamp', ':)', '#nlp', '#python']]

## Emoji and Non-Ascii Tokenization

In [65]:
# Sample German text containing umlauts (ä, Ü) and two emoji, used for the non-ASCII examples.
german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'

In [66]:
# Tokenize the German text and print the tokens; the recorded output shows each emoji
# and each "?" as a separate token.
all_words = word_tokenize(german_text)
print(all_words)

['Wann', 'gehen', 'wir', 'Pizza', 'essen', '?', '🍕', 'Und', 'fährst', 'du', 'mit', 'Über', '?', '🚕']


In [67]:
# Tokenize and print only capitalized words.
# The character class lists all three German capital umlauts (Ä, Ö, Ü) next to A-Z, so
# words such as "Ärger" or "Öl" are matched as well as "Über". \w+ then requires at
# least one more word character, so a one-letter word is not matched.
capital_words = r"[A-ZÄÖÜ]\w+"
print(regexp_tokenize(german_text, capital_words))

['Wann', 'Pizza', 'Und', 'Über']


In [68]:
# Tokenize and print only emoji.
# All code-point ranges sit inside ONE character class. Do not separate them with quotes
# or "|": inside [...] those are literal characters, and the pattern would then also
# match apostrophes and pipe characters in the text.
# Ranges covered: U+1F300-1F5FF, U+1F600-1F64F, U+1F680-1F6FF, U+2600-26FF, U+2700-27BF.
emoji = "[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"
print(regexp_tokenize(german_text, emoji))

['🍕', '🚕']
