# NLTK

#### Install NLTK

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Download models or corpora

In [None]:
# NOTE: called without package arguments, this opens NLTK's interactive text
# downloader and waits for keyboard input (enter `q` at the prompt to quit).
!python -m nltk.downloader

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q'
Command "q'" unrecognized

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


#### Import and use

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
tweet = "@elonmusk digital currency is the future! :) #dogecoin"

In [None]:
query = 'future'

In [None]:
tweet.find(query) # Not great solution for searching for a term... (What if I looked for "dig"?)

34

#### Tokenization

In [None]:
tweet.split() # Also not great...

['@elonmusk', 'digital', 'currency', 'is', 'the', 'future!', ':)', '#dogecoin']

In [None]:
["future" in tweet.split()] # Also not great...

[False]

In [None]:
nltk.word_tokenize(tweet) # Bingo!

['@',
 'elonmusk',
 'digital',
 'currency',
 'is',
 'the',
 'future',
 '!',
 ':',
 ')',
 '#',
 'dogecoin']

In [None]:
[query in nltk.word_tokenize(tweet)] # Bingo!


[True]

#### More options in customizing your tokenizer

In [None]:
nltk.word_tokenize(tweet, language='spanish')

['@',
 'elonmusk',
 'digital',
 'currency',
 'is',
 'the',
 'future',
 '!',
 ':',
 ')',
 '#',
 'dogecoin']

In [None]:
from nltk.tokenize import RegexpTokenizer
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]+', discard_empty=True)

In [None]:
custom_tokenizer.tokenize(tweet)

['elonmusk', 'digital', 'currency', 'is', 'the', 'future', 'dogecoin']

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [None]:
tweet_tokenizer.tokenize(tweet)

['digital', 'currency', 'is', 'the', 'future', '!', ':)', '#dogecoin']

In [None]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('digital', 'currency'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['digital_currency', 'is', 'the', 'future', '!', ':)', '#dogecoin']

In [None]:
query = 'digital'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

### Normalization

In [None]:
tweet.lower()

'@elonmusk digital currency is the future! :) #dogecoin'

In [None]:
import re
import string
import unicodedata

def normalize_tokens(tokenized_text):
    """Lowercase, filter and accent-fold a sequence of tokens.

    Steps applied to each token, in order:
      1. lowercase it;
      2. drop it if it is a hashtag (starts with ``#``);
      3. drop it if it is empty or a single punctuation character;
      4. strip diacritics (``ă`` -> ``a``, ``ș`` -> ``s``, ``é`` -> ``e``, ...).

    Multi-character tokens such as the emoticon ``:)`` are kept.

    Args:
        tokenized_text: iterable of string tokens.

    Returns:
        A new list with the normalized tokens; the input is not modified.
    """
    punctuation = frozenset(string.punctuation)
    normalized = []
    for token in tokenized_text:
        token = token.lower()
        if token.startswith('#'):
            continue
        # Set membership rather than `token in string.punctuation`: the latter
        # is a substring test and would also discard tokens like '<=>' or '()'.
        if not token or token in punctuation:
            continue
        # Decompose accented letters, drop the combining marks, then recompose
        # so scripts that merely decompose (e.g. Hangul) come back unchanged.
        decomposed = unicodedata.normalize('NFD', token)
        stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        normalized.append(unicodedata.normalize('NFC', stripped))

    return normalized

In [None]:
romanian_query = 'monedă digitală'
normalize_tokens(tweet_tokenizer.tokenize(romanian_query))

['moneda', 'digitala']

In [None]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import unidecode
unidecode.unidecode(romanian_query)

'moneda digitala'

In [None]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
print(normalized_tweet)

['digital', 'currency', 'is', 'the', 'future', ':)']


#### Stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
stopwords.words('romanian')

['a',
 'abia',
 'acea',
 'aceasta',
 'această',
 'aceea',
 'aceeasi',
 'acei',
 'aceia',
 'acel',
 'acela',
 'acelasi',
 'acele',
 'acelea',
 'acest',
 'acesta',
 'aceste',
 'acestea',
 'acestei',
 'acestia',
 'acestui',
 'aceşti',
 'aceştia',
 'adica',
 'ai',
 'aia',
 'aibă',
 'aici',
 'al',
 'ala',
 'ale',
 'alea',
 'alt',
 'alta',
 'altceva',
 'altcineva',
 'alte',
 'altfel',
 'alti',
 'altii',
 'altul',
 'am',
 'anume',
 'apoi',
 'ar',
 'are',
 'as',
 'asa',
 'asta',
 'astea',
 'astfel',
 'asupra',
 'atare',
 'atat',
 'atata',
 'atatea',
 'atatia',
 'ati',
 'atit',
 'atita',
 'atitea',
 'atitia',
 'atunci',
 'au',
 'avea',
 'avem',
 'aveţi',
 'avut',
 'aş',
 'aţi',
 'ba',
 'ca',
 'cam',
 'cand',
 'care',
 'careia',
 'carora',
 'caruia',
 'cat',
 'catre',
 'ce',
 'cea',
 'ceea',
 'cei',
 'ceilalti',
 'cel',
 'cele',
 'celor',
 'ceva',
 'chiar',
 'ci',
 'cind',
 'cine',
 'cineva',
 'cit',
 'cita',
 'cite',
 'citeva',
 'citi',
 'citiva',
 'cu',
 'cui',
 'cum',
 'cumva',
 'cât',
 'câte

In [None]:
# Build the stop-word collection once: calling stopwords.words() inside the
# comprehension re-evaluates it (and rebuilds the list) for every token, and a
# set gives constant-time membership tests instead of a linear list scan.
english_stopwords = set(stopwords.words("english"))
cleaned_tweet = [t for t in normalized_tweet if t not in english_stopwords]
print(cleaned_tweet)

['digital', 'currency', 'future', ':)']


#### Stemming / Lemmatization


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# WordNet and OpenMultilingualWordnet necessary for lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in normalized_tweet]

['digit', 'currenc', 'is', 'the', 'futur', ':)']

In [None]:
stemmer = SnowballStemmer(language='english') # This one has support for Romanian too!

[stemmer.stem(t) for t in normalized_tweet]

['digit', 'currenc', 'is', 'the', 'futur', ':)']

In [None]:
lemmatizer = WordNetLemmatizer() # Doesn't have support for Romanian ... see Spacy instead

[lemmatizer.lemmatize(t) for t in normalized_tweet]

['digital', 'currency', 'is', 'the', 'future', ':)']

#### Combine with part-of-speech tagging!

In [None]:
# Trained tagger needed for POS-tagging:
nltk.download('averaged_perceptron_tagger')

nltk.pos_tag(normalized_tweet)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('digital', 'JJ'),
 ('currency', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('future', 'NN'),
 (':)', 'NN')]

In [None]:

from nltk.corpus import wordnet as wn

# First letter of a POS tag (e.g. 'JJ' -> 'J') mapped to the WordNet POS
# constant passed to the lemmatizer.
tag_map = {
    'J': wn.ADJ,
    'V': wn.VERB,
    'R': wn.ADV,
    'N': wn.NOUN,
}


def get_lemmas(tokenized_text):
    """Return the lemma of every token, using its POS tag to guide lemmatization.

    The tokens are tagged with ``nltk.pos_tag``; the first letter of each tag
    is looked up in ``tag_map`` (falling back to ``wn.NOUN`` when it is not
    listed) and the result is passed as ``pos`` to the module-level
    ``lemmatizer``.

    Args:
        tokenized_text: list of string tokens.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(tokenized_text):
        wordnet_pos = tag_map.get(tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [None]:
query = "digital currencies"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['digital', 'currencies']


In [None]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['digital', 'currency', 'be', 'the', 'future', ':)']
['digital', 'currency']


In [None]:
# Now we can properly match the two!

print(set(lemmatized_tweet).intersection(lemmatized_query))

{'currency', 'digital'}


#### Sentence segmentation

In [None]:
query = "I am too fast. I am too furious."

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [None]:
# Use the public sent_tokenize API with language='spanish' instead of loading
# the pickled model by its internal path ('tokenizers/punkt/PY3/spanish.pickle'),
# which depends on how the downloaded punkt package is laid out on disk.
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
nltk.sent_tokenize(spanish_query, language='spanish')

['Soy muy rápido!', 'Estoy muy furioso!']

In [None]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [None]:
from nltk.tokenize import PunktSentenceTokenizer
PunktSentenceTokenizer??

## spaCy

In [None]:
%%bash
pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!python -m spacy download en_core_web_sm

2023-01-30 17:12:09.837250: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


#### Define the model

In [None]:
import spacy




#### Process the texts

In [None]:
nlp = spacy.load("en_core_web_sm")



In [None]:
tweet = "@elonmusk digital currency is the future! :) #dogecoin"


In [None]:
# Run the loaded pipeline on the tweet, then print one space-separated row of
# annotations per token.
text = nlp(tweet)
for token in text:
    annotations = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*annotations)

@elonmusk @elonmusk PROPN NNP nmod @xxxx False False
digital digital ADJ JJ amod xxxx True False
currency currency NOUN NN nsubj xxxx True False
is be AUX VBZ ROOT xx True True
the the DET DT det xxx True True
future future NOUN NN attr xxxx True False
! ! PUNCT . punct ! False False
:) :) PUNCT NFP ROOT :) False False
# # SYM $ dep # False False
dogecoin dogecoin VERB VB ROOT xxxx True False


### Romanian example

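Unlike NLTK's WordNet lemmatizer, spaCy ships a Romanian pipeline (`ro_core_news_sm`), so tokenization, lemmatization, part-of-speech tagging and dependency parsing all work for Romanian text!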

In [None]:
!python -m spacy download ro_core_news_sm   # Get Romanian model

2023-03-19 19:12:49.839927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-19 19:12:49.840022: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-19 19:12:51.801870: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ro-core-news-sm==3.5.0
  Using cached https://github.com/explosion/spacy-models/releases/downlo

In [None]:
from spacy.lang.ro.examples import sentences 

# NOTE: this rebinds `nlp`, so from here on the name refers to the Romanian
# pipeline instead of the "en_core_web_sm" one loaded earlier in the notebook.
nlp = spacy.load("ro_core_news_sm")
# Process the first of the example sentences imported from spacy.lang.ro.examples.
doc = nlp(sentences[0])
print(doc.text)
# One row per token: text, lemma, part-of-speech tag, dependency label.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari
Apple Apple PROPN nsubj
plănuiește plănuii AUX ROOT
să să PART mark
cumpere cumpăra AUX ccomp
o un DET det
companie companie NOUN obj
britanică britanic ADJ amod
pentru pentru ADP case
un un DET det
miliard miliard NUM obl
de de ADP case
dolari dolar NOUN nmod


# Numericalization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

import string

In [None]:
# Very customizable:
CountVectorizer??

In [None]:
corpus = [
 'This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?',
 '@user This one is a tweet #meta ;)' 
]

In [None]:
vectorizer = CountVectorizer()


In [None]:

X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'meta' 'one' 'second' 'the' 'third' 'this'
 'tweet' 'user']


In [None]:
print(X.toarray())


[[0 1 1 1 0 0 0 1 0 1 0 0]
 [0 2 0 1 0 0 1 1 0 1 0 0]
 [1 0 0 1 0 1 0 1 1 1 0 0]
 [0 1 1 1 0 0 0 1 0 1 0 0]
 [0 0 0 1 1 1 0 0 0 1 1 1]]


### Customizing the Vectorizer

In [None]:
# Tokenize with NLTK's TweetTokenizer instead of scikit-learn's default regex,
# so hashtags and emoticons survive as single tokens while @handles are dropped.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
tokenize_funct = tokenizer.tokenize
nltk.download('stopwords')
# Tokens to ignore: English stop words plus every single punctuation character.
word_blacklist = stopwords.words('english') + list(string.punctuation)
# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn warn about the unused
# parameter.
vectorizer_tweet = CountVectorizer(
    tokenizer=tokenize_funct,
    stop_words=word_blacklist,
    token_pattern=None,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
X1 = vectorizer_tweet.fit_transform(corpus)
print(vectorizer_tweet.get_feature_names_out())

['#meta' ';)' 'document' 'first' 'one' 'second' 'third' 'tweet']




In [None]:
X1

<5x8 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [None]:
vectorizer_tweet.transform([corpus[0]]).toarray()


array([[0, 0, 1, 1, 0, 0, 0, 0]])

In [None]:
vectorizertfidf = TfidfVectorizer()
Xtfidf = vectorizertfidf.fit_transform(corpus)
print(vectorizertfidf.get_feature_names_out())

['and' 'document' 'first' 'is' 'meta' 'one' 'second' 'the' 'third' 'this'
 'tweet' 'user']


In [None]:
Xtfidf.toarray()

array([[0.        , 0.48961805, 0.58983706, 0.34836727, 0.        ,
        0.        , 0.        , 0.41188214, 0.        , 0.34836727,
        0.        , 0.        ],
       [0.        , 0.70933829, 0.        , 0.25235002, 0.        ,
        0.        , 0.52958485, 0.29835887, 0.        , 0.25235002,
        0.        , 0.        ],
       [0.54054601, 0.        , 0.        , 0.25757307, 0.        ,
        0.43610912, 0.        , 0.3045342 , 0.54054601, 0.25757307,
        0.        , 0.        ],
       [0.        , 0.48961805, 0.58983706, 0.34836727, 0.        ,
        0.        , 0.        , 0.41188214, 0.        , 0.34836727,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.23518498, 0.49356209,
        0.39820278, 0.        , 0.        , 0.        , 0.23518498,
        0.49356209, 0.49356209]])

Text is now vectorized and ready for machine learning!