In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
import nltk

In [3]:
nltk.download("wordnet", download_dir="/home/oussama/anaconda3/nltk_data/")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/oussama/anaconda3/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!


True

# Tokenization 

In [4]:
text = "I believe this would help the reader understand how tokenization works. as well as realize its importance."

In [5]:
sents = sent_tokenize(text)

In [6]:
sents

['I believe this would help the reader understand how tokenization works.',
 'as well as realize its importance.']

In [7]:
word_tokenize(text)

['I',
 'believe',
 'this',
 'would',
 'help',
 'the',
 'reader',
 'understand',
 'how',
 'tokenization',
 'works',
 '.',
 'as',
 'well',
 'as',
 'realize',
 'its',
 'importance',
 '.']

In [8]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['I', 'believe', 'this', 'would', 'help', 'the', 'reader', 'understand', 'how', 'tokenization', 'works', '.'], ['as', 'well', 'as', 'realize', 'its', 'importance', '.']]


# Stopwords 

In [9]:
custom_list = set(stopwords.words('english') + list(punctuation))
custom_list

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [10]:
word_list = [word for word in word_tokenize(text) if word not in custom_list]
print(word_list)

['I', 'believe', 'would', 'help', 'reader', 'understand', 'tokenization', 'works', 'well', 'realize', 'importance']


# N-Grams 

In [11]:
from nltk.collocations import BigramCollocationFinder

In [12]:
finder = BigramCollocationFinder.from_words(word_list)

In [13]:
print(finder.ngram_fd.items())

dict_items([(('I', 'believe'), 1), (('believe', 'would'), 1), (('would', 'help'), 1), (('help', 'reader'), 1), (('reader', 'understand'), 1), (('understand', 'tokenization'), 1), (('tokenization', 'works'), 1), (('works', 'well'), 1), (('well', 'realize'), 1), (('realize', 'importance'), 1)])


# Stemming

In [14]:
from nltk.stem.lancaster import LancasterStemmer

In [15]:
new_text = "It is important to be very pythonly while you are pythoning with python. All pythoners have pythoned at least once."

In [16]:
ls = LancasterStemmer()
new_word_list = [word for word in word_tokenize(new_text) if word not in custom_list]
stem_lan = [ls.stem(word) for word in new_word_list]
print(stem_lan)

['it', 'import', 'python', 'python', 'python', 'al', 'python', 'python', 'least']


# Word Sense Disambiguation

In [20]:
from nltk.wsd import lesk

In [25]:
context_1= lesk(word_tokenize("Sing in a lower tone, along with the bass."), "bass")
print(context_1.definition())

the member with the lowest range of a family of musical instruments


In [26]:
context_2= lesk(word_tokenize("The sea bass really hard to catch."), "bass")
print(context_2.definition())

the lean flesh of a saltwater fish of the family Serranidae


In [27]:
context_3= lesk(word_tokenize("My mous is not working. I need to change it."), "mouse")
print(context_3.definition())

a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that rolls on the surface of the pad


# Count Vectorizer

In [40]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [29]:
corpus = [
    "This the first document from heaven",
    "but the second document is from mars",
    "And this is the third one from nowhere",
    "Is the first document from nowhere?"
]

In [30]:
df = pd.DataFrame({"text" : corpus})
df

Unnamed: 0,text
0,This the first document from heaven
1,but the second document is from mars
2,And this is the third one from nowhere
3,Is the first document from nowhere?


In [34]:
count_v = CountVectorizer(stop_words= custom_list)
X = count_v.fit_transform(df.text).toarray()
print(count_v.get_feature_names())

['document', 'first', 'heaven', 'mars', 'nowhere', 'one', 'second', 'third']


In [35]:
print(X)
print(count_v.vocabulary_)

[[1 1 1 0 0 0 0 0]
 [1 0 0 1 0 0 1 0]
 [0 0 0 0 1 1 0 1]
 [1 1 0 0 1 0 0 0]]
{'first': 1, 'document': 0, 'heaven': 2, 'second': 6, 'mars': 3, 'third': 7, 'one': 5, 'nowhere': 4}


# TF-IDF Vectorizer

In [37]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

TfidfVectorizer()

In [38]:
print(vectorizer.vocabulary_)

{'this': 13, 'the': 11, 'first': 3, 'document': 2, 'from': 4, 'heaven': 5, 'but': 1, 'second': 10, 'is': 6, 'mars': 7, 'and': 0, 'third': 12, 'one': 9, 'nowhere': 8}


In [39]:
print(vectorizer.idf_)

[1.91629073 1.91629073 1.22314355 1.51082562 1.         1.91629073
 1.22314355 1.91629073 1.51082562 1.91629073 1.91629073 1.
 1.91629073 1.51082562]


# Hashing Vectorizer

In [41]:
df

Unnamed: 0,text
0,This the first document from heaven
1,but the second document is from mars
2,And this is the third one from nowhere
3,Is the first document from nowhere?


In [43]:
hash_v = HashingVectorizer(n_features= 9, norm= None, alternate_sign= False)
print(hash_v.fit_transform(df.text).toarray())

[[0. 0. 2. 2. 1. 0. 1. 0. 0.]
 [0. 1. 2. 1. 2. 1. 0. 0. 0.]
 [1. 1. 0. 2. 1. 2. 1. 0. 0.]
 [0. 1. 2. 2. 1. 0. 0. 0. 0.]]
