In [1]:
import numpy as np
from nltk.tokenize import word_tokenize 
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### Tokenization

In [2]:
# Word Tokenization
text = 'Hello everyone. How are you doing?  Today is a sunny day.'
word_tokens = word_tokenize(text)
print(word_tokens)

['Hello', 'everyone', '.', 'How', 'are', 'you', 'doing', '?', 'Today', 'is', 'a', 'sunny', 'day', '.']


In [3]:
# Sentence Tokenization
from nltk.tokenize import sent_tokenize
text = 'Hello everyone. How are you doing?  Today is a sunny day.'
sent_tokens = sent_tokenize(text)
print(sent_tokens)

['Hello everyone.', 'How are you doing?', 'Today is a sunny day.']


### Tokenization + Punctuation Removal

In [4]:
# RegexpTokenizer 
from nltk.tokenize import RegexpTokenizer  
regexp = RegexpTokenizer(r'\w+')
text = 'Hello everyone. How are you doing?  Today is a sunny day.'
word_tokens = regexp.tokenize(text)
print(word_tokens)

['Hello', 'everyone', 'How', 'are', 'you', 'doing', 'Today', 'is', 'a', 'sunny', 'day']


In [5]:
# Use regular expression
import re
from nltk.tokenize import word_tokenize 
text = 'Hello everyone. How are you doing?  Today is a sunny day.'
word_tokens = word_tokenize(text)
# \w = word characters in a string, \s = white space
word_tokens = [re.sub(r'[^\w\s]', '', token) for token in word_tokens if re.sub(r'[^\w\s]', '', token)]  
word_tokens

['Hello',
 'everyone',
 'How',
 'are',
 'you',
 'doing',
 'Today',
 'is',
 'a',
 'sunny',
 'day']

### Stop Word Removal

In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
corpus_stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
print(corpus_stop_words)

{'weren', 'over', 'should', 'needn', 'same', "she'll", 'doing', 'under', 'were', 'itself', 'again', "we've", "mightn't", 'mustn', 'our', 's', 'doesn', 'until', "you've", 'him', 'do', 'the', "you'll", 'above', 'not', 'if', 'these', 'mightn', 'because', 'nor', 'you', "hasn't", "they'll", 'am', "it's", "they'd", 'yourselves', 'just', 'o', 'their', 'during', "he's", 'into', "shouldn't", 'wouldn', 'shan', 'theirs', 'wasn', 'll', "we'd", 'd', 'her', 'each', 'where', 'no', 'was', 'be', 'why', "i've", 'in', 'can', "they've", 'are', 'up', 'as', 'own', 'will', "should've", "you'd", "haven't", 'such', "we're", 'hers', 'y', "we'll", 'all', 'too', 'this', 'didn', 'me', "that'll", 'when', 'hasn', 're', 'have', "hadn't", 'has', 'hadn', 'more', 'further', 'some', "he'll", 'only', 'they', 'yourself', 'between', 'an', 'with', 'does', 'ours', 'a', "she'd", "he'd", "i'm", 'those', 'but', 'of', "wouldn't", 'm', 'don', "isn't", 'aren', 'very', "it'd", "shan't", 'it', 'to', "needn't", 've', "they're", "she's

In [9]:
filtered_word_tokens = [word for word in word_tokens if not word.lower() in corpus_stop_words]
print("Original words:", word_tokens)
print("After stop-word removal:", filtered_word_tokens)

Original words: ['Hello', 'everyone', 'How', 'are', 'you', 'doing', 'Today', 'is', 'a', 'sunny', 'day']
After stop-word removal: ['Hello', 'everyone', 'Today', 'sunny', 'day']


In [10]:
word_tokens = filtered_word_tokens

### Stemming

In [11]:
# For stemming
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

In [12]:
# Stemming example
word_list = ['changed', 'changing', 'changable']
stemmed_word_list = [porter_stemmer.stem(word) for word in word_list]
 
print("Original words:", word_list)
print("Stemmed words:", stemmed_word_list)

Original words: ['changed', 'changing', 'changable']
Stemmed words: ['chang', 'chang', 'changabl']


In [13]:
# Stemming example
word_list = ['walking', 'walked', 'walker']
stemmed_word_list = [porter_stemmer.stem(word) for word in word_list]

print("Original words:", word_list)
print("Stemmed words:", stemmed_word_list)

Original words: ['walking', 'walked', 'walker']
Stemmed words: ['walk', 'walk', 'walker']


In [14]:
# Stemming example
word_list = ['sleeping', 'sleeper', 'slept']
stemmed_word_list = [porter_stemmer.stem(word) for word in word_list]
 
print("Original words:", word_list)
print("Stemmed words:", stemmed_word_list)

Original words: ['sleeping', 'sleeper', 'slept']
Stemmed words: ['sleep', 'sleeper', 'slept']


In [15]:
# Stemming example
word_list = ['better', 'happily', 'happy']
stemmed_word_list = [porter_stemmer.stem(word) for word in word_list]
 
print("Original words:", word_list)
print("Stemmed words:", stemmed_word_list)

Original words: ['better', 'happily', 'happy']
Stemmed words: ['better', 'happili', 'happi']


In [16]:
# Stemming example
word_list = word_tokens
stemmed_word_list = [porter_stemmer.stem(word) for word in word_list]
 
print("Original words:", word_list)
print("Stemmed words:", stemmed_word_list)

Original words: ['Hello', 'everyone', 'Today', 'sunny', 'day']
Stemmed words: ['hello', 'everyon', 'today', 'sunni', 'day']


### Lemmatization

In [17]:
# For Lemmatization
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...


True

In [18]:
# For lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [19]:
# Lemmatization example
# 'n' for nouns, 'v' for verbs, 'a' for adjectives, 'r' for adverbs, 's' for satellite (auxilliary) adjectives.
word_list = ['changing', 'changed', 'walking', 'walked', 'sleeping', 'slept']
lemma_list = [lemmatizer.lemmatize(word,pos='v') for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", lemma_list)

Original words: ['changing', 'changed', 'walking', 'walked', 'sleeping', 'slept']
Lemmas: ['change', 'change', 'walk', 'walk', 'sleep', 'sleep']


In [20]:
# Lemmatization example
word_list = ['runner', 'walker', 'sleeper']
lemma_list = [lemmatizer.lemmatize(word, pos='n') for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", lemma_list)

Original words: ['runner', 'walker', 'sleeper']
Lemmas: ['runner', 'walker', 'sleeper']


In [21]:
# Lemmatization example
word_list =  ['runnable', 'walkable', 'changable']
lemma_list = [lemmatizer.lemmatize(word,pos='a') for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", lemma_list)

Original words: ['runnable', 'walkable', 'changable']
Lemmas: ['runnable', 'walkable', 'changable']


In [22]:
# Lemmatization example
word_list = ['better', 'happily', 'happy']
lemma_list = [lemmatizer.lemmatize(word,pos='a') for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", lemma_list)

Original words: ['better', 'happily', 'happy']
Lemmas: ['good', 'happily', 'happy']


In [23]:
# Lemmatization example
word_list = ['better']
lemma_list = [lemmatizer.lemmatize(word,pos='r') for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", lemma_list)

Original words: ['better']
Lemmas: ['well']


In [24]:
# Lemmatization example
word_list = word_tokens
word_token_lemma_list = [lemmatizer.lemmatize(word) for word in word_list]
 
print("Original words:", word_list)
print("Lemmas:", word_token_lemma_list)

Original words: ['Hello', 'everyone', 'Today', 'sunny', 'day']
Lemmas: ['Hello', 'everyone', 'Today', 'sunny', 'day']


### Part of Speech (POS) Tagging

In [25]:
# For POS 
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nadda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [26]:
# list of pos tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# Note: 'NN' = noun, 'NNP' = proper noun, 'VBD' = past-tense verb

from nltk import pos_tag
pos_tags = pos_tag(word_tokens)
print("Original words:", word_tokens)
print("POS tagged:", pos_tags)

Original words: ['Hello', 'everyone', 'Today', 'sunny', 'day']
POS tagged: [('Hello', 'NNP'), ('everyone', 'NN'), ('Today', 'NNP'), ('sunny', 'VBD'), ('day', 'NN')]


In [27]:
# To obtain POS before running lemmatization 
# When lemmatization is executed, can use POS as value of pos parameter
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    tag_dict = {'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'J': wordnet.ADJ,
                'R': wordnet.ADV}    
    tag = nltk.pos_tag([word])[0][1][0].upper() # Get first character of pos tag from word
    return tag_dict.get(tag, wordnet.NOUN)  # default when not found = 'N'
    
# Lemmatization
word_list = word_tokens
lemma_list = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in word_list]
 
print("Original words:", word_list)
print("Lemmas without pos:", word_token_lemma_list)
print("Lemmas:", lemma_list)

Original words: ['Hello', 'everyone', 'Today', 'sunny', 'day']
Lemmas without pos: ['Hello', 'everyone', 'Today', 'sunny', 'day']
Lemmas: ['Hello', 'everyone', 'Today', 'sunny', 'day']


### Cosine Similarity

In [28]:
from nltk.corpus import stopwords 
corpus_stop_words = set(stopwords.words('english')) 

def preprocess_text(text):
    tokens = [word.lower() for word in word_tokenize(text)] 
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens if re.sub(r'[^\w\s]', '', token)]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if not word in corpus_stop_words]  
    return tokens

In [29]:
first = 'A cat sits nearby the window'
second = 'A dog sits nearby the window'

In [30]:
first_set = set(preprocess_text(first))
second_set = set(preprocess_text(second))
all_tokens = list(first_set.union(second_set))
all_tokens

['sits', 'dog', 'cat', 'nearby', 'window']

In [31]:
print(first_set)
print(second_set)

{'cat', 'sits', 'nearby', 'window'}
{'sits', 'window', 'nearby', 'dog'}


In [32]:
first_exist = [1 if word in first_set else 0 for word in all_tokens]
second_exist = [1 if word in second_set else 0 for word in all_tokens]
print(first_exist)
print(second_exist)

[1, 0, 1, 1, 1]
[1, 1, 0, 1, 1]


In [33]:
squared_first_sum = sum([val*val for val in first_exist])**0.5
squared_second_sum = sum([val*val for val in second_exist])**0.5
cosine_similar = np.dot(first_exist, second_exist) / (squared_first_sum*squared_second_sum)
cosine_similar

0.75

In [34]:
cosine_dist = 1 - cosine_similar
cosine_dist

0.25

### TF-IDF

In [35]:
first = 'It is going to rain today.' 
second = 'Today I am not going outside.' 
third = 'I am going to watch TV.' 
texts = [first, second, third]
texts = [text.lower() for text in texts]

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(texts)
print('Word indexes:', tfidf.vocabulary_)

Word indexes: {'it': 3, 'is': 2, 'going': 1, 'to': 7, 'rain': 6, 'today': 8, 'am': 0, 'not': 4, 'outside': 5, 'watch': 10, 'tv': 9}


In [37]:
print('tf-idf value:')
print(result)

tf-idf value:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (3, 11)>
  Coords	Values
  (0, 3)	0.4711101009983051
  (0, 2)	0.4711101009983051
  (0, 1)	0.2782452148327134
  (0, 7)	0.35829137488557944
  (0, 6)	0.4711101009983051
  (0, 8)	0.35829137488557944
  (1, 1)	0.3154441510317797
  (1, 8)	0.4061917781433946
  (1, 0)	0.4061917781433946
  (1, 4)	0.5340933749435833
  (1, 5)	0.5340933749435833
  (2, 1)	0.3154441510317797
  (2, 7)	0.4061917781433946
  (2, 0)	0.4061917781433946
  (2, 10)	0.5340933749435833
  (2, 9)	0.5340933749435833


In [39]:
print('tf-idf values in matrix form:')
print(result.toarray())

tf-idf values in matrix form:
[[0.         0.27824521 0.4711101  0.4711101  0.         0.
  0.4711101  0.35829137 0.35829137 0.         0.        ]
 [0.40619178 0.31544415 0.         0.         0.53409337 0.53409337
  0.         0.         0.40619178 0.         0.        ]
 [0.40619178 0.31544415 0.         0.         0.         0.
  0.         0.40619178 0.         0.53409337 0.53409337]]


In [40]:
for word, idf in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    print(word, ':', idf)

am : 1.2876820724517808
going : 1.0
is : 1.6931471805599454
it : 1.6931471805599454
not : 1.6931471805599454
outside : 1.6931471805599454
rain : 1.6931471805599454
to : 1.2876820724517808
today : 1.2876820724517808
tv : 1.6931471805599454
watch : 1.6931471805599454


In [41]:
print('tf-idf values in matrix form:')
print(result.toarray())

tf-idf values in matrix form:
[[0.         0.27824521 0.4711101  0.4711101  0.         0.
  0.4711101  0.35829137 0.35829137 0.         0.        ]
 [0.40619178 0.31544415 0.         0.         0.53409337 0.53409337
  0.         0.         0.40619178 0.         0.        ]
 [0.40619178 0.31544415 0.         0.         0.         0.
  0.         0.40619178 0.         0.53409337 0.53409337]]
