In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Fetch every NLTK resource the cells below rely on, so the notebook also runs
# on a fresh environment (previously only 'punkt' was downloaded):
#   punkt     -> nltk.sent_tokenize / nltk.word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' --
# confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\namit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The sample text we are going to use

In [2]:
corpus = """ Miley Ray Cyrus (/ˈmaɪli ˈsaɪrəs/, MY-lee SY-rəs; born Destiny Hope Cyrus, November 23, 1992) is an American singer, songwriter, and actress. Regarded as a pop icon, she has been recognized for her evolving artistry, style, and hailed as the "Teen Queen" of the 2000s era. She is also cited as one of the few examples of a child star with a successful music career as an adult.[2] As the daughter of country singer Billy Ray Cyrus, she emerged as a teen idol at age 13 as the lead character in the Disney Channel television series Hannah Montana (2006–2011). As Hannah Montana, she achieved success on the Billboard charts with two number-one soundtracks and a US top-ten single.

Her solo career started with the US number-one pop rock albums Meet Miley Cyrus (2007) and Breakout (2008); featuring the top-ten singles "See You Again" and "7 Things". The EP The Time of Our Lives (2009) reached number two in the US while its lead single "Party in the U.S.A." became one of the best-selling singles of all time in the country and was later certified thirteen-times platinum by the RIAA. The ballad "The Climb", also reached number four in the US. Trying to recalibrate her image, she explored dance-pop in Can't Be Tamed (2010), which received mixed reviews; however, its title track reached the top-ten in the US. Cyrus later signed with RCA Records and took a new artistic direction with the hip-hop and R&B-influenced Bangerz (2013). Her fifth chart-topping album, it yielded the singles "We Can't Stop" and her first Billboard Hot 100 number-one "Wrecking Ball". She then dabbled in experimental styles on Miley Cyrus & Her Dead Petz (2015), embraced country pop on Younger Now (2017) and ventured into rock and synth-pop on Plastic Hearts (2020). After signing with Columbia Records in 2021, Cyrus released Endless Summer Vacation (2023). Its lead single "Flowers" set various records and marked her second US number-one. The song won two Grammy Awards including Record of the Year, and the album was nominated for Album of the Year."""

Convert the paragraph into sentences, clean them, and then generate tokens. This is done by hand purely for learning purposes. In practice, only the cleaning needs to be hand-coded; the rest (tokenization and vector creation) is handled by tools such as CountVectorizer when implementing the bag-of-words model.

In [3]:
# Split the raw corpus into sentences; each sentence is treated as one document.
sentences = nltk.sent_tokenize(corpus) # paragraph -> list of sentence strings

In [4]:
sentences

[' Miley Ray Cyrus (/ˈmaɪli ˈsaɪrəs/, MY-lee SY-rəs; born Destiny Hope Cyrus, November 23, 1992) is an American singer, songwriter, and actress.',
 'Regarded as a pop icon, she has been recognized for her evolving artistry, style, and hailed as the "Teen Queen" of the 2000s era.',
 'She is also cited as one of the few examples of a child star with a successful music career as an adult.',
 '[2] As the daughter of country singer Billy Ray Cyrus, she emerged as a teen idol at age 13 as the lead character in the Disney Channel television series Hannah Montana (2006–2011).',
 'As Hannah Montana, she achieved success on the Billboard charts with two number-one soundtracks and a US top-ten single.',
 'Her solo career started with the US number-one pop rock albums Meet Miley Cyrus (2007) and Breakout (2008); featuring the top-ten singles "See You Again" and "7 Things".',
 'The EP The Time of Our Lives (2009) reached number two in the US while its lead single "Party in the U.S.A." became one of

In [5]:
# Lemmatizer applied to each token in the token-generation loop below.
lm = WordNetLemmatizer()
# NOTE(review): the stemmer is instantiated but not used in any visible cell --
# confirm whether it is still needed.
sm = PorterStemmer()

In [6]:
# For every sentence: replace each character outside a-z / A-Z with a space,
# then lowercase the result.
new_corpus = [re.sub('[^a-zA-Z]' , ' ' , sentence).lower() for sentence in sentences]

In [7]:
new_corpus

[' miley ray cyrus    ma li  sa r s   my lee sy r s  born destiny hope cyrus  november           is an american singer  songwriter  and actress ',
 'regarded as a pop icon  she has been recognized for her evolving artistry  style  and hailed as the  teen queen  of the     s era ',
 'she is also cited as one of the few examples of a child star with a successful music career as an adult ',
 '    as the daughter of country singer billy ray cyrus  she emerged as a teen idol at age    as the lead character in the disney channel television series hannah montana             ',
 'as hannah montana  she achieved success on the billboard charts with two number one soundtracks and a us top ten single ',
 'her solo career started with the us number one pop rock albums meet miley cyrus        and breakout         featuring the top ten singles  see you again  and    things  ',
 'the ep the time of our lives        reached number two in the us while its lead single  party in the u s a   became one of

In [8]:
tokens = []

In [9]:
# Build the stop-word set once. stopwords.words() returns a fresh list on every
# call, so constructing the set inside the inner loop repeated that work for
# every single token.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append
# duplicate tokens to a list left over from a previous run.
tokens = []
for line in new_corpus:
    for word in nltk.word_tokenize(line):
        # Keep only non-stop-words, reduced to their lemma.
        if word not in english_stopwords:
            tokens.append(lm.lemmatize(word))

In [10]:
print(tokens)

['miley', 'ray', 'cyrus', 'li', 'sa', 'r', 'lee', 'sy', 'r', 'born', 'destiny', 'hope', 'cyrus', 'november', 'american', 'singer', 'songwriter', 'actress', 'regarded', 'pop', 'icon', 'recognized', 'evolving', 'artistry', 'style', 'hailed', 'teen', 'queen', 'era', 'also', 'cited', 'one', 'example', 'child', 'star', 'successful', 'music', 'career', 'adult', 'daughter', 'country', 'singer', 'billy', 'ray', 'cyrus', 'emerged', 'teen', 'idol', 'age', 'lead', 'character', 'disney', 'channel', 'television', 'series', 'hannah', 'montana', 'hannah', 'montana', 'achieved', 'success', 'billboard', 'chart', 'two', 'number', 'one', 'soundtrack', 'u', 'top', 'ten', 'single', 'solo', 'career', 'started', 'u', 'number', 'one', 'pop', 'rock', 'album', 'meet', 'miley', 'cyrus', 'breakout', 'featuring', 'top', 'ten', 'single', 'see', 'thing', 'ep', 'time', 'life', 'reached', 'number', 'two', 'u', 'lead', 'single', 'party', 'u', 'became', 'one', 'best', 'selling', 'single', 'time', 'country', 'later', 'ce

Count Vectorizer

In [11]:
#Step1: Import
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
#Step2 : Clean the text
def clean_text(corpus: str) -> str:
    """Normalize raw text before vectorization.

    Steps, in order: lowercase, delete punctuation, delete digits, collapse
    every run of whitespace into a single space, and trim leading/trailing
    whitespace.

    Punctuation is deleted rather than replaced by a space, so hyphenated
    words are joined ("top-ten" -> "topten"). Underscores are kept because
    they count as word characters in the punctuation pattern.

    Args:
        corpus: Raw text to clean.

    Returns:
        The cleaned, lowercased text with no leading or trailing whitespace.
    """
    # Convert to lowercase
    corpus = corpus.lower()
    # Remove punctuation: anything that is neither a word character
    # (letter, digit, underscore) nor whitespace
    corpus = re.sub(r'[^\w\s]', '', corpus)
    # Remove numbers (each run of one or more digits)
    corpus = re.sub(r'\d+', '', corpus)
    # Collapse each run of one or more whitespace characters into one space
    corpus = re.sub(r'\s+', ' ', corpus)
    # Deleted tokens at either end (e.g. a trailing number) leave a stray
    # space behind; trim it so the result has no extra whitespace at all.
    return corpus.strip()

clean_text(corpus)


' miley ray cyrus ˈmaɪli ˈsaɪrəs mylee syrəs born destiny hope cyrus november is an american singer songwriter and actress regarded as a pop icon she has been recognized for her evolving artistry style and hailed as the teen queen of the s era she is also cited as one of the few examples of a child star with a successful music career as an adult as the daughter of country singer billy ray cyrus she emerged as a teen idol at age as the lead character in the disney channel television series hannah montana as hannah montana she achieved success on the billboard charts with two numberone soundtracks and a us topten single her solo career started with the us numberone pop rock albums meet miley cyrus and breakout featuring the topten singles see you again and things the ep the time of our lives reached number two in the us while its lead single party in the usa became one of the bestselling singles of all time in the country and was later certified thirteentimes platinum by the riaa the bal

In [14]:
# Step3 : Sentence-tokenize the corpus. CountVectorizer expects an iterable of documents (here, a list of sentences), not one long string.
# NOTE(review): this tokenizes the raw `corpus`; the result of clean_text(corpus) above is not stored or used here -- confirm whether the cleaned text was meant to be vectorized.
sentences = nltk.sent_tokenize(corpus)

In [15]:
# Step4 : Create a vectorizer object
cv = CountVectorizer()

In [16]:
# Step5: Fit the document in the vectorizer object
cv.fit(sentences)

In [17]:
# Printing the identified Unique words along with their indices
print("Vocabulary: ", cv.vocabulary_)

Vocabulary:  {'miley': 110, 'ray': 130, 'cyrus': 59, 'ˈmaɪli': 202, 'ˈsaɪrəs': 203, 'my': 114, 'lee': 106, 'sy': 167, 'rəs': 143, 'born': 44, 'destiny': 64, 'hope': 91, 'november': 117, '23': 16, '1992': 2, 'is': 101, 'an': 28, 'american': 27, 'singer': 152, 'songwriter': 157, 'and': 29, 'actress': 18, 'regarded': 138, 'as': 32, 'pop': 128, 'icon': 94, 'she': 149, 'has': 86, 'been': 40, 'recognized': 135, 'for': 81, 'her': 88, 'evolving': 72, 'artistry': 31, 'style': 162, 'hailed': 84, 'the': 173, 'teen': 170, 'queen': 129, 'of': 120, '2000s': 3, 'era': 71, 'also': 26, 'cited': 55, 'one': 122, 'few': 77, 'examples': 73, 'child': 54, 'star': 159, 'with': 195, 'successful': 165, 'music': 113, 'career': 48, 'adult': 19, 'daughter': 62, 'country': 58, 'billy': 43, 'emerged': 68, 'idol': 95, 'at': 33, 'age': 22, '13': 1, 'lead': 105, 'character': 51, 'in': 97, 'disney': 66, 'channel': 50, 'television': 171, 'series': 147, 'hannah': 85, 'montana': 112, '2006': 4, '2011': 9, 'achieved': 17, '

In [18]:
# Encode the Document
vector = cv.transform(sentences)

In [19]:
# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.toarray())

Encoded Document is:
[[0 0 1 ... 0 1 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# TF-IDF Vectorizer

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tf = TfidfVectorizer()

In [22]:
X = tf.fit_transform(sentences)

In [23]:
X[0].toarray()

array([[0.        , 0.        , 0.22236919, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22236919, 0.        , 0.22236919, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.22236919, 0.19309019, 0.09926782,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.22236919,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26381214,
        0.        , 0.        , 0.        , 0.        , 0.22236919,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

<h2>Word2Vec</h2>

In [26]:
import gensim
from gensim.models import word2vec, keyedvectors

In [27]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use -- confirm size and
# local caching before running this cell.
wv = api.load('word2vec-google-news-300')
# Look up the embedding vector for the word 'king'.
vec_king = wv['king']



In [28]:
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [30]:
print(vec_king.ndim)
print(vec_king.shape)

1
(300,)


In [32]:
wv.most_similar('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]