# NLP Paragraph Analysis: Preprocessing (Data Cleaning and Vectorization)

## Imports

In [4]:
# !pip install nltk

In [2]:
import pandas as pd
import numpy as np 

import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")
pd.set_option('display.float_format', lambda x: '%.3f' % x) 

In [3]:
text = "In the realm of data science, extracting meaningful insights from vast datasets is an art."\
"From predictive modeling to natural language processing (NLP), the field encompasses a spectrum of techniques."\
"Consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences."\
"This process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore."\
"The synergy of NLP and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond."
print(text)

In the realm of data science, extracting meaningful insights from vast datasets is an art.From predictive modeling to natural language processing (NLP), the field encompasses a spectrum of techniques.Consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.This process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.The synergy of NLP and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.


## Tokenization

- `sent_tokenize`
- `word_tokenize`

In [58]:
# Import necessary packages

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [6]:
# Split text into sentences using NLTK

sentence_token = sent_tokenize(text.lower())
print(sentence_token)

['in the realm of data science, extracting meaningful insights from vast datasets is an art.from predictive modeling to natural language processing (nlp), the field encompasses a spectrum of techniques.consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.this process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.the synergy of nlp and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.']


In [7]:
for sentence in sentence_token:
    print('- ', sentence)

-  in the realm of data science, extracting meaningful insights from vast datasets is an art.from predictive modeling to natural language processing (nlp), the field encompasses a spectrum of techniques.consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.this process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.the synergy of nlp and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.


In [8]:
len(sentence_token)

1

In [9]:
# Split text into words using NLTK

word_token = word_tokenize(text.lower())
print(word_token)

['in', 'the', 'realm', 'of', 'data', 'science', ',', 'extracting', 'meaningful', 'insights', 'from', 'vast', 'datasets', 'is', 'an', 'art.from', 'predictive', 'modeling', 'to', 'natural', 'language', 'processing', '(', 'nlp', ')', ',', 'the', 'field', 'encompasses', 'a', 'spectrum', 'of', 'techniques.consider', 'a', 'scenario', 'where', 'a', 'machine', 'learning', 'algorithm', 'analyzes', 'customer', 'behavior', ',', 'tokenizing', 'text', 'data', 'to', 'discern', 'patterns', 'in', 'user', 'preferences.this', 'process', 'involves', 'breaking', 'down', 'textual', 'information', 'into', 'tokens', ',', 'unlocking', 'a', 'treasure', 'trove', 'of', 'information', 'for', 'data', 'scientists', 'to', 'explore.the', 'synergy', 'of', 'nlp', 'and', 'data', 'science', 'opens', 'doors', 'to', 'innovation', ',', 'driving', 'advancements', 'in', 'fields', 'like', 'recommendation', 'systems', ',', 'sentiment', 'analysis', ',', 'and', 'beyond', '.']


In [10]:
for word in word_token:
    print('- ', word)

-  in
-  the
-  realm
-  of
-  data
-  science
-  ,
-  extracting
-  meaningful
-  insights
-  from
-  vast
-  datasets
-  is
-  an
-  art.from
-  predictive
-  modeling
-  to
-  natural
-  language
-  processing
-  (
-  nlp
-  )
-  ,
-  the
-  field
-  encompasses
-  a
-  spectrum
-  of
-  techniques.consider
-  a
-  scenario
-  where
-  a
-  machine
-  learning
-  algorithm
-  analyzes
-  customer
-  behavior
-  ,
-  tokenizing
-  text
-  data
-  to
-  discern
-  patterns
-  in
-  user
-  preferences.this
-  process
-  involves
-  breaking
-  down
-  textual
-  information
-  into
-  tokens
-  ,
-  unlocking
-  a
-  treasure
-  trove
-  of
-  information
-  for
-  data
-  scientists
-  to
-  explore.the
-  synergy
-  of
-  nlp
-  and
-  data
-  science
-  opens
-  doors
-  to
-  innovation
-  ,
-  driving
-  advancements
-  in
-  fields
-  like
-  recommendation
-  systems
-  ,
-  sentiment
-  analysis
-  ,
-  and
-  beyond
-  .


In [11]:
len(word_token)

98

## Removing Punctuation and Numbers

In [59]:
# Get the word tokens without punctuation and numbers
# First way

tokens_without_punc_way1 = [w for w in word_token if w.isalpha()] # .isalnum() for number and object
tokens_without_punc_way1

['in',
 'the',
 'realm',
 'of',
 'data',
 'science',
 'extracting',
 'meaningful',
 'insights',
 'from',
 'vast',
 'datasets',
 'is',
 'an',
 'predictive',
 'modeling',
 'to',
 'natural',
 'language',
 'processing',
 'nlp',
 'the',
 'field',
 'encompasses',
 'a',
 'spectrum',
 'of',
 'a',
 'scenario',
 'where',
 'a',
 'machine',
 'learning',
 'algorithm',
 'analyzes',
 'customer',
 'behavior',
 'tokenizing',
 'text',
 'data',
 'to',
 'discern',
 'patterns',
 'in',
 'user',
 'process',
 'involves',
 'breaking',
 'down',
 'textual',
 'information',
 'into',
 'tokens',
 'unlocking',
 'a',
 'treasure',
 'trove',
 'of',
 'information',
 'for',
 'data',
 'scientists',
 'to',
 'synergy',
 'of',
 'nlp',
 'and',
 'data',
 'science',
 'opens',
 'doors',
 'to',
 'innovation',
 'driving',
 'advancements',
 'in',
 'fields',
 'like',
 'recommendation',
 'systems',
 'sentiment',
 'analysis',
 'and',
 'beyond']

In [13]:
len(tokens_without_punc_way1)

84

In [14]:
# Get the word tokens without punctuation and numbers with re module
# Second way

import re

tokens_without_punc_way2 = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
print(tokens_without_punc_way2)
print('---'*10)
tokens_without_punc_way2 = word_tokenize(tokens_without_punc_way2.lower())
print(tokens_without_punc_way2)

in the realm of data science  extracting meaningful insights from vast datasets is an art from predictive modeling to natural language processing  nlp   the field encompasses a spectrum of techniques consider a scenario where a machine learning algorithm analyzes customer behavior  tokenizing text data to discern patterns in user preferences this process involves breaking down textual information into tokens  unlocking a treasure trove of information for data scientists to explore the synergy of nlp and data science opens doors to innovation  driving advancements in fields like recommendation systems  sentiment analysis  and beyond 
------------------------------
['in', 'the', 'realm', 'of', 'data', 'science', 'extracting', 'meaningful', 'insights', 'from', 'vast', 'datasets', 'is', 'an', 'art', 'from', 'predictive', 'modeling', 'to', 'natural', 'language', 'processing', 'nlp', 'the', 'field', 'encompasses', 'a', 'spectrum', 'of', 'techniques', 'consider', 'a', 'scenario', 'where', 'a'

In [15]:
len(tokens_without_punc_way2)

92

## Removing Stopwords

In [16]:
# Import necessary packages

from nltk.corpus import stopwords

In [17]:
# Take a look at the stop words included in nltk's corpus!

stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
len(stop_words)

179

In [19]:
# Remove stop words

tokens_without_stopwords = [w for w in tokens_without_punc_way1 if w not in stopwords.words("english")]
print(tokens_without_stopwords)

['realm', 'data', 'science', 'extracting', 'meaningful', 'insights', 'vast', 'datasets', 'predictive', 'modeling', 'natural', 'language', 'processing', 'nlp', 'field', 'encompasses', 'spectrum', 'scenario', 'machine', 'learning', 'algorithm', 'analyzes', 'customer', 'behavior', 'tokenizing', 'text', 'data', 'discern', 'patterns', 'user', 'process', 'involves', 'breaking', 'textual', 'information', 'tokens', 'unlocking', 'treasure', 'trove', 'information', 'data', 'scientists', 'synergy', 'nlp', 'data', 'science', 'opens', 'doors', 'innovation', 'driving', 'advancements', 'fields', 'like', 'recommendation', 'systems', 'sentiment', 'analysis', 'beyond']


In [20]:
len(tokens_without_stopwords)

58

In [21]:
# TODO
# Print the stop words included in the sample text

stop_words_in = [w for w in tokens_without_punc_way1 if w in stopwords.words("english")]
print(stop_words_in)

['in', 'the', 'of', 'from', 'is', 'an', 'to', 'the', 'a', 'of', 'a', 'where', 'a', 'to', 'in', 'down', 'into', 'a', 'of', 'for', 'to', 'of', 'and', 'to', 'in', 'and']


In [22]:
len(stop_words_in)

26

## Lemmatization

- `WordNetLemmatizer()`

In [23]:
# Import necessary packages

from nltk.stem.wordnet import WordNetLemmatizer

In [24]:
# Reduce words to their root form with WordNetLemmatizer

lemmed = [WordNetLemmatizer().lemmatize(w) for w in tokens_without_stopwords]
print(lemmed)

['realm', 'data', 'science', 'extracting', 'meaningful', 'insight', 'vast', 'datasets', 'predictive', 'modeling', 'natural', 'language', 'processing', 'nlp', 'field', 'encompasses', 'spectrum', 'scenario', 'machine', 'learning', 'algorithm', 'analyzes', 'customer', 'behavior', 'tokenizing', 'text', 'data', 'discern', 'pattern', 'user', 'process', 'involves', 'breaking', 'textual', 'information', 'token', 'unlocking', 'treasure', 'trove', 'information', 'data', 'scientist', 'synergy', 'nlp', 'data', 'science', 'open', 'door', 'innovation', 'driving', 'advancement', 'field', 'like', 'recommendation', 'system', 'sentiment', 'analysis', 'beyond']


In [25]:
" ".join(lemmed)

'realm data science extracting meaningful insight vast datasets predictive modeling natural language processing nlp field encompasses spectrum scenario machine learning algorithm analyzes customer behavior tokenizing text data discern pattern user process involves breaking textual information token unlocking treasure trove information data scientist synergy nlp data science open door innovation driving advancement field like recommendation system sentiment analysis beyond'

In [61]:
[print(x, '-->', y) for x,y in zip(tokens_without_stopwords, lemmed)]

realm --> realm
data --> data
science --> science
extracting --> extracting
meaningful --> meaningful
insights --> insight
vast --> vast
datasets --> datasets
predictive --> predictive
modeling --> modeling
natural --> natural
language --> language
processing --> processing
nlp --> nlp
field --> field
encompasses --> encompasses
spectrum --> spectrum
scenario --> scenario
machine --> machine
learning --> learning
algorithm --> algorithm
analyzes --> analyzes
customer --> customer
behavior --> behavior
tokenizing --> tokenizing
text --> text
data --> data
discern --> discern
patterns --> pattern
user --> user
process --> process
involves --> involves
breaking --> breaking
textual --> textual
information --> information
tokens --> token
unlocking --> unlocking
treasure --> treasure
trove --> trove
information --> information
data --> data
scientists --> scientist
synergy --> synergy
nlp --> nlp
data --> data
science --> science
opens --> open
doors --> door
innovation --> innovation
driv

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

## Stemming

- `PorterStemmer()`

In [27]:
# Import necessary packages

from nltk.stem import PorterStemmer

In [28]:
# Reduce words to their stems with PorterStemmer

stemmed = [PorterStemmer().stem(w) for w in tokens_without_stopwords]
print(stemmed)

['realm', 'data', 'scienc', 'extract', 'meaning', 'insight', 'vast', 'dataset', 'predict', 'model', 'natur', 'languag', 'process', 'nlp', 'field', 'encompass', 'spectrum', 'scenario', 'machin', 'learn', 'algorithm', 'analyz', 'custom', 'behavior', 'token', 'text', 'data', 'discern', 'pattern', 'user', 'process', 'involv', 'break', 'textual', 'inform', 'token', 'unlock', 'treasur', 'trove', 'inform', 'data', 'scientist', 'synergi', 'nlp', 'data', 'scienc', 'open', 'door', 'innov', 'drive', 'advanc', 'field', 'like', 'recommend', 'system', 'sentiment', 'analysi', 'beyond']


In [29]:
" ".join(stemmed)

'realm data scienc extract meaning insight vast dataset predict model natur languag process nlp field encompass spectrum scenario machin learn algorithm analyz custom behavior token text data discern pattern user process involv break textual inform token unlock treasur trove inform data scientist synergi nlp data scienc open door innov drive advanc field like recommend system sentiment analysi beyond'

In [30]:
[print(x, '-->', y) for x,y in zip(tokens_without_stopwords, stemmed)]

realm --> realm
data --> data
science --> scienc
extracting --> extract
meaningful --> meaning
insights --> insight
vast --> vast
datasets --> dataset
predictive --> predict
modeling --> model
natural --> natur
language --> languag
processing --> process
nlp --> nlp
field --> field
encompasses --> encompass
spectrum --> spectrum
scenario --> scenario
machine --> machin
learning --> learn
algorithm --> algorithm
analyzes --> analyz
customer --> custom
behavior --> behavior
tokenizing --> token
text --> text
data --> data
discern --> discern
patterns --> pattern
user --> user
process --> process
involves --> involv
breaking --> break
textual --> textual
information --> inform
tokens --> token
unlocking --> unlock
treasure --> treasur
trove --> trove
information --> inform
data --> data
scientists --> scientist
synergy --> synergi
nlp --> nlp
data --> data
science --> scienc
opens --> open
doors --> door
innovation --> innov
driving --> drive
advancements --> advanc
fields --> field
like 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [31]:
df = pd.DataFrame(np.column_stack([tokens_without_stopwords, lemmed, stemmed]), 
                  columns=['tokens_without_stopwords', 'lemmed', 'stemmed'])

print(df.to_markdown())

|    | tokens_without_stopwords   | lemmed         | stemmed   |
|---:|:---------------------------|:---------------|:----------|
|  0 | realm                      | realm          | realm     |
|  1 | data                       | data           | data      |
|  2 | science                    | science        | scienc    |
|  3 | extracting                 | extracting     | extract   |
|  4 | meaningful                 | meaningful     | meaning   |
|  5 | insights                   | insight        | insight   |
|  6 | vast                       | vast           | vast      |
|  7 | datasets                   | datasets       | dataset   |
|  8 | predictive                 | predictive     | predict   |
|  9 | modeling                   | modeling       | model     |
| 10 | natural                    | natural        | natur     |
| 11 | language                   | language       | languag   |
| 12 | processing                 | processing     | process   |
| 13 | nlp               

## All Together (Cleaning Function)

In [32]:
text

'In the realm of data science, extracting meaningful insights from vast datasets is an art.From predictive modeling to natural language processing (NLP), the field encompasses a spectrum of techniques.Consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.This process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.The synergy of NLP and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.'

In [33]:
def cleaning(data):

    #1. Tokenize and lower
    text_tokens = word_tokenize(data.lower())

    #2. Remove Puncs and numbers
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    #3. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    #4. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]

    #5. joining
    return " ".join(text_cleaned)

In [34]:
# Clean the text with CLEANING function

cleaned_text = pd.Series(text).apply(cleaning)
print (cleaned_text.to_string())

0    realm data science extracting meaningful insig...


In [35]:
cleaned_text.to_csv('output.txt', index=False)

with open('output.txt', 'r') as file:
    content = file.read()
    print(content)

0
realm data science extracting meaningful insight vast datasets predictive modeling natural language processing nlp field encompasses spectrum scenario machine learning algorithm analyzes customer behavior tokenizing text data discern pattern user process involves breaking textual information token unlocking treasure trove information data scientist synergy nlp data science open door innovation driving advancement field like recommendation system sentiment analysis beyond



In [36]:
" ".join(lemmed)

'realm data science extracting meaningful insight vast datasets predictive modeling natural language processing nlp field encompasses spectrum scenario machine learning algorithm analyzes customer behavior tokenizing text data discern pattern user process involves breaking textual information token unlocking treasure trove information data scientist synergy nlp data science open door innovation driving advancement field like recommendation system sentiment analysis beyond'

## CountVectorizer (Bag of Words)

- `CountVectorizer` - Bag of Words

**INSTRUCTIONS**

Creating a function `tokenize` like cleaning function that takes text and applies the following:
- convert to all lowercase,
- punctuation removal,
- numbers removal,
- word tokenization, 
- lemmatization, 
- and stop word removal using `nltk`

In [37]:
text

'In the realm of data science, extracting meaningful insights from vast datasets is an art.From predictive modeling to natural language processing (NLP), the field encompasses a spectrum of techniques.Consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.This process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.The synergy of NLP and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.'

In [38]:
corpus = sent_tokenize(text.lower())
print(corpus)

['in the realm of data science, extracting meaningful insights from vast datasets is an art.from predictive modeling to natural language processing (nlp), the field encompasses a spectrum of techniques.consider a scenario where a machine learning algorithm analyzes customer behavior, tokenizing text data to discern patterns in user preferences.this process involves breaking down textual information into tokens, unlocking a treasure trove of information for data scientists to explore.the synergy of nlp and data science opens doors to innovation, driving advancements in fields like recommendation systems, sentiment analysis, and beyond.']


In [39]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

In [40]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize andremove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [41]:
# Import necessary packages

from sklearn.feature_extraction.text import CountVectorizer

In [42]:
# Initialize count vectorizer object (Use TOKENIZE function as tokenizer)

count_vectorizer = CountVectorizer(tokenizer=tokenize)

In [43]:
# Create X object by applying fit_transform with count_vectorizer object to corpus

X = count_vectorizer.fit_transform(corpus)

In [45]:
X

<1x56 sparse matrix of type '<class 'numpy.int64'>'
	with 56 stored elements in Compressed Sparse Row format>

In [46]:
count_vectorizer.get_feature_names_out()

array(['advancement', 'algorithm', 'analysis', 'analyzes', 'art',
       'behavior', 'beyond', 'breaking', 'consider', 'customer', 'data',
       'datasets', 'discern', 'door', 'driving', 'encompasses', 'explore',
       'extracting', 'field', 'information', 'innovation', 'insight',
       'involves', 'language', 'learning', 'like', 'machine',
       'meaningful', 'modeling', 'natural', 'nlp', 'open', 'pattern',
       'predictive', 'preference', 'process', 'processing', 'realm',
       'recommendation', 'scenario', 'science', 'scientist', 'sentiment',
       'spectrum', 'synergy', 'system', 'technique', 'text', 'textual',
       'token', 'tokenizing', 'treasure', 'trove', 'unlocking', 'user',
       'vast'], dtype=object)

In [47]:
# Convert sparse matrix (X) to numpy array to view

X.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [48]:
# View token vocabulary and counts

count_vectorizer.vocabulary_

{'realm': 37,
 'data': 10,
 'science': 40,
 'extracting': 17,
 'meaningful': 27,
 'insight': 21,
 'vast': 55,
 'datasets': 11,
 'art': 4,
 'predictive': 33,
 'modeling': 28,
 'natural': 29,
 'language': 23,
 'processing': 36,
 'nlp': 30,
 'field': 18,
 'encompasses': 15,
 'spectrum': 43,
 'technique': 46,
 'consider': 8,
 'scenario': 39,
 'machine': 26,
 'learning': 24,
 'algorithm': 1,
 'analyzes': 3,
 'customer': 9,
 'behavior': 5,
 'tokenizing': 50,
 'text': 47,
 'discern': 12,
 'pattern': 32,
 'user': 54,
 'preference': 34,
 'process': 35,
 'involves': 22,
 'breaking': 7,
 'textual': 48,
 'information': 19,
 'token': 49,
 'unlocking': 53,
 'treasure': 51,
 'trove': 52,
 'scientist': 41,
 'explore': 16,
 'synergy': 44,
 'open': 31,
 'door': 13,
 'innovation': 20,
 'driving': 14,
 'advancement': 0,
 'like': 25,
 'recommendation': 38,
 'system': 45,
 'sentiment': 42,
 'analysis': 2,
 'beyond': 6}

## TF-IDF

### TfidfTransformer

- `TfidfTransformer` - TF-IDF values

In [49]:
# Import necessary packages

from sklearn.feature_extraction.text import TfidfTransformer

In [50]:
# Initialize tfidf_transformer object

tfidf_transformer = TfidfTransformer(smooth_idf=False)

In [51]:
# Apply fit_transform with tfidf_transformer object to X

X = tfidf_transformer.fit_transform(X)

In [52]:
# Convert sparse matrix to numpy array to view

X.toarray()

array([[0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.43905704, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.21952852, 0.21952852,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.21952852, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.21952852, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426, 0.10976426, 0.10976426, 0.10976426, 0.10976426,
        0.10976426]])

### TfidfVectorizer

- `TfidfVectorizer` - Bag of Words AND TF-IDF values
- `TfidfVectorizer` = `CountVectorizer` + `TfidfTransformer`

In [53]:
# Import necesary packages

from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
# Initialize `tfidf_vectorizer` object

tfidf_vectorizer = TfidfVectorizer()

In [55]:
# Create `X` object by applying `fit_transform` with `tfidf_vectorizer` object to `corpus`,

X = tfidf_vectorizer.fit_transform(corpus)

In [56]:
tfidf_vectorizer.get_feature_names_out()

array(['advancements', 'algorithm', 'an', 'analysis', 'analyzes', 'and',
       'art', 'behavior', 'beyond', 'breaking', 'consider', 'customer',
       'data', 'datasets', 'discern', 'doors', 'down', 'driving',
       'encompasses', 'explore', 'extracting', 'field', 'fields', 'for',
       'from', 'in', 'information', 'innovation', 'insights', 'into',
       'involves', 'is', 'language', 'learning', 'like', 'machine',
       'meaningful', 'modeling', 'natural', 'nlp', 'of', 'opens',
       'patterns', 'predictive', 'preferences', 'process', 'processing',
       'realm', 'recommendation', 'scenario', 'science', 'scientists',
       'sentiment', 'spectrum', 'synergy', 'systems', 'techniques',
       'text', 'textual', 'the', 'this', 'to', 'tokenizing', 'tokens',
       'treasure', 'trove', 'unlocking', 'user', 'vast', 'where'],
      dtype=object)

In [57]:
# Convert sparse matrix (X) to numpy array to view

X.toarray()

array([[0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.16552118, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.33104236, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.16552118,
        0.24828177, 0.16552118, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.16552118,
        0.33104236, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.16552118, 0.08276059, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.24828177,
        0.08276059, 0.33104236, 0.08276059, 0.08276059, 0.08276059,
        0.08276059, 0.08276059, 0.08276059, 0.08276059, 0.08276059]])

### END OF THE PROJECT