In [3]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer , WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from nltk.corpus import stopwords

In [8]:
# Fetch the NLTK resources the cells below rely on:
#   punkt                       -> word_tokenize / sent_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   stopwords                   -> stopwords.words('english')
#   wordnet                     -> WordNetLemmatizer
# The calls were previously wrapped in a string literal, so nothing was
# downloaded and a fresh environment failed with LookupError further down.
# Re-running is cheap: packages that are already present are reported as
# up-to-date and not fetched again.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Sample sentence used by every preprocessing step below
# (note the trailing space, kept from the original input).
text = (
    'RPA or Robotic Process Automation (RPA) refers to specialized software '
    'that can simulate actual human interaction with the Information System '
    '(IS) to carry out business processes. '
)

In [21]:
# Word-level tokenization: split the sample text into word and punctuation tokens.
token=word_tokenize(text)
# Bare expression so the notebook displays the token list.
token

['RPA',
 'or',
 'Robotic',
 'Process',
 'Automation',
 '(',
 'RPA',
 ')',
 'refers',
 'to',
 'specialized',
 'software',
 'that',
 'can',
 'simulate',
 'actual',
 'human',
 'interaction',
 'with',
 'the',
 'Information',
 'System',
 '(',
 'IS',
 ')',
 'to',
 'carry',
 'out',
 'business',
 'processes',
 '.']

In [22]:
# Sentence-level tokenization of the same text; the result is stored but not displayed.
sentence=sent_tokenize(text)

In [23]:
# POS tagging: pair every token with a part-of-speech tag,
# giving a list of (token, tag) tuples.
pos=nltk.pos_tag(token)
# Bare expression so the notebook displays the tagged tokens.
pos

[('RPA', 'NNP'),
 ('or', 'CC'),
 ('Robotic', 'NNP'),
 ('Process', 'NNP'),
 ('Automation', 'NNP'),
 ('(', '('),
 ('RPA', 'NNP'),
 (')', ')'),
 ('refers', 'NNS'),
 ('to', 'TO'),
 ('specialized', 'VB'),
 ('software', 'NN'),
 ('that', 'WDT'),
 ('can', 'MD'),
 ('simulate', 'VB'),
 ('actual', 'JJ'),
 ('human', 'JJ'),
 ('interaction', 'NN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('Information', 'NN'),
 ('System', 'NNP'),
 ('(', '('),
 ('IS', 'NNP'),
 (')', ')'),
 ('to', 'TO'),
 ('carry', 'VB'),
 ('out', 'RP'),
 ('business', 'NN'),
 ('processes', 'NNS'),
 ('.', '.')]

In [26]:
# Stop-word removal: drop words that carry little meaning on their own
# (e.g. 'the', 'is'). Tokens are lower-cased only for the lookup, so the
# surviving tokens keep their original casing. Punctuation is not a stop
# word and therefore stays in the list.
stop_words = set(stopwords.words('english'))
filtered_token = list(filter(lambda tok: tok.lower() not in stop_words, token))
filtered_token

['RPA',
 'Robotic',
 'Process',
 'Automation',
 '(',
 'RPA',
 ')',
 'refers',
 'specialized',
 'software',
 'simulate',
 'actual',
 'human',
 'interaction',
 'Information',
 'System',
 '(',
 ')',
 'carry',
 'business',
 'processes',
 '.']

In [30]:
# Stemming: strip suffixes to reduce each word to a base form
# (e.g. 'playing' -> 'play'). As the output shows, stems are lower-cased
# and need not be dictionary words (e.g. 'autom', 'softwar').
stemmer = PorterStemmer()
stemm_token = list(map(stemmer.stem, filtered_token))
stemm_token

['rpa',
 'robot',
 'process',
 'autom',
 '(',
 'rpa',
 ')',
 'refer',
 'special',
 'softwar',
 'simul',
 'actual',
 'human',
 'interact',
 'inform',
 'system',
 '(',
 ')',
 'carri',
 'busi',
 'process',
 '.']

In [36]:
# Lemmatization: map each word to its dictionary form.
# lemmatize() is called without a part-of-speech argument, so NLTK's
# default applies (noun, per the NLTK documentation); plural nouns are
# reduced ('processes' -> 'process') while verb forms such as 'refers'
# come back unchanged in the output below.
lemmatizer = WordNetLemmatizer()
lemmatize_token = list(map(lemmatizer.lemmatize, filtered_token))
lemmatize_token

['RPA',
 'Robotic',
 'Process',
 'Automation',
 '(',
 'RPA',
 ')',
 'refers',
 'specialized',
 'software',
 'simulate',
 'actual',
 'human',
 'interaction',
 'Information',
 'System',
 '(',
 ')',
 'carry',
 'business',
 'process',
 '.']

In [51]:
# Re-assemble the lemmatized tokens into one space-separated string,
# the input format the TF-IDF vectorizer expects for a document.
processed_text = str.join(' ', lemmatize_token)

In [52]:
# TF-IDF vectorizer created with all default settings (no arguments passed).
vectorizer = TfidfVectorizer()

In [64]:
# Term frequency (TF): how many times a word occurs in a document.
# Inverse document frequency (IDF): a measure of how rare a word is across
# the documents of the corpus.
# The corpus here is a one-element list, i.e. a single document, so every
# term ends up with the same IDF (see the idf_ output further down).
tidf = vectorizer.fit_transform([processed_text])

In [65]:
# Vocabulary terms learned by the fitted vectorizer; printed next to the
# TF-IDF matrix in the following cell.
feature = vectorizer.get_feature_names_out()

In [66]:
# Show the dense TF-IDF matrix followed by the feature names, one item per line.
print('Matrix', tidf.toarray(), feature, sep='\n')

Matrix
[[0.21821789 0.21821789 0.21821789 0.21821789 0.21821789 0.21821789
  0.21821789 0.43643578 0.21821789 0.21821789 0.43643578 0.21821789
  0.21821789 0.21821789 0.21821789]]
['actual' 'automation' 'business' 'carry' 'human' 'information'
 'interaction' 'process' 'refers' 'robotic' 'rpa' 'simulate' 'software'
 'specialized' 'system']


In [68]:
# IDF weights learned during fitting, displayed as the cell result.
# With a single-document corpus they are all 1.
vectorizer.idf_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [59]:
# Extra example: a two-document corpus in which the documents differ by
# one word, so the IDF values are no longer all equal.
doc1, doc2 = 'this is first document', 'this is second document'
string = [doc1, doc2]

In [60]:
# Fresh default vectorizer fitted on the two-document corpus. Rebinding
# `vectorizer` replaces the instance fitted on processed_text above.
vectorizer = TfidfVectorizer()
tfidfMatrix = vectorizer.fit_transform(string)

In [61]:
# Dense TF-IDF matrix (one row per document) followed by the feature names
# that label its columns.
feature = vectorizer.get_feature_names_out()
print(tfidfMatrix.toarray(), feature, sep='\n')

[[0.44832087 0.63009934 0.44832087 0.         0.44832087]
 [0.44832087 0.         0.44832087 0.63009934 0.44832087]]
['document' 'first' 'is' 'second' 'this']


In [62]:
# IDF weights for the two-document corpus. In the output below, the words
# found in only one document ('first', 'second') get the larger weight.
print(vectorizer.idf_)

[1.         1.40546511 1.         1.40546511 1.        ]


In [None]:
# Map each feature name to its IDF weight for the most recently fitted
# vectorizer. The original cell referenced a bare `idf`, a name that is
# never defined in this notebook, so it raised NameError on Run All.
dict(zip(feature, vectorizer.idf_))