In [1]:
import nltk
from nltk.tokenize import word_tokenize     #Tokenizes text into words.
from nltk.corpus import stopwords            #Provides a list of common stopwords in various languages.
from nltk.stem import PorterStemmer, WordNetLemmatizer     #Used for stemming and lemmatization, 
from nltk import pos_tag             #Assigns parts of speech to words in a text.

In [2]:
# Sample sentence used as the single input document for every step of this notebook.
text = "Natural Language Processing is the subfield of linguistic, computer science and artificial intelligence concerned with the interaction between human and computer"

In [3]:
import re
# Strip punctuation: delete every character that is neither a word character (\w)
# nor whitespace (\s). Note that \w also matches "_", so underscores are kept.
filtered_text = re.sub(r'[^\w\s]','',text)
print(filtered_text)

Natural Language Processing is the subfield of linguistic computer science and artificial intelligence concerned with the interaction between human and computer


In [4]:
# Tokenization: split the punctuation-free text into a list of word tokens.
tokens = word_tokenize(filtered_text)
print(tokens)

['Natural', 'Language', 'Processing', 'is', 'the', 'subfield', 'of', 'linguistic', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interaction', 'between', 'human', 'and', 'computer']


In [5]:
# Fetch the NLTK stopword corpus used in the next cell.
# NOTE(review): word_tokenize, pos_tag and WordNetLemmatizer also rely on NLTK data
# packages (tokenizer models, POS tagger, WordNet) that are never downloaded in this
# notebook -- presumably they were already installed locally; confirm on a fresh environment.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhargavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Removing stopwords.
# Stopwords are common words that do not carry significant meaning in the context of
# text analysis. They are removed to focus on meaningful words.
stop_words = set(stopwords.words("english"))

# Each token is lower-cased before the lookup, so capitalised tokens are matched too;
# the tokens that are kept retain their original casing.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'human', 'computer']


In [7]:
# POS tagging: pair every remaining token with a part-of-speech tag, giving (word, tag) tuples.
# NOTE(review): tagging runs on the stopword-filtered tokens, so the tagger no longer sees the
# original sentence context -- this may explain odd tags such as 'subfield' -> 'VBD' in the
# output; consider tagging `tokens` first and filtering afterwards (TODO confirm).
pos_tagged = pos_tag(filtered_tokens)
print(pos_tagged)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('subfield', 'VBD'), ('linguistic', 'JJ'), ('computer', 'NN'), ('science', 'NN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('concerned', 'VBN'), ('interaction', 'NN'), ('human', 'JJ'), ('computer', 'NN')]


In [8]:
# Stemming: reduce every token to its root or base form (stem).
# As the output shows, stems are lower-cased and need not be dictionary words
# (e.g. 'natur', 'languag').
p = PorterStemmer()
stem = [p.stem(token) for token in filtered_tokens]
print(stem)

['natur', 'languag', 'process', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'human', 'comput']


In [9]:
# Lemmatization: reduce every token to its dictionary form (lemma).
# No part of speech is passed here, so the lemmatizer falls back to its default
# (noun in NLTK); that is why 'concerned' comes back unchanged in the output.
l = WordNetLemmatizer()
lemma = [l.lemmatize(token) for token in filtered_tokens]
print(lemma)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'human', 'computer']


In [10]:
def pos(x):
    """Map an NLTK POS tag to the one-letter POS code given to the WordNet lemmatizer.

    Args:
        x: POS tag string, such as the tags returned by ``pos_tag``.

    Returns:
        'v' if the tag starts with 'V', 'r' if it starts with 'R',
        'a' if it starts with 'J', and 'n' for every other tag.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_code = (('V', 'v'), ('R', 'r'), ('J', 'a'))
    for prefix, code in prefix_to_code:
        if x.startswith(prefix):
            return code
    # Any tag not covered above is treated as a noun.
    return 'n'

In [11]:
# POS-aware lemmatization: each NLTK tag is mapped to a WordNet POS code with pos()
# and passed to the lemmatizer, which gives more accurate lemmas
# (e.g. 'concerned' -> 'concern' in the output).
l = WordNetLemmatizer()
lemma = [l.lemmatize(word, pos(tag)) for word, tag in pos_tagged]
print(lemma)

['Natural', 'Language', 'Processing', 'subfield', 'linguistic', 'computer', 'science', 'artificial', 'intelligence', 'concern', 'interaction', 'human', 'computer']


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer          #Term Frequency-Inverse Document Frequency

In [13]:
# Join the lemmas back into one space-separated string to use as a document.
data = " ".join(lemma)

# Fit the vectorizer on a corpus that contains only this one document;
# the resulting matrix therefore has a single row.
tv = TfidfVectorizer()
tfidf_matrix = tv.fit_transform([data])

In [14]:
# Vocabulary terms learned by the vectorizer (lower-cased, as the output shows).
features = tv.get_feature_names_out()

# Pair each term with its weight in row 0, the only document in the matrix.
tfidf_score = dict(zip(features, tfidf_matrix.toarray()[0]))

# 'computer' occurs twice in the document, so its weight is twice that of the other terms.
# The row is L2-normalised by default: 11 terms with count 1 and one with count 2 give a
# norm of sqrt(15), hence 1/sqrt(15) ~= 0.258 for the terms that occur once.
for word, score in tfidf_score.items():
    print(f"{word} --->  {score}")

artificial --->  0.2581988897471611
computer --->  0.5163977794943222
concern --->  0.2581988897471611
human --->  0.2581988897471611
intelligence --->  0.2581988897471611
interaction --->  0.2581988897471611
language --->  0.2581988897471611
linguistic --->  0.2581988897471611
natural --->  0.2581988897471611
processing --->  0.2581988897471611
science --->  0.2581988897471611
subfield --->  0.2581988897471611


In [15]:
# Same computation as above with use_idf spelled out explicitly; True is the default,
# so the scores are identical to the previous output.
data = " ".join(lemma)

tv = TfidfVectorizer(use_idf=True)
tfidf_matrix = tv.fit_transform([data])
# get_feature_names() was the older name of this method; it was removed in
# scikit-learn 1.2, so get_feature_names_out() is the one to use.
features = tv.get_feature_names_out()

# Pair each term with its weight in row 0, the only document in the matrix.
tfidf_score = dict(zip(features, tfidf_matrix.toarray()[0]))

for word, score in tfidf_score.items():
    print(f"{word} --->  {score}")

artificial --->  0.2581988897471611
computer --->  0.5163977794943222
concern --->  0.2581988897471611
human --->  0.2581988897471611
intelligence --->  0.2581988897471611
interaction --->  0.2581988897471611
language --->  0.2581988897471611
linguistic --->  0.2581988897471611
natural --->  0.2581988897471611
processing --->  0.2581988897471611
science --->  0.2581988897471611
subfield --->  0.2581988897471611


In [16]:
# Same computation with the IDF weighting switched off (normalised term frequency only).
# The corpus holds a single document, so every term has document frequency 1 and the
# default smoothed IDF is 1 for all terms -- disabling it therefore changes nothing and
# the scores match the previous outputs. A corpus with several documents is needed to
# see a difference.
data = " ".join(lemma)

tv = TfidfVectorizer(use_idf=False)
tfidf_matrix = tv.fit_transform([data])
features = tv.get_feature_names_out()

# Pair each term with its weight in row 0, the only document in the matrix.
tfidf_score = dict(zip(features, tfidf_matrix.toarray()[0]))

for word, score in tfidf_score.items():
    print(f"{word} --->  {score}")

artificial --->  0.2581988897471611
computer --->  0.5163977794943222
concern --->  0.2581988897471611
human --->  0.2581988897471611
intelligence --->  0.2581988897471611
interaction --->  0.2581988897471611
language --->  0.2581988897471611
linguistic --->  0.2581988897471611
natural --->  0.2581988897471611
processing --->  0.2581988897471611
science --->  0.2581988897471611
subfield --->  0.2581988897471611


In [17]:
# This code preprocesses the text by removing punctuation, tokenizing it, removing stopwords, and applying POS tagging, stemming and lemmatization; it finally computes the TF-IDF score of each word in the document.

In [18]:
# Tokenization is the process of breaking down text into smaller units, typically words or tokens.
# Stemming is the process of reducing words to their root or base form (stem), often by removing suffixes or prefixes.
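# Lemmatization is similar to stemming but more sophisticated. It reduces a word to its base or dictionary form (lemma) based on its meaning and context, in particular its part of speech.
# For example, the word "better" would be lemmatized to "good" when treated as an adjective, and "running" would be lemmatized to "run" when treated as a verb; without the part of speech the lemmatizer may leave a word unchanged.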