In [1]:
text1="natural Language Processing is a subfield of AI"
tag1="NLP"
text2="Computer Vision is a subfield of AI"
tag2="CV"

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the texts and transform them into feature vectors
X = vectorizer.fit_transform([text1, text2])

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Get the frequency table as a matrix
frequency_table = X.toarray()

# Create a dictionary to store the frequency table
frequency_dict = {}

# Iterate over the feature names and frequency table to create the dictionary
for feature_index, feature_name in enumerate(feature_names):
    frequency_dict[feature_name] = frequency_table[:, feature_index]

# Print the frequency table
print("Word\tText1\tText2")
for feature_name in feature_names:
    print(f"{feature_name}\t{frequency_dict[feature_name][0]}\t{frequency_dict[feature_name][1]}")

Word	Text1	Text2
ai	1	1
computer	0	1
is	1	1
language	1	0
natural	1	0
of	1	1
processing	1	0
subfield	1	1
vision	0	1


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


# Define the two texts and their corresponding tags
text1 = "Natural Language Processing is a subfield of AI"
tag1 = "NLP"
text2 = "Computer Vision is a subfield of AI"
tag2 = "CV"

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the texts and transform them into feature vectors
X = vectorizer.fit_transform([text1, text2])

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Get the frequency table as a matrix
frequency_table = X.toarray()

# Create a DataFrame to hold the frequency table
df = pd.DataFrame(frequency_table, columns=feature_names)

# Add the text and tag columns
df['Text'] = [text1, text2]
df['Tag'] = [tag1, tag2]

# Print the DataFrame
print(df)

   ai  computer  is  language  natural  of  processing  subfield  vision  \
0   1         0   1         1        1   1           1         1       0   
1   1         1   1         0        0   1           0         1       1   

                                              Text  Tag  
0  Natural Language Processing is a subfield of AI  NLP  
1              Computer Vision is a subfield of AI   CV  


In [4]:
df

Unnamed: 0,ai,computer,is,language,natural,of,processing,subfield,vision,Text,Tag
0,1,0,1,1,1,1,1,1,0,Natural Language Processing is a subfield of AI,NLP
1,1,1,1,0,0,1,0,1,1,Computer Vision is a subfield of AI,CV


**Enhancing Vectorizer with Lemmatizer**

In [5]:
import nltk
nltk.download(['punkt','wordnet'])  #Lexical DB for english lang #punkt - tokenizer model for variuos lang
nltk.download('omw-1.4')  #Open Multilingual WordNet - 1.4 - WordNet for multiple languages

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
#punkt - Tokenizer
#wordnet - Lemmatizer

from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()

print(lemm.lemmatize("mouse"))
print(lemm.lemmatize("feet"))
print(lemm.lemmatize("misery"))
print(lemm.lemmatize("houses"))
print(lemm.lemmatize("caring"))
print(lemm.lemmatize("swanandi"))


mouse
foot
misery
house
caring
swanandi


In [7]:
sentence = "My grandma is very caring. The striped bat are hanging on their feet"

#Tokenization
li_words = nltk.word_tokenize(sentence)
print(li_words)

['My', 'grandma', 'is', 'very', 'caring', '.', 'The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'feet']


In [8]:
output = [lemm.lemmatize(w) for w in li_words]
print(output)

['My', 'grandma', 'is', 'very', 'caring', '.', 'The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot']


**Provide POS(Parts of Speech) tag as second argument to lemmatize()**

In [9]:
print(lemm.lemmatize("caring", "v"))

care


In [10]:
print(lemm.lemmatize("stripes", "v"))
print(lemm.lemmatize("stripes", "n"))
print(lemm.lemmatize("are", "v"))

strip
stripe
be


**WordNet Lemmatizer with POS tag**

In [11]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [12]:
def get_wordnet_pos_tag(word):
  tag = nltk.pos_tag([word])
  return tag

get_wordnet_pos_tag("caring")
get_wordnet_pos_tag("loudly")

[('loudly', 'RB')]

In [13]:
from nltk.corpus.reader import wordnet

def get_wordnet_pos_tag(word):
  tag = nltk.pos_tag([word])[0][1][0]
  tag_dict={
      "J":wordnet.ADJ,
      "R":wordnet.ADV,
      "N":wordnet.NOUN,
      "V":wordnet.VERB
  }
  print(tag)
  return tag_dict.get(tag, wordnet.NOUN) #Default tag -  POS tag - when the word NOT classified as any predefined POS

get_wordnet_pos_tag("caring")

V


'v'

In [14]:
lemm = WordNetLemmatizer()
word = "caring"
print(lemm.lemmatize(word,get_wordnet_pos_tag(word)))

V
care


In [15]:
lemm = WordNetLemmatizer()
word = "brave"
print(lemm.lemmatize(word,get_wordnet_pos_tag(word)))

N
brave


In [16]:
lemm = WordNetLemmatizer()
word = "striked"
print(lemm.lemmatize(word,get_wordnet_pos_tag(word)))

V
strike


In [17]:
print(lemm.lemmatize("hanging", "v"))


hang


In [18]:
sentence = "My grandma is very caring. The striped bat are hanging on their feet"

#Tokenization
li_words = nltk.word_tokenize(sentence)
print(li_words)

output = [lemm.lemmatize(w,get_wordnet_pos_tag(w)) for w in li_words]
print(output)

['My', 'grandma', 'is', 'very', 'caring', '.', 'The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'feet']
P
N
V
R
V
.
D
V
N
V
V
I
P
N
['My', 'grandma', 'be', 'very', 'care', '.', 'The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot']


**Spacy Lemmatizer**

In [19]:
import spacy
sp_nlp = spacy.load('en_core_web_sm')
sentence = "My grandma is very caring. The striped bat are hanging on their feet"

#Parse the sentence using the lang model -  'en_core_web_sm'
doc = sp_nlp(sentence)
print(doc)

#Extract lemma for each token in sentence
output = [token.lemma_ for token in doc]
print(output)

My grandma is very caring. The striped bat are hanging on their feet
['my', 'grandma', 'be', 'very', 'caring', '.', 'the', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot']
