# Text Mining and NLP Concepts

# Tokenization

In [1]:
import re

In [2]:
sentence = 'Tokenization is a fundamental step in natural language processing that breaks down a sentence into individual components, such as words, punctuation marks, and sometimes phrases, to make text easier to analyze and process by computers.'

In [3]:
sentence.split()
re.sub(r'([^\s\w]|_)+',' ',sentence).split()

['Tokenization',
 'is',
 'a',
 'fundamental',
 'step',
 'in',
 'natural',
 'language',
 'processing',
 'that',
 'breaks',
 'down',
 'a',
 'sentence',
 'into',
 'individual',
 'components',
 'such',
 'as',
 'words',
 'punctuation',
 'marks',
 'and',
 'sometimes',
 'phrases',
 'to',
 'make',
 'text',
 'easier',
 'to',
 'analyze',
 'and',
 'process',
 'by',
 'computers']

## Extracting n-grams
## n-grams can be extracted using 3 different techniques:
### 1. Custom defined function
### 2. NLTK
### 3. TextBlob

## Extracting n-grams using a custom-defined function

In [4]:
import re

In [5]:
def n_gram_extractor(input_str,n):
    """Print every window of ``n`` consecutive word tokens in ``input_str``.

    Runs of punctuation and underscores are replaced by a single space
    before the text is split on whitespace. Each n-gram is printed as a
    list, one per line; nothing is returned.
    """
    words = re.sub(r'([^\s\w]|_)+',' ',input_str).split()
    # Number of start positions that still leave room for a full window.
    window_count = len(words) - n + 1
    start = 0
    while start < window_count:
        print(words[start:start + n])
        start += 1

In [6]:
n_gram_extractor('The cute little boy is playing with the kitten.',2)

['The', 'cute']
['cute', 'little']
['little', 'boy']
['boy', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']


In [7]:
n_gram_extractor('The cute little boy is playing with the kitten.',3)

['The', 'cute', 'little']
['cute', 'little', 'boy']
['little', 'boy', 'is']
['boy', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


## Extracting n-grams using NLTK

In [8]:
from nltk import ngrams
from nltk import word_tokenize

In [9]:
words = word_tokenize("I am reading NLP Fundamentals")
print(words)

['I', 'am', 'reading', 'NLP', 'Fundamentals']


In [10]:
list(ngrams('The cute little boy is playing with the kitten.'.split(),1))

[('The',),
 ('cute',),
 ('little',),
 ('boy',),
 ('is',),
 ('playing',),
 ('with',),
 ('the',),
 ('kitten.',)]

In [11]:
list(ngrams('The cute little boy is playing with the kitten.'.split(),2))

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'boy'),
 ('boy', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten.')]

In [12]:
list(ngrams("The cute little boy is playing with the kitten.".split(),3))

[('The', 'cute', 'little'),
 ('cute', 'little', 'boy'),
 ('little', 'boy', 'is'),
 ('boy', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten.')]

## Extracting n-grams using TextBlob
### TextBlob is a Python library for processing textual data.

In [13]:
from textblob import TextBlob
blob = TextBlob("The cute little boy is playing with the kitten.")

In [14]:
blob.ngrams(n = 2)

[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'boy']),
 WordList(['boy', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'with']),
 WordList(['with', 'the']),
 WordList(['the', 'kitten'])]

In [15]:
blob.ngrams(n = 3)

[WordList(['The', 'cute', 'little']),
 WordList(['cute', 'little', 'boy']),
 WordList(['little', 'boy', 'is']),
 WordList(['boy', 'is', 'playing']),
 WordList(['is', 'playing', 'with']),
 WordList(['playing', 'with', 'the']),
 WordList(['with', 'the', 'kitten'])]

In [16]:
blob.ngrams(n =4)

[WordList(['The', 'cute', 'little', 'boy']),
 WordList(['cute', 'little', 'boy', 'is']),
 WordList(['little', 'boy', 'is', 'playing']),
 WordList(['boy', 'is', 'playing', 'with']),
 WordList(['is', 'playing', 'with', 'the']),
 WordList(['playing', 'with', 'the', 'kitten'])]

## Tokenization using Keras

In [17]:
sentence1 = "The Indian-American scientist was chosen for the award from a list of nearly 50,000 nominations. The Padma Shri is India’s fourth largest civilian award and was given to Kak owing to his research in #AI and #cryptography."

In [18]:
from keras.preprocessing.text import text_to_word_sequence
text_to_word_sequence(sentence1)

['the',
 'indian',
 'american',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50',
 '000',
 'nominations',
 'the',
 'padma',
 'shri',
 'is',
 'india’s',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 'ai',
 'and',
 'cryptography']

## Tokenize sentences using other nltk tokenizers:
### 1. Tweet Tokenizer
### 2. MWE Tokenizer (Multi-Word Expression)
### 3. Regexp Tokenizer
### 4. Whitespace Tokenizer
### 5. Word Punct Tokenizer

## 1. Tweet Tokenizer

In [19]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence1)

['The',
 'Indian-American',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50,000',
 'nominations',
 '.',
 'The',
 'Padma',
 'Shri',
 'is',
 'India',
 '’',
 's',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'Kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 '#AI',
 'and',
 '#cryptography',
 '.']

## 2. MWE Tokenizer (Multi-Word Expression Tokenizer)

In [20]:
from nltk.tokenize import MWETokenizer

In [21]:
# Each multi-word expression must be a tuple of tokens. ('Indian-American')
# without a trailing comma is just a string, which would be registered as a
# sequence of single-character tokens, so it is written as a 1-tuple here.
mwe_tokenizer = MWETokenizer([('Indian-American',), ('Padma', 'Shri')])
# The input is pre-split on whitespace; tokens that form a registered
# expression are merged into a single token joined with '_'.
mwe_tokenizer.tokenize(sentence1.split())

['The',
 'Indian-American',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50,000',
 'nominations.',
 'The',
 'Padma_Shri',
 'is',
 'India’s',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'Kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 '#AI',
 'and',
 '#cryptography.']

## 3. RegExp Tokenizer

In [22]:
from nltk.tokenize import RegexpTokenizer
# Raw string so the regex escapes (\w, \$, \d, \S) reach the regex engine
# unchanged instead of being parsed as invalid string escape sequences.
# Alternatives, tried in order: a run of word characters, a dollar amount,
# or any other run of non-whitespace characters.
reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence1)

['The',
 'Indian',
 '-American',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50',
 ',000',
 'nominations',
 '.',
 'The',
 'Padma',
 'Shri',
 'is',
 'India',
 '’s',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'Kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 '#AI',
 'and',
 '#cryptography.']

## 4. Whitespace Tokenizer

In [23]:
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence1)

['The',
 'Indian-American',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50,000',
 'nominations.',
 'The',
 'Padma',
 'Shri',
 'is',
 'India’s',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'Kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 '#AI',
 'and',
 '#cryptography.']

## 5. WordPunct Tokenizer

In [24]:
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence1)

['The',
 'Indian',
 '-',
 'American',
 'scientist',
 'was',
 'chosen',
 'for',
 'the',
 'award',
 'from',
 'a',
 'list',
 'of',
 'nearly',
 '50',
 ',',
 '000',
 'nominations',
 '.',
 'The',
 'Padma',
 'Shri',
 'is',
 'India',
 '’',
 's',
 'fourth',
 'largest',
 'civilian',
 'award',
 'and',
 'was',
 'given',
 'to',
 'Kak',
 'owing',
 'to',
 'his',
 'research',
 'in',
 '#',
 'AI',
 'and',
 '#',
 'cryptography',
 '.']

## Sentence Tokenization

In [25]:
from nltk.tokenize import sent_tokenize
sent_tokenize("We are learning NLP in Python. Python is a very useful tool in DS. We love NLP ")

['We are learning NLP in Python.',
 'Python is a very useful tool in DS.',
 'We love NLP']

## Parts of Speech Tagging (POS Tagging)

In [26]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Swapnil Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('I', 'PRP'),
 ('am', 'VBP'),
 ('reading', 'VBG'),
 ('NLP', 'NNP'),
 ('Fundamentals', 'NNS')]

## Stop Words

In [27]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Swapnil
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# The stop-word lists are stored under lowercase language names; 'English'
# only resolves on case-insensitive file systems, so use the lowercase id.
stop_words = stopwords.words('english')
print(stop_words)
len(stop_words) # 179 predefined English stop words in the installed NLTK data

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

179

In [29]:
sentence2 = "I am learning NLP. It is one of the most popular library in Python"

In [30]:
sentence_words = word_tokenize(sentence2)
print(sentence_words)

['I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'one', 'of', 'the', 'most', 'popular', 'library', 'in', 'Python']


## Filtering stop words from the input string

In [31]:
# Keep only the tokens that are not in the stop-word list and re-join them
# with single spaces.
# NOTE: the membership test is case-sensitive and the stop-word list is
# lowercase, so capitalized tokens such as 'I' and 'It' are not removed.
sentence_no_stops = ' '.join([word for word in sentence_words if word not in stop_words])
print(sentence_no_stops)

I learning NLP . It one popular library Python


## Text Normalization

In [32]:
# Replace words in string
sentence3 = "I visited NY from IND on 31-12-24"

normalized_sentence = sentence3.replace("NY", "New York").replace("IND","India").replace("-24","-2024")
print(normalized_sentence)

I visited New York from India on 31-12-2024


## Spelling Corrections

In [33]:
from autocorrect import Speller

In [34]:
spell = Speller(lang = 'en')
help(Speller)

Help on class Speller in module autocorrect:

class Speller(builtins.object)
 |  Speller(lang='en', threshold=0, nlp_data=None, fast=False, only_replacements=False)
 |  
 |  Methods defined here:
 |  
 |  __call__ = autocorrect_sentence(self, sentence)
 |  
 |  __init__(self, lang='en', threshold=0, nlp_data=None, fast=False, only_replacements=False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  autocorrect_sentence(self, sentence)
 |  
 |  autocorrect_word(self, word)
 |      most likely correction for everything up to a double typo
 |  
 |  existing(self, words)
 |      {'the', 'teh'} => {'the'}
 |  
 |  get_candidates(self, word)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [35]:
spell("Natureal") # Correct spelling is printed

'Natural'

In [36]:
sent1 = word_tokenize("Ntural Luanguage Processin deals with the art of extracting insightes from Natural Languaes")
print(sent1)

['Ntural', 'Luanguage', 'Processin', 'deals', 'with', 'the', 'art', 'of', 'extracting', 'insightes', 'from', 'Natural', 'Languaes']


In [37]:
sentence_corrected = ' '.join([spell(word) for word in sent1])
print(sentence_corrected)

Natural Language Processing deals with the art of extracting insights from Natural Languages


## Stemming

In [38]:
import nltk
stemmer = nltk.stem.PorterStemmer()

In [39]:
stemmer.stem("Programming")

'program'

In [40]:
stemmer.stem("Programs")

'program'

In [41]:
stemmer.stem("Jumping")

'jump'

In [42]:
stemmer.stem("Jumps")

'jump'

In [43]:
stemmer.stem("battling") # battl is not an actual word as stemming doesn't check in dictionary

'battl'

In [44]:
stemmer.stem('amazing')

'amaz'

### Porter Stemmer

In [45]:
sent2 = "Before eating, it would be nice to sanitize your hands with a sanitizer"

In [46]:
from nltk.stem.porter import PorterStemmer

In [47]:
ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sent2.split()])

'befor eating, it would be nice to sanit your hand with a sanit'

### Regexp Stemmer

In [48]:
sent3 = "I love playing Cricket. Cricket players practice hard in their inning"

In [49]:
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')
' '.join([regex_stemmer.stem(wd) for wd in sent3.split()])

'I love play Cricket. Cricket players practice hard in their inn'

## Lemmatization

In [50]:
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to C:\Users\Swapnil
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
lemmatizer = WordNetLemmatizer()

In [52]:
lemmatizer.lemmatize("Programming")

'Programming'

In [53]:
lemmatizer.lemmatize('Programs')

'Programs'

In [54]:
lemmatizer.lemmatize('battling')

'battling'

In [55]:
lemmatizer.lemmatize('amazing')

'amazing'

In [56]:
from earthy.nltk_wrappers import lemmatize_sent



In [57]:
sent4 = "The codes executed today are far better than what we execute generally."

In [58]:
lemmatize_sent(sent4)
words, lemmas, tags = zip(*lemmatize_sent(sent4))
lemmas

('The',
 'code',
 'execute',
 'today',
 'be',
 'far',
 'good',
 'than',
 'what',
 'we',
 'execute',
 'generally',
 '.')

## Singularize and Pluralize words

In [59]:
from textblob import TextBlob

In [60]:
sent5 = TextBlob('She sells seashells on the seashore')

In [61]:
sent5.words

WordList(['She', 'sells', 'seashells', 'on', 'the', 'seashore'])

In [62]:
sent5.words[2].singularize()

'seashell'

In [63]:
sent5.words[5].pluralize()

'seashores'

## Language Translation

In [64]:
# From Spanish to English
from textblob import TextBlob
en_blob = TextBlob(u'muy bien')
en_blob.translate(from_lang = 'es', to = 'en') 

TextBlob("very good")

## Custom Stop Words removal

In [65]:
from nltk import word_tokenize

In [66]:
sent5 = "She sells seashells on the seashore"

In [67]:
custom_stop_word_list = ['she' , 'on' , 'the' , 'am' ,'is','not']
' '.join([word for word in word_tokenize(sent5) if word.lower() not in custom_stop_word_list])

'sells seashells seashore'

## Extracting general features from raw texts

#### Number of words
#### Detect presence of wh words
#### Polarity
#### Subjectivity
#### Language identification

In [68]:
import pandas as pd
df = pd.DataFrame([['The vaccine for covid-19 will be announced on 1st August.'],
                   ['Do you know how much expectation the world population is having from this research?'],
                   ['This risk of virus will end on 31st July.']])

In [69]:
df.columns = ['text']
df

Unnamed: 0,text
0,The vaccine for covid-19 will be announced on ...
1,Do you know how much expectation the world pop...
2,This risk of virus will end on 31st July.


## Number of words

In [70]:
from textblob import TextBlob
df['number_of_words'] = df['text'].apply(lambda x : len(TextBlob(x).words))
df['number_of_words']

0    10
1    14
2     9
Name: number_of_words, dtype: int64

### Detect presence of wh words

In [71]:
# Lowercase set of question ("wh") words to look for.
wh_words = {'why', 'who', 'which', 'what', 'where', 'when', 'how'}
# Flag a text when any of its words is a wh-word. Words are lowercased first so
# that a capitalized sentence opener such as 'Where' is detected as well.
df['are_wh_words_present'] = df['text'].apply(
    lambda x: not wh_words.isdisjoint(word.lower() for word in TextBlob(str(x)).words)
)
df['are_wh_words_present']

0    False
1     True
2    False
Name: are_wh_words_present, dtype: bool

### Polarity

In [72]:
df['polarity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df['polarity']

0    0.0
1    0.2
2    0.0
Name: polarity, dtype: float64

### Subjectivity

In [73]:
df['subjectivity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
df['subjectivity']

0    0.0
1    0.2
2    0.0
Name: subjectivity, dtype: float64

### Language Detector

In [74]:
# Install the required package (the code below imports `langdetect` directly,
# not spaCy):
# pip install langdetect

from langdetect import detect, detect_langs

# Input text
# text = 'This is an English text.'
text = 'muy bien'

# Detect the language
# NOTE(review): langdetect results may vary between runs unless a seed is set
# via DetectorFactory.seed - confirm against the langdetect README.
detected_language = detect(text)  # Returns the language code
detected_probabilities = detect_langs(text)  # Returns candidate languages with their probabilities

print(f"Detected Language: {detected_language}")
print(f"Probabilities: {detected_probabilities}")

Detected Language: es
Probabilities: [es:0.9999951671528979]


## Bag of Words

In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['At least seven Indian pharma companies are working to develop a vaccine against coronavirus',
'the deadly virus that has already infected more than 14 million globally.',
'Bharat Biotech, Indian Immunologicals, are among the domestic pharma firms working on the coronavirus vaccines in India.'
]

In [76]:
bag_of_words_model = CountVectorizer()
print(bag_of_words_model.fit_transform(corpus).todense())

[[0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1]
 [1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0]
 [0 0 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 2 0 0 1 0 1]]


In [77]:
# Build the count matrix and take the column labels from the fitted model
# itself, so every label is guaranteed to line up with its matrix column
# (same approach as the max_features example further down).
bag_of_words_df = pd.DataFrame(bag_of_words_model.fit_transform(corpus).todense(),
                               columns=bag_of_words_model.get_feature_names_out())
bag_of_words_df.head()

Unnamed: 0,14,against,already,among,are,at,bharat,biotech,companies,coronavirus,...,pharma,seven,than,that,the,to,vaccine,vaccines,virus,working
0,0,1,0,0,1,1,0,0,1,1,...,1,1,0,0,0,1,1,0,0,1
1,1,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,1,0
2,0,0,0,1,1,0,1,1,0,1,...,1,0,0,0,2,0,0,1,0,1


In [78]:
# Initialize CountVectorizer for top 5 frequent terms
bag_of_words_model_small = CountVectorizer(max_features=5)

# Transform the corpus into a Bag of Words representation
bag_of_words_matrix = bag_of_words_model_small.fit_transform(corpus)

# Convert the matrix to a DataFrame
bag_of_words_df_small = pd.DataFrame(bag_of_words_matrix.todense(), 
                                      columns=bag_of_words_model_small.get_feature_names_out())

# Display the first few rows
print(bag_of_words_df_small.head())

   are  coronavirus  indian  the  working
0    1            1       1    0        1
1    0            0       0    1        0
2    1            1       1    2        1


## TFIDF

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
tfidf_model = TfidfVectorizer()
print(tfidf_model.fit_transform(corpus).todense())

[[0.         0.30300252 0.         0.         0.23044123 0.30300252
  0.         0.         0.30300252 0.23044123 0.         0.30300252
  0.         0.         0.         0.         0.         0.
  0.         0.23044123 0.         0.30300252 0.         0.
  0.         0.23044123 0.30300252 0.         0.         0.
  0.30300252 0.30300252 0.         0.         0.23044123]
 [0.29388386 0.         0.29388386 0.         0.         0.
  0.         0.         0.         0.         0.29388386 0.
  0.         0.         0.29388386 0.29388386 0.         0.
  0.         0.         0.29388386 0.         0.29388386 0.29388386
  0.         0.         0.         0.29388386 0.29388386 0.22350625
  0.         0.         0.         0.29388386 0.        ]
 [0.         0.         0.         0.25644739 0.19503485 0.
  0.25644739 0.25644739 0.         0.19503485 0.         0.
  0.25644739 0.25644739 0.         0.         0.25644739 0.25644739
  0.25644739 0.19503485 0.         0.         0.         0.
  0.

In [81]:
# Build the TF-IDF matrix and label its columns with the feature names
# reported by the fitted vectorizer, so labels and columns cannot drift apart.
tfidf_df = pd.DataFrame(tfidf_model.fit_transform(corpus).todense(),
                        columns=tfidf_model.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,14,against,already,among,are,at,bharat,biotech,companies,coronavirus,...,pharma,seven,than,that,the,to,vaccine,vaccines,virus,working
0,0.0,0.303003,0.0,0.0,0.230441,0.303003,0.0,0.0,0.303003,0.230441,...,0.230441,0.303003,0.0,0.0,0.0,0.303003,0.303003,0.0,0.0,0.230441
1,0.293884,0.0,0.293884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.293884,0.293884,0.223506,0.0,0.0,0.0,0.293884,0.0
2,0.0,0.0,0.0,0.256447,0.195035,0.0,0.256447,0.256447,0.0,0.195035,...,0.195035,0.0,0.0,0.0,0.39007,0.0,0.0,0.256447,0.0,0.195035


In [82]:
# TF-IDF restricted to the top 5 frequent terms (max_features=5)
tfidf_model_small = TfidfVectorizer(max_features = 5)
# Column labels come from the fitted vectorizer so they match the matrix columns.
tfidf_df_small = pd.DataFrame(tfidf_model_small.fit_transform(corpus).todense(),
                              columns=tfidf_model_small.get_feature_names_out())
tfidf_df_small.head()

Unnamed: 0,are,coronavirus,indian,the,working
0,0.5,0.5,0.5,0.0,0.5
1,0.0,0.0,0.0,1.0,0.0
2,0.353553,0.353553,0.353553,0.707107,0.353553


## Feature Engineering (Text Similarity)

In [83]:
from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
from earthy.nltk_wrappers import lemmatize_sent
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
pair1 = ["Do you have Covid-19","Your body temperature will tell you"]
pair2 = ["I travelled to Malaysia.", "Where did you travel?"]
pair3 = ["He is a programmer", "Is he not a programmer?"]

In [85]:
def extract_text_similarity_jaccard(text1, text2):
    """Return the Jaccard similarity between the lemma sets of two texts.

    Both texts are lowercased and passed through ``lemmatize_sent``; the
    second field of every item it yields is taken as the lemma. The score
    is the number of distinct lemmas shared by both texts divided by the
    number of distinct lemmas found in either text.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when neither text
        yields any lemma, rather than failing on the empty input.
    """
    lemmas_text1 = {item[1] for item in lemmatize_sent(text1.lower())}
    lemmas_text2 = {item[1] for item in lemmatize_sent(text2.lower())}
    all_lemmas = lemmas_text1 | lemmas_text2
    if not all_lemmas:
        # Nothing to compare: avoid dividing by the size of an empty union.
        return 0.0
    return len(lemmas_text1 & lemmas_text2) / len(all_lemmas)

In [86]:
extract_text_similarity_jaccard(pair1[0], pair1[1])
extract_text_similarity_jaccard(pair2[0], pair2[1])
extract_text_similarity_jaccard(pair3[0], pair3[1])

0.6666666666666666

In [87]:
tfidf_model = TfidfVectorizer()

In [88]:
# Creating a corpus which will have texts of pair1, pair2 and pair3 respectively
corpus = [pair1[0], pair1[1], pair2[0], pair2[1], pair3[0], pair3[1]]

In [89]:
tfidf_results = tfidf_model.fit_transform(corpus).todense()