# Text Mining and NLP Concepts

# Tokenization

In [1]:
import re

In [2]:
sentence = 'Tokenization is a fundamental step in natural language processing that breaks down a sentence into individual components, such as words, punctuation marks, and sometimes phrases, to make text easier to analyze and process by computers.'

In [3]:
# Print both results: a notebook cell only displays the value of its last
# expression, so the plain split would otherwise never be shown.
# Naive whitespace split: punctuation stays attached to the words.
print(sentence.split())
# Replace runs of punctuation/underscores with a space first, then split.
print(re.sub(r'([^\s\w]|_)+', ' ', sentence).split())

## Extracting n-grams
## n-grams can be extracted using 3 different techniques:
### 1. Custom defined function
### 2. NLTK
### 3. TextBlob

## Extracting n-grams using a custom-defined function

In [4]:
import re

In [5]:
def n_gram_extractor(input_str, n):
    """Print every contiguous n-gram of ``input_str`` as a list of tokens.

    Runs of punctuation and underscores are replaced by a space before the
    text is split on whitespace, so only word tokens take part in the n-grams.
    Nothing is printed when the text has fewer than ``n`` tokens.

    Args:
        input_str: Text to extract n-grams from.
        n: Number of tokens per n-gram; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 prints a run of empty lists and a
        # negative n prints slices of arbitrary length.
        raise ValueError("n must be a positive integer")
    tokens = re.sub(r'([^\s\w]|_)+', ' ', input_str).split()
    for start in range(len(tokens) - n + 1):
        print(tokens[start:start + n])

In [6]:
n_gram_extractor('The cute little boy is playing with the kitten.',2)

In [7]:
n_gram_extractor('The cute little boy is playing with the kitten.',3)

## Extracting n-grams using NLTK

In [8]:
from nltk import ngrams
from nltk import word_tokenize

In [9]:
words = word_tokenize("I am reading NLP Fundamentals")
print(words)

In [10]:
list(ngrams('The cute little boy is playing with the kitten.'.split(),1))

In [11]:
list(ngrams('The cute little boy is playing with the kitten.'.split(),2))

In [12]:
list(ngrams("The cute little boy is playing with the kitten.".split(),3))

## Extracting n-grams using TextBlob
### TextBlob is a Python library for processing textual data.

In [13]:
from textblob import TextBlob
blob = TextBlob("The cute little boy is playing with the kitten.")

In [14]:
blob.ngrams(n = 2)

In [15]:
blob.ngrams(n = 3)

In [16]:
blob.ngrams(n =4)

## Tokenization using Keras

In [17]:
sentence1 = "The Indian-American scientist was chosen for the award from a list of nearly 50,000 nominations. The Padma Shri is India’s fourth largest civilian award and was given to Kak owing to his research in #AI and #cryptography."

In [18]:
from keras.preprocessing.text import text_to_word_sequence
text_to_word_sequence(sentence1)

## Tokenize sentences using other nltk tokenizers:
### 1. Tweet Tokenizer
### 2. MWE Tokenizer (Multi-Word Expression)
### 3. Regexp Tokenizer
### 4. Whitespace Tokenizer
### 5. Word Punct Tokenizer

## 1. Tweet Tokenizer

In [19]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence1)

## 2. MWE Tokenizer (Multi-Word Expression Tokenizer)

In [20]:
from nltk.tokenize import MWETokenizer

In [21]:
# Each multi-word expression must be a tuple of tokens. ('Indian-American')
# without a trailing comma is just a parenthesised string, which NLTK would
# treat as a sequence of single characters, so the one-token entry is
# written as a 1-tuple.
mwe_tokenizer = MWETokenizer([('Indian-American',), ('Padma', 'Shri')])
# The input is pre-split on whitespace; consecutive tokens that match a
# registered expression are merged into a single token.
mwe_tokenizer.tokenize(sentence1.split())

## 3. RegExp Tokenizer

In [22]:
from nltk.tokenize import RegexpTokenizer
# Raw string: '\w', '\$', '\d' and '\S' are not valid Python string escapes,
# so a plain literal triggers an invalid-escape warning on recent Python
# versions. The pattern's alternatives are: a run of word characters, a
# dollar amount, or any other run of non-whitespace characters.
reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence1)

## 4. Whitespace Tokenizer

In [23]:
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence1)

## 5. WordPunct Tokenizer

In [24]:
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence1)

## Sentence Tokenization

In [25]:
from nltk.tokenize import sent_tokenize
sent_tokenize("We are learning NLP in Python. Python is a very useful tool in DS. We love NLP ")

## Parts of Speech Tagging (POS Tagging)

In [26]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(words)

## Stop Words

In [27]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [28]:
# The NLTK stop-word list is named in lower case ('english'); 'English'
# only resolves on case-insensitive file systems.
stop_words = stopwords.words('english')
print(stop_words)
len(stop_words)  # Size of the predefined English list (179 when this was written; may differ between NLTK data releases)

In [29]:
sentence2 = "I am learning NLP. It is one of the most popular library in Python"

In [30]:
sentence_words = word_tokenize(sentence2)
print(sentence_words)

## Filtering stop words from the input string

In [31]:
# The stop-word list is lower case, so compare the lower-cased token;
# otherwise capitalised stop words such as "I" and "It" are kept.
stop_word_set = set(stop_words)  # set membership avoids scanning the list per token
sentence_no_stops = ' '.join(word for word in sentence_words if word.lower() not in stop_word_set)
print(sentence_no_stops)

## Text Normalization

In [32]:
# Normalise the sentence by substituting each abbreviation in turn
sentence3 = "I visited NY from IND on 31-12-24"

# (text to find, replacement) pairs, applied in this order
substitutions = (("NY", "New York"), ("IND", "India"), ("-24", "-2024"))
normalized_sentence = sentence3
for old, new in substitutions:
    normalized_sentence = normalized_sentence.replace(old, new)
print(normalized_sentence)

## Spelling Corrections

In [33]:
from autocorrect import Speller

In [34]:
spell = Speller(lang = 'en')
help(Speller)

In [35]:
spell("Natureal") # Correct spelling is printed

In [36]:
sent1 = word_tokenize("Ntural Luanguage Processin deals with the art of extracting insightes from Natural Languaes")
print(sent1)

In [37]:
sentence_corrected = ' '.join([spell(word) for word in sent1])
print(sentence_corrected)

## Stemming

In [38]:
import nltk
stemmer = nltk.stem.PorterStemmer()

In [39]:
stemmer.stem("Programming")

In [40]:
stemmer.stem("Programs")

In [41]:
stemmer.stem("Jumping")

In [42]:
stemmer.stem("Jumps")

In [43]:
stemmer.stem("battling") # battl is not an actual word as stemming doesn't check in dictionary

In [44]:
stemmer.stem('amazing')

### Porter Stemmer

In [45]:
sent2 = "Before eating, it would be nice to sanitize your hands with a sanitizer"

In [46]:
from nltk.stem.porter import PorterStemmer

In [47]:
ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sent2.split()])

### Regexp Stemmer

In [48]:
sent3 = "I love playing Cricket. Cricket players practice hard in their inning"

In [49]:
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')
' '.join([regex_stemmer.stem(wd) for wd in sent3.split()])

## Lemmatization

In [50]:
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

In [51]:
lemmatizer = WordNetLemmatizer()

In [52]:
lemmatizer.lemmatize("Programming")

In [53]:
lemmatizer.lemmatize('Programs')

In [54]:
lemmatizer.lemmatize('battling')

In [55]:
lemmatizer.lemmatize('amazing')

In [56]:
from earthy.nltk_wrappers import lemmatize_sent

In [57]:
sent4 = "The codes executed today are far better than what we execute generally."

In [58]:
lemmatize_sent(sent4)
words, lemmas, tags = zip(*lemmatize_sent(sent4))
lemmas

## Singularize and Pluralize words

In [59]:
from textblob import TextBlob

In [60]:
sent5 = TextBlob('She sells seashells on the seashore')

In [61]:
sent5.words

In [62]:
sent5.words[2].singularize()

In [63]:
sent5.words[5].pluralize()

## Language Translation

In [64]:
# From Spanish to English
from textblob import TextBlob
en_blob = TextBlob(u'muy bien')
en_blob.translate(from_lang = 'es', to = 'en') 

## Custom Stop Words removal

In [65]:
from nltk import word_tokenize

In [66]:
sent5 = "She sells seashells on the seashore"

In [67]:
custom_stop_word_list = ['she' , 'on' , 'the' , 'am' ,'is','not']
' '.join([word for word in word_tokenize(sent5) if word.lower() not in custom_stop_word_list])

## Extracting general features from raw texts

#### Number of words
#### Detect presence of wh words
#### Polarity
#### Subjectivity
#### Language identification

In [68]:
import pandas as pd
df = pd.DataFrame([['The vaccine for covid-19 will be announced on 1st August.'],
                   ['Do you know how much expectation the world population is having from this research?'],
                   ['This risk of virus will end on 31st July.']])

In [69]:
df.columns = ['text']
df

## Number of words

In [70]:
from textblob import TextBlob
df['number_of_words'] = df['text'].apply(lambda x : len(TextBlob(x).words))
df['number_of_words']

### Detect presence of wh words

In [71]:
wh_words = {'why', 'who', 'which', 'what', 'where', 'when', 'how'}
# Lower-case each token before matching so that a sentence-initial
# "What"/"Who"/... is detected as well as the lower-case forms.
df['are_wh_words_present'] = df['text'].apply(
    lambda x: not wh_words.isdisjoint(word.lower() for word in TextBlob(str(x)).words)
)
df['are_wh_words_present']

### Polarity

In [72]:
df['polarity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df['polarity']

### Subjectivity

In [73]:
df['subjectivity'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
df['subjectivity']

### Language Detector

In [74]:
# Install the required package
# pip install langdetect

from langdetect import DetectorFactory, detect, detect_langs

# langdetect is non-deterministic by default, which shows up mostly on short
# inputs such as the one below; fixing the seed makes repeated runs agree.
DetectorFactory.seed = 0

# Input text
# text = 'This is an English text.'
text = 'muy bien'

# Detect the language
detected_language = detect(text)  # Returns the language code
detected_probabilities = detect_langs(text)  # Returns candidate languages with their probabilities

print(f"Detected Language: {detected_language}")
print(f"Probabilities: {detected_probabilities}")

## Bag of Words

In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['At least seven Indian pharma companies are working to develop a vaccine against coronavirus',
'the deadly virus that has already infected more than 14 million globally.',
'Bharat Biotech, Indian Immunologicals, are among the domestic pharma firms working on the coronavirus vaccines in India.'
]

In [76]:
bag_of_words_model = CountVectorizer()
print(bag_of_words_model.fit_transform(corpus).todense())

In [77]:
# Fit first, then label the columns with the vectorizer's own feature names
# so the labels are guaranteed to line up with the matrix columns.
bag_of_words_counts = bag_of_words_model.fit_transform(corpus)
bag_of_words_df = pd.DataFrame(bag_of_words_counts.todense(),
                               columns=bag_of_words_model.get_feature_names_out())
bag_of_words_df.head()

In [78]:
# Initialize CountVectorizer for top 5 frequent terms
bag_of_words_model_small = CountVectorizer(max_features=5)

# Transform the corpus into a Bag of Words representation
bag_of_words_matrix = bag_of_words_model_small.fit_transform(corpus)

# Convert the matrix to a DataFrame
bag_of_words_df_small = pd.DataFrame(bag_of_words_matrix.todense(), 
                                      columns=bag_of_words_model_small.get_feature_names_out())

# Display the first few rows
print(bag_of_words_df_small.head())

## TFIDF

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
tfidf_model = TfidfVectorizer()
print(tfidf_model.fit_transform(corpus).todense())

In [81]:
# Fit first, then label the columns with the vectorizer's own feature names
# so the labels are guaranteed to line up with the matrix columns.
tfidf_weights = tfidf_model.fit_transform(corpus)
tfidf_df = pd.DataFrame(tfidf_weights.todense(),
                        columns=tfidf_model.get_feature_names_out())
tfidf_df.head()

In [82]:
# TF-IDF restricted to 5 features (max_features=5)
tfidf_model_small = TfidfVectorizer(max_features=5)
tfidf_weights_small = tfidf_model_small.fit_transform(corpus)
# Column labels come from the fitted vectorizer so they match the matrix columns.
tfidf_df_small = pd.DataFrame(tfidf_weights_small.todense(),
                              columns=tfidf_model_small.get_feature_names_out())
tfidf_df_small.head()

## Feature Engineering (Text Similarity)

In [83]:
from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
from earthy.nltk_wrappers import lemmatize_sent
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
pair1 = ["Do you have Covid-19","Your body temperature will tell you"]
pair2 = ["I travelled to Malaysia.", "Where did you travel?"]
pair3 = ["He is a programmer", "Is he not a programmer?"]

In [85]:
def extract_text_similarity_jaccard(text1, text2):
    """Return the Jaccard similarity of the two texts' lemma sets.

    Each text is lower-cased and passed to ``lemmatize_sent``; the second
    field of every item it yields (assumed to be the lemma) is collected
    into a set. The score is ``len(A & B) / len(A | B)``.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A float between 0 and 1. When neither text yields any item the
        union is empty and 0.0 is returned instead of dividing by zero.
    """
    lemmas_text1 = {item[1] for item in lemmatize_sent(text1.lower())}
    lemmas_text2 = {item[1] for item in lemmatize_sent(text2.lower())}
    union = lemmas_text1 | lemmas_text2
    if not union:
        # Both texts are empty: the ratio is undefined, so report
        # "no similarity" rather than raising.
        return 0.0
    return len(lemmas_text1 & lemmas_text2) / len(union)

In [86]:
# Print every score: a notebook cell only displays the value of its last
# expression, so the first two results would otherwise be discarded.
for first_text, second_text in (pair1, pair2, pair3):
    print(extract_text_similarity_jaccard(first_text, second_text))

In [87]:
tfidf_model = TfidfVectorizer()

In [88]:
# Creating a corpus which will have texts of pair1, pair2 and pair3 respectively
corpus = [pair1[0], pair1[1], pair2[0], pair2[1], pair3[0], pair3[1]]

In [89]:
tfidf_results = tfidf_model.fit_transform(corpus).todense()