In [1]:
import nltk

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')



text = """The home side struck twice early in the morning session, leaving them 182 runs behind and with just two more wickets to get, with CricViz giving them a 47 per cent of winning the match at that point.

But when Root brought speedster Mark Wood into the attack, set fielders back on the rope and seemingly attempted to bounce out tail enders Jasprit Bumrah and Mohammed Shami, the match turned in India's favour.

The batting pair withstood the short-ball barrage and then flourished in a match-changing 89-run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory on 16/8/2021.

England's tactics, particularly to Bumrah, were seemingly in response to the Indian quick's own short-ball assault to Jimmy Anderson late on day three, which drew the ire of the English veteran."""

text

"The home side struck twice early in the morning session, leaving them 182 runs behind and with just two more wickets to get, with CricViz giving them a 47 per cent of winning the match at that point.\n\nBut when Root brought speedster Mark Wood into the attack, set fielders back on the rope and seemingly attempted to bounce out tail enders Jasprit Bumrah and Mohammed Shami, the match turned in India's favour.\n\nThe batting pair withstood the short-ball barrage and then flourished in a match-changing 89-run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory on 16/8/2021.\n\nEngland's tactics, particularly to Bumrah, were seemingly in response to the Indian quick's own short-ball assault to Jimmy Anderson late on day three, which drew the ire of the English veteran."

# 1. Sentence Tokenization

In [2]:
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence)
    print()

The home side struck twice early in the morning session, leaving them 182 runs behind and with just two more wickets to get, with CricViz giving them a 47 per cent of winning the match at that point.

But when Root brought speedster Mark Wood into the attack, set fielders back on the rope and seemingly attempted to bounce out tail enders Jasprit Bumrah and Mohammed Shami, the match turned in India's favour.

The batting pair withstood the short-ball barrage and then flourished in a match-changing 89-run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory on 16/8/2021.

England's tactics, particularly to Bumrah, were seemingly in response to the Indian quick's own short-ball assault to Jimmy Anderson late on day three, which drew the ire of the English veteran.



# 2. Word Tokenization

In [3]:
sample_sentence = sentences[0]

words = nltk.word_tokenize(sample_sentence)
print(words)
print()

['The', 'home', 'side', 'struck', 'twice', 'early', 'in', 'the', 'morning', 'session', ',', 'leaving', 'them', '182', 'runs', 'behind', 'and', 'with', 'just', 'two', 'more', 'wickets', 'to', 'get', ',', 'with', 'CricViz', 'giving', 'them', 'a', '47', 'per', 'cent', 'of', 'winning', 'the', 'match', 'at', 'that', 'point', '.']



# 3. Text Lemmatization and Stemming 

In [4]:
"""
The goal of both stemming and lemmatization is to reduce inflectional forms 
and sometimes derivationally related forms of a word to a common base form.
"""

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    print("Stemmer:", stemmer.stem(word))
    print("Lemmatizer:", lemmatizer.lemmatize(word, pos))
    print()

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word = "went", pos = wordnet.VERB)
compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word = "drove", pos = wordnet.VERB)


Stemmer: went
Lemmatizer: go

Stemmer: drove
Lemmatizer: drive



# 4 Stop Words

In [8]:
"""
Stop words are words which are filtered out before or after processing of text. 
When applying machine learning to text, these words can add a lot of noise. 
That’s why we want to remove these irrelevant words.
"""

from nltk.corpus import stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
print(sentence)
print()
stop_words = set(stopwords.words("english"))

words = nltk.word_tokenize(sentence)
without_stop_words = [word for word in words if not word in stop_words]
print(without_stop_words)

England's tactics, particularly to Bumrah, were seemingly in response to the Indian quick's own short-ball assault to Jimmy Anderson late on day three, which drew the ire of the English veteran.

['England', "'s", 'tactics', ',', 'particularly', 'Bumrah', ',', 'seemingly', 'response', 'Indian', 'quick', "'s", 'short-ball', 'assault', 'Jimmy', 'Anderson', 'late', 'day', 'three', ',', 'drew', 'ire', 'English', 'veteran', '.']


# 5. Regular Expressions


### A regular expression, regex, or regexp is a sequence of characters that define a search pattern. Let’s see some basics.


- . - match any character except newline
- \w - match word
- \d - match digit
- \s - match whitespace
- \W - match not word
- \D - match not digit
- \S - match not whitespace
- [abc] - match any of a, b, or c
- [^abc] - not match a, b, or c
- [a-g] - match a character between a & g


In [28]:
import re

# print(sentences)
sentence = sentences[2]

print("Sentence: ", sentence)

print("\n\n")

pattern = r"\d\d"


print("Regex filter: ",re.sub(pattern, "### ", sentence))



Sentence:  The batting pair withstood the short-ball barrage and then flourished in a match-changing 89-run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory on 16/8/2021.



Regex filter:  The batting pair withstood the short-ball barrage and then flourished in a match-changing ### -run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory on ### /8/### ### .


In [None]:
pattern = r"89-"#removes , and '


print("Regex filter: ",re.sub(pattern, "eighty-nine ", sentence))


# 6. Bag of words

In [29]:
with open("simple_text.txt", "r") as file:
    documents = file.read().splitlines()
    
    
documents

['The home side struck twice early in the morning session, leaving them 182 runs behind and with just two more wickets to get, with CricViz giving them a 47 per cent of winning the match at that point.',
 '',
 "But when Root brought speedster Mark Wood into the attack, set fielders back on the rope and seemingly attempted to bounce out tail enders Jasprit Bumrah and Mohammed Shami, the match turned in India's favour.",
 '',
 'The batting pair withstood the short-ball barrage and then flourished in a match-changing 89-run partnership that ruined any hopes England had of winning the game and paved the way for a famous Indian victory.',
 '',
 "England's tactics, particularly to Bumrah, were seemingly in response to the Indian quick's own short-ball assault to Jimmy Anderson late on day three, which drew the ire of the English veteran."]

The bag-of-words model is a popular and simple feature extraction technique used when we work with text. It describes the occurrence of each word within a document.

To use this model, we need to:
- Design a vocabulary of known words (also called tokens)
- Choose a measure of the presence of known words

In [30]:
# Import the libraries we need
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Step 2. Design the Vocabulary
# The default token pattern removes tokens of a single character. That's why we don't have the "I" and "s" tokens in the output
count_vectorizer = CountVectorizer()

# Step 3. Create the Bag-of-Words Model
bag_of_words = count_vectorizer.fit_transform(documents)

# Show the Bag-of-Words Model as a pandas DataFrame
feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,182,47,89,and,anderson,any,assault,at,attack,attempted,...,victory,way,were,when,which,wickets,winning,with,withstood,wood
0,1,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,1,2,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,2,0,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,2,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0


# 7. TF-IDF



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(documents)

# Show the Model as a pandas DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)