In [1]:
import nltk

In [2]:
paragraph =""" In Natural Language Processing (NLP), 
               tokenization is the foundational step where raw text is split into smaller units called tokens. 
               These tokens can be words, subwords, or even characters, depending on the application. 
               Tokenization helps in understanding and processing text by breaking it down into manageable pieces.
               After tokenization, text often goes through stemming,
               which involves reducing words to their base or root form by removing suffixes. 
               For example, words like “playing”, “played”, and “plays” are reduced to “play”. 
               However, stemming is a rule-based and sometimes crude process, often leading to non-real words like “comput” from “computing”.
               To overcome this, lemmatization is used, which is a more sophisticated technique 
               that transforms words to their dictionary form (lemma), taking into account the context and parts of speech. 
               For example, “running” becomes “run”, and “better” becomes “good”. 
               Lemmatization provides more meaningful results compared to stemming, although it is computationally more intensive. 
               Together, these techniques are crucial in cleaning and normalizing text for downstream NLP tasks 
               like classification, sentiment analysis, and information retrieval."""

In [3]:
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
ps = PorterStemmer()
wordnet=WordNetLemmatizer()

In [5]:
sentences = nltk.sent_tokenize(paragraph)
words = nltk.word_tokenize(paragraph)   
sentences
words

['In',
 'Natural',
 'Language',
 'Processing',
 '(',
 'NLP',
 ')',
 ',',
 'tokenization',
 'is',
 'the',
 'foundational',
 'step',
 'where',
 'raw',
 'text',
 'is',
 'split',
 'into',
 'smaller',
 'units',
 'called',
 'tokens',
 '.',
 'These',
 'tokens',
 'can',
 'be',
 'words',
 ',',
 'subwords',
 ',',
 'or',
 'even',
 'characters',
 ',',
 'depending',
 'on',
 'the',
 'application',
 '.',
 'Tokenization',
 'helps',
 'in',
 'understanding',
 'and',
 'processing',
 'text',
 'by',
 'breaking',
 'it',
 'down',
 'into',
 'manageable',
 'pieces',
 '.',
 'After',
 'tokenization',
 ',',
 'text',
 'often',
 'goes',
 'through',
 'stemming',
 ',',
 'which',
 'involves',
 'reducing',
 'words',
 'to',
 'their',
 'base',
 'or',
 'root',
 'form',
 'by',
 'removing',
 'suffixes',
 '.',
 'For',
 'example',
 ',',
 'words',
 'like',
 '“',
 'playing',
 '”',
 ',',
 '“',
 'played',
 '”',
 ',',
 'and',
 '“',
 'plays',
 '”',
 'are',
 'reduced',
 'to',
 '“',
 'play',
 '”',
 '.',
 'However',
 ',',
 'stemming',

In [6]:
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    

In [9]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X_CV = cv.fit_transform(corpus).toarray()

In [10]:
X_CV

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1

In [11]:
# Creating the TF_IDF Words model
from sklearn.feature_extraction.text import TfidfVectorizer
TF = TfidfVectorizer(max_features = 1500)
X_TF = cv.fit_transform(corpus).toarray()

In [12]:
X_TF

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1