In [5]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [7]:
# Sample text used by the rest of the notebook: a short, multi-sentence
# description of Google News that is later split into sentences and vectorized.
paragraph = '''Google News is a news aggregator service developed by Google. It presents a continuous flow of articles organized from
thousands of publishers and magazines. Google News is available as an app on Android, iOS, and the Web. Google released a beta version
in September 2002 and the official app in January 2006.'''

In [6]:
# Porter stemmer instance; the stemming cell calls ps.stem() on each token.
ps = PorterStemmer()

In [9]:
# Split the paragraph into sentences; each sentence is treated as one document.
sentences = nltk.sent_tokenize(paragraph)

# Empty list that will receive one cleaned string per sentence, plus the
# lemmatizer used by the lemmatization section.
corpus = list()
wordnet = WordNetLemmatizer()

## Using stemming

In [10]:
# Build the stemmed corpus: one cleaned, space-joined string per sentence.
# The stop-word set is built once up front (it does not change per token), and
# `corpus` is reset here so that re-running this cell does not append duplicates.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace every non-letter character with a space, lowercase, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop English stop words and reduce the remaining tokens to their stems.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))

In [13]:
# Display the stemmed documents (one string per sentence).
corpus

['googl news news aggreg servic develop googl',
 'present continu flow articl organ thousand publish magazin',
 'googl news avail app android io web',
 'googl releas beta version septemb offici app januari']

# Using lemmatization

In [17]:
# Rebuild `corpus` from scratch using lemmatization instead of stemming, so the
# words stay readable dictionary forms. The stop-word set is built once up front
# rather than once per token.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace every non-letter character with a space, lowercase, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop English stop words and lemmatize what is left. lemmatize() is called
    # without a part-of-speech tag; note in the output that verb forms such as
    # "developed" and "released" come back unchanged.
    lemmas = [wordnet.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [18]:
# Display the lemmatized documents (one string per sentence).
corpus

['google news news aggregator service developed google',
 'present continuous flow article organized thousand publisher magazine',
 'google news available app android io web',
 'google released beta version september official app january']

# Create bag of words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn the vocabulary from the cleaned corpus and count how often
# each term occurs in each document. max_features=1500 is an upper bound on the
# vocabulary size; the resulting matrix for this corpus has only 24 columns.
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus)
# Convert the sparse result to a dense NumPy array for display.
x = x.toarray()

In [20]:
# Display the dense bag-of-words count matrix.
x

array([[1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
        0, 0],
       [0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
        1, 0]], dtype=int64)

In [21]:
# Shape of the count matrix as (rows, columns): one row per document in
# `corpus`, one column per term in the fitted vocabulary.
x.shape

(4, 24)

In [22]:
# Plain-text rendering of the same count matrix.
print(x)

[[1 0 0 0 0 0 0 1 0 2 0 0 0 2 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0]
 [0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0]]


# TF-IDF

In [23]:
# Re-display the raw input paragraph for reference.
paragraph

'Google News is a news aggregator service developed by Google. It presents a continuous flow of articles organized from\nthousands of publishers and magazines. Google News is available as an app on Android, iOS, and the Web. Google released a beta version\nin September 2002 and the official app in January 2006.'

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: weight each term by its frequency in a document, scaled down for
# terms that appear in many documents. Note that `cv` and `x` are rebound here,
# replacing the count vectorizer and count matrix from the bag-of-words section.
cv = TfidfVectorizer()
x = cv.fit_transform(corpus)
# Convert the sparse result to a dense NumPy array for display.
x = x.toarray()

In [25]:
# Display the dense TF-IDF weight matrix.
x

array([[0.37487084, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.37487084, 0.        , 0.4785504 ,
        0.        , 0.        , 0.        , 0.59110495, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37487084, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.35355339, 0.        ,
        0.        , 0.35355339, 0.        , 0.35355339, 0.        ,
        0.        , 0.        , 0.35355339, 0.        , 0.        ,
        0.35355339, 0.35355339, 0.35355339, 0.        , 0.        ,
        0.        , 0.35355339, 0.        , 0.        ],
       [0.        , 0.42068099, 0.33166972, 0.        , 0.42068099,
        0.        , 0.        , 0.        , 0.        , 0.26851522,
        0.42068099, 0.        , 0.        , 0.33166972, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42068099],
       [0.   