In [1]:
corpus = ["John like to watch movies",
         "Mary likes to play football",
         "John likes to watch football games but does not like to play football",
         "Both John and Mary like to play video games"]

In [2]:
print(corpus)

['John like to watch movies', 'Mary likes to play football', 'John likes to watch football games but does not like to play football', 'Both John and Mary like to play video games']


In [3]:
# import all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
# display all the english stop words
english_stopwords = set(stopwords.words('english'))

print(english_stopwords)

{"you've", 'all', "you'd", "mightn't", 'such', "she's", 'if', "don't", 'itself', 's', 'has', 'does', "shan't", 'is', 'above', 'once', 'of', "won't", 'he', 'there', "couldn't", 'just', 'y', 'very', 'against', 'should', 'because', "shouldn't", 'how', 'my', 'we', "that'll", 'until', 'each', 'herself', 'they', 'whom', 'then', "needn't", 'in', 'don', 'having', 'than', "you're", 'ma', 'over', 'again', 'no', "should've", 'shan', 'the', 'hadn', 're', 'wouldn', 'some', "hadn't", 'his', 'few', 'them', 'but', 'd', 'under', 'these', 'during', 'here', 'same', 'i', 'both', "aren't", 'it', 'from', 'their', 'be', 'a', 'him', 'out', 'yourself', 'what', "mustn't", 'about', 'himself', 'can', 'down', 'its', 'wasn', 'themselves', 'which', 'your', 'most', 'won', 'up', 'needn', 'yours', 'her', 'an', 'into', 'was', 'when', 'those', 'aren', 'shouldn', 'that', 'now', 'you', 'couldn', 'isn', 'as', 'too', 'why', 'our', 'are', 'mightn', "wasn't", 'had', 'and', 'who', 'to', "wouldn't", 'been', 'being', 'through', '

In [5]:
# define the function to remove all the stop words
def remove_stopwords(sentence):
    # tokenize the sentence
    word_tokens = nltk.word_tokenize(sentence)
    print(word_tokens)
    
    # remove all the stop words from the text
    cleaned_text = [w.lower() for w in word_tokens if(w not in english_stopwords)]
    return(cleaned_text)

In [6]:
remove_stopwords('Ram and Shyam likes to play')

['Ram', 'and', 'Shyam', 'likes', 'to', 'play']


['ram', 'shyam', 'likes', 'play']

In [7]:
# Build a vocabulary and remove all the duplicate words
def build_vocabulary(sentences):
    words = []
    
    # loop through each and every sentences
    for x in sentences:
        # call the function to remove all the stop words
        words_new = remove_stopwords(x)
        words.extend(words_new)
        
    # keep all the tokenized words and sort it
    sorted_words = list(sorted(words))
    return(sorted_words)

In [8]:
vocabulary = build_vocabulary(corpus)

['John', 'like', 'to', 'watch', 'movies']
['Mary', 'likes', 'to', 'play', 'football']
['John', 'likes', 'to', 'watch', 'football', 'games', 'but', 'does', 'not', 'like', 'to', 'play', 'football']
['Both', 'John', 'and', 'Mary', 'like', 'to', 'play', 'video', 'games']


In [9]:
# To create a bag of array of the frequency count
def bag_of_count_vectors(sentence,words):
    # remove all the stop words
    sent = remove_stopwords(sentence)
    
    # create an array for the bag of vectors
    bag = np.zeros(len(words))
    
    # To check if the word is present in the vocabulary
    for pol in sent:
        for i, word in enumerate(words):
            if(word == pol):
                bag[i] += 1
    
    # return an array for the bag of words vector
    return np.array(bag)

In [10]:
bag_of_count_vectors('John like to watch movies',vocabulary)

['John', 'like', 'to', 'watch', 'movies']


array([0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 1.])