# Remove stop words

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

(STOP_WORDS)

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

In [4]:
doc = nlp(text)

## Function to remove stop words

In [5]:
def preprocess(text):
    """Return the input with stop words and punctuation removed.

    Args:
        text: A spaCy ``Doc`` (or any iterable of token objects exposing
            ``text``, ``is_stop`` and ``is_punct``), or a raw string. A raw
            string is first run through the notebook-level ``nlp`` pipeline.

    Returns:
        The texts of the surviving tokens joined by single spaces.
        Whitespace tokens (for example newlines) are not filtered out.
    """
    # Work on the argument itself. Reading the global ``doc`` here would make
    # the result independent of what the caller actually passed in.
    tokens = nlp(text) if isinstance(text, str) else text
    no_stop_words = [
        token.text
        for token in tokens
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(no_stop_words)

In [6]:
no_stop_words = preprocess(doc)
no_stop_words

'\n Thor Love Thunder 2022 American superhero film based Marvel Comics featuring character Thor produced Marvel Studios \n distributed Walt Disney Studios Motion Pictures sequel Thor Ragnarok 2017 29th film Marvel Cinematic Universe MCU \n film directed Taika Waititi co wrote script Jennifer Kaytin Robinson stars Chris Hemsworth Thor alongside Christian Bale Tessa Thompson \n Jaimie Alexander Waititi Russell Crowe Natalie Portman film Thor attempts find inner peace return action recruit Valkyrie Thompson \n Korg Waititi Jane Foster Portman)—who Mighty Thor stop Gorr God Butcher Bale eliminating gods \n'

In [7]:
# Walk the Doc once: every token adds to the total, and only tokens flagged
# as stop words add to the stop-word tally.
stop_words_count = token_words_count = 0

for token in doc:
    token_words_count += 1
    if not token.is_stop:
        continue
    stop_words_count += 1

In [8]:
print(stop_words_count)
print(token_words_count)

40
160


In [9]:
# Share of stop words among ALL tokens of the Doc, expressed as a percentage.
# The denominator was incremented for every token, so punctuation and
# whitespace tokens are included in it.
percentage_stop_words = (stop_words_count/token_words_count)*100
print(f"Percentage of Stop Words presented in the given text: {percentage_stop_words} %")

Percentage of Stop Words presented in the given text: 25.0 %


# Customize STOP WORDS

In [10]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

## Customization

In [11]:
# "not" flips the meaning of a sentence, so stop treating it as a stop word.
# STOP_WORDS is a set, so a membership test replaces scanning every entry.
if 'not' in STOP_WORDS:
    nlp.vocab['not'].is_stop = False

In [12]:
text = '''
This is a good movie
This is not a good movie
'''

In [13]:
def preprocess(text):
    """Run ``text`` through ``nlp`` and drop every stop-word token.

    Only the stop-word flag is checked, so punctuation and whitespace tokens
    are kept. The surviving token texts are joined with single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return " ".join(kept)

In [14]:
no_stop_words = preprocess(text)

In [15]:
no_stop_words

'\n good movie \n not good movie \n'

### Or, without looping over STOP_WORDS

In [16]:
def preprocessing(text):
    """Return ``text`` with its stop-word tokens removed.

    The string is parsed with ``nlp``; tokens whose ``is_stop`` flag is set
    are skipped and the remaining token texts are joined by single spaces.
    """
    return " ".join(tok.text for tok in nlp(text) if not tok.is_stop)

# Clear the stop-word flag on the vocabulary entry for "not" so that
# preprocessing() keeps the negation.
nlp.vocab['not'].is_stop = False

# The same sentence with and without the negation.
positive_text = preprocessing('This is a good movie')
negative_text = preprocessing('This is not a good movie')

print(f"Text1: {positive_text}")
print(f"Text2: {negative_text}") 

Text1: good movie
Text2: not good movie


# Print filtered token frequency after removing stop words

In [17]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [18]:
nlp = spacy.load("en_core_web_sm")

In [19]:
text = '''The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''

In [20]:
doc = nlp(text)

## Function for removing stop words

In [21]:
from collections import Counter
def preprocessing(text):
    """Return the token texts left after dropping stop words and punctuation.

    Args:
        text: A spaCy ``Doc`` (or any iterable of token objects exposing
            ``text``, ``is_stop`` and ``is_punct``), or a raw string. A raw
            string is first run through the notebook-level ``nlp`` pipeline.

    Returns:
        A list of token texts in their original order. Whitespace tokens
        (for example newlines) are not filtered out.
    """
    # Work on the argument itself. Reading the global ``doc`` here would make
    # the result independent of what the caller actually passed in.
    tokens = nlp(text) if isinstance(text, str) else text
    no_stop_words = [
        token.text
        for token in tokens
        if not token.is_stop and not token.is_punct
    ]
    return no_stop_words

In [22]:
filtered_tokens = preprocessing(doc)
filtered_tokens

['India',
 'men',
 'national',
 'cricket',
 'team',
 'known',
 'Team',
 'India',
 'Men',
 'Blue',
 'represents',
 'India',
 'men',
 'international',
 'cricket',
 '\n',
 'governed',
 'Board',
 'Control',
 'Cricket',
 'India',
 'BCCI',
 'Member',
 'International',
 'Cricket',
 'Council',
 'ICC',
 'Test',
 '\n',
 'Day',
 'International',
 'ODI',
 'Twenty20',
 'International',
 'T20I',
 'status',
 'Cricket',
 'introduced',
 'India',
 'British',
 'sailors',
 '18th',
 'century',
 '\n',
 'cricket',
 'club',
 'established',
 '1792',
 'India',
 'national',
 'cricket',
 'team',
 'played',
 'Test',
 'match',
 '25',
 'June',
 '1932',
 'Lord',
 'sixth',
 'team',
 '\n',
 'granted',
 'test',
 'cricket',
 'status',
 '\n']

## Function for counting the frequency of each token

In [23]:
def token_frequency(tokens):
    """Build a ``Counter`` mapping each distinct token to its occurrence count."""
    return Counter(tokens)

In [24]:
freq_of_tokens = token_frequency(filtered_tokens)
freq_of_tokens

Counter({'India': 6,
         'cricket': 5,
         '\n': 5,
         'team': 3,
         'Cricket': 3,
         'International': 3,
         'men': 2,
         'national': 2,
         'Test': 2,
         'status': 2,
         'known': 1,
         'Team': 1,
         'Men': 1,
         'Blue': 1,
         'represents': 1,
         'international': 1,
         'governed': 1,
         'Board': 1,
         'Control': 1,
         'BCCI': 1,
         'Member': 1,
         'Council': 1,
         'ICC': 1,
         'Day': 1,
         'ODI': 1,
         'Twenty20': 1,
         'T20I': 1,
         'introduced': 1,
         'British': 1,
         'sailors': 1,
         '18th': 1,
         'century': 1,
         'club': 1,
         'established': 1,
         '1792': 1,
         'played': 1,
         'match': 1,
         '25': 1,
         'June': 1,
         '1932': 1,
         'Lord': 1,
         'sixth': 1,
         'granted': 1,
         'test': 1})

## Or, counting manually with a dictionary to find the most frequent word

In [32]:
text = ''' The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''

# doc = nlp(text)

# remaining_tokens=[]

# for token in doc:
#     if token.is_stop or token.is_punct:
#         continue
#     remaining_tokens.append(token.text)

# frequency_tokens = {}
# for token in remaining_tokens:
#     if token != '\n' and token != ' ':
#         frequency_tokens[token]=1
#     else:
#         frequency_tokens[token]+=1

# max_freq_word = max(frequency_tokens.keys(), key=(lambda key: frequency_tokens[key]))

# print(f"Maximum frquency word: {max_freq_word}")

# Step 1: parse the text with the spaCy pipeline to obtain a Doc.
doc = nlp(text)


# Step 2: collect the text of every token that is neither a stop word
# nor punctuation.
remaining_tokens = []
for token in doc:
    if not (token.is_stop or token.is_punct):
        remaining_tokens.append(token.text)


# Step 3: tally how often each remaining token occurs. spaCy emits newlines
# and single spaces as separate tokens, so those are left out of the tally.
frequency_tokens = {}
for token in remaining_tokens:
    if token in ('\n', ' '):
        continue
    # First sighting starts from 0; later sightings build on the stored count.
    frequency_tokens[token] = frequency_tokens.get(token, 0) + 1


# Step 4: pick the token with the highest count.
max_freq_word = max(frequency_tokens, key=frequency_tokens.get)


# Step 5: report the result.
print(f"Maximum frequency word: {max_freq_word}")

Maximum frequency word: India
