# Stop words 

In [1]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
len(STOP_WORDS)

326

In [3]:
# Load the small English pipeline once; `preprocess` below reuses `nlp`.
nlp = spacy.load("en_core_web_sm")

# Print every token of the sample sentence that spaCy flags as a stop word.
doc = nlp("We just opened our wings, the flying part is coming soon")
for token in doc:
    if not token.is_stop:
        continue
    print(token)

We
just
our
the
part
is


In [20]:
def preprocess(text):
    """Return `text` with stop words, punctuation and whitespace tokens removed.

    The text is tokenized with the module-level `nlp` pipeline. Tokens for
    which `is_stop`, `is_punct` or `is_space` is true are dropped, and the
    text of the remaining tokens is joined with single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept.append(token.text)
    return " ".join(kept)

In [21]:
preprocess("We just opened our wings, the flying part is coming soon")


'opened wings flying coming soon'

In [22]:
preprocess("Musk wants time to prepare for a trial over his")

'Musk wants time prepare trial'

# Remove stop words from pandas dataframe text column
Dataset is downloaded from: https://www.kaggle.com/datasets/jbencina/department-of-justice-20092018-press-releases. It contains press releases about different court cases from the Department of Justice (DOJ). The releases contain information such as outcomes of criminal cases, notable actions taken against felons, or other updates about the current administration.

In [7]:
import pandas as pd
# lines=True makes pandas parse the file as one JSON object per line.
# The path is relative, so doj_press.json must be in the working directory.
df = pd.read_json("doj_press.json",lines=True)

In [8]:
df.shape

(13087, 6)

In [9]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


Filter out those rows that do not have any topics associated with the case

In [11]:
# Keep only the rows whose `topics` entry has a non-zero length.
has_topics = df["topics"].str.len() != 0
df = df[has_topics]

In [12]:
df.head(10)

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"
26,18-961,24 Defendants Sentenced in Multimillion Dolla...,Twenty-one members of a massive India-based fr...,2018-07-20T00:00:00-04:00,"[Consumer Protection, Elder Justice]","[Criminal Division, USAO - Texas, Southern]"
27,12-306,$25 Billion Mortgage Servicing Agreement Filed...,View the court documents. WASHINGTON – The Ju...,2012-03-12T00:00:00-04:00,"[Consumer Protection, StopFraud]",[Office of the Associate Attorney General]
29,17-1182,"30 Members and Associates of The ""Nine Trey Ga...",Federal agents have arrested 17 members and as...,2017-10-23T00:00:00-04:00,[Opioids],"[Criminal Division, USAO - Georgia, Northern]"
30,15-1560,32 Hospitals to Pay U.S. More Than $28 Million...,Thirty-two hospitals located throughout 15 sta...,2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
32,,34 Individuals Facing Federal and Tribal Charg...,Thirty-four individuals are facing federal and...,2015-12-14T00:00:00-05:00,"[Drug Trafficking, Indian Country Law and Just...",[USAO - New Mexico]


In [13]:
df.shape

(4688, 6)

In [14]:
# Keep only the first 100 rows of the current frame.
# NOTE(review): presumably done to keep the spaCy pass over `contents` fast — confirm.
df = df.head(100)

In [15]:
df.shape

(100, 6)

In [19]:
df

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"
...,...,...,...,...,...,...
316,15-1359,Alaska Plastic Surgeon Convicted of Wire Fraud...,Doctor Hid Millions in Secret Accounts in Pana...,2015-11-04T00:00:00-05:00,[Tax],[Tax Division]
318,16-396,Alaska Plastic Surgeon Sentenced to Prison for...,Defendant Concealed Bank Accounts in Panama an...,2016-04-04T00:00:00-04:00,[Tax],[Tax Division]
321,17-736,Alaskan Commercial Fishing Couple Charged with...,An Alaskan couple was charged in federal court...,2017-07-26T00:00:00-04:00,[Tax],"[Tax Division, USAO - Alaska]"
322,18-717,Alaskan Husband And Wife Plead Guilty To Willf...,A husband and wife pleaded guilty yesterday to...,2018-06-01T00:00:00-04:00,[Tax],[Tax Division]


In [18]:
len(df["contents"].iloc[4])

5504

Here we will remove all the stop words from the `contents` column

In [25]:
# Build the cleaned column with `assign`, which returns a new DataFrame, rather
# than writing into `df` in place. `df` was produced by boolean filtering and
# `head()`, so in-place column assignment can raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df.assign(contents_new=df["contents"].apply(preprocess))
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [33]:
len(df["contents"].iloc[4])

5504

In [34]:
len(df["contents_new"].iloc[4])

4127

In [36]:
(df["contents"].iloc[4][:200])

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations'

In [37]:
(df["contents_new"].iloc[4][:200])

'21st Century Oncology Inc. certain subsidiaries affiliates agreed pay $ 26 million government resolve self disclosure relating submission false attestations company use electronic health records softw'

Examples where removing stop words can create a problem

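(1) Sentiment detection: Not always, but in some cases (depending on your dataset), removing stop words can change the sentiment of a sentence. For example, "not" is a stop word, so "this is not a good movie" is reduced to "good movie", exactly like the positive sentence below.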

In [38]:
preprocess("this is a good movie")

'good movie'

(2) Language translation: Say you want to translate the following sentence from English to Telugu. If you remove stop words before the actual translation and then translate, it will produce a horrible result.

In [40]:
preprocess("how are you doing dhaval?")

'dhaval'


(3) Chat bot or any Q&A system

In [41]:
preprocess("I don't find yoga mat on your website. Can you help?")

'find yoga mat website help'