In [1]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

len(STOP_WORDS)

326

In [2]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I am creating this account to invest in U.S. stocks and ETFs, focusing on companies like NVIDIA and Nike.")
for token in doc:
    if token.is_stop:
        print(token)

I
am
this
to
in
and
on
and


In [3]:
def preprocess(text):
    doc = nlp(text)

    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return no_stop_words

In [4]:
preprocess("I am creating this account to invest in U.S. stocks and ETFs, focusing on companies like NVIDIA and Nike.")

['creating',
 'account',
 'invest',
 'U.S.',
 'stocks',
 'ETFs',
 'focusing',
 'companies',
 'like',
 'NVIDIA',
 'Nike']

In [5]:
import pandas as pd

df = pd.read_json("combined.json", lines=True)
df.shape

(13087, 6)

In [6]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [7]:
df = df[ df["topics"].apply(len)!=0 ]
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [8]:
df.shape

(4688, 6)

In [9]:
len(df["contents"].iloc[4])

5504

In [10]:
df["contents_new"] = df["contents"].apply(preprocess)
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],"[U.S., Department, Justice, U.S., Environmenta..."
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],"[131, count, criminal, indictment, unsealed, t..."
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...","[United, States, Attorney, Office, Middle, Dis..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],"[21st, Century, Oncology, LLC, agreed, pay, $,..."
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]","[21st, Century, Oncology, Inc., certain, subsi..."


In [15]:
df["contents"].iloc[4][:300]

'21st Century Oncology Inc. and certain of its subsidiaries and affiliates have agreed to pay $26 million to the government to resolve a self-disclosure relating to the submission of false attestations regarding the company’s use of electronic health records software and separate allegations that the'

In [16]:
df["contents_new"].iloc[4][:300]

['21st',
 'Century',
 'Oncology',
 'Inc.',
 'certain',
 'subsidiaries',
 'affiliates',
 'agreed',
 'pay',
 '$',
 '26',
 'million',
 'government',
 'resolve',
 'self',
 'disclosure',
 'relating',
 'submission',
 'false',
 'attestations',
 'company',
 'use',
 'electronic',
 'health',
 'records',
 'software',
 'separate',
 'allegations',
 'violated',
 'False',
 'Claims',
 'Act',
 'submitting',
 'causing',
 'submission',
 'claims',
 'certain',
 'services',
 'provided',
 'pursuant',
 'referrals',
 'physicians',
 'improper',
 'financial',
 'relationships',
 '\xa0 ',
 'Justice',
 'Department',
 'committed',
 'zealously',
 'investigating',
 'improper',
 'financial',
 'relationships',
 'potential',
 'compromise',
 'physicians',
 'medical',
 'judgment',
 'said',
 'Acting',
 'Assistant',
 'Attorney',
 'General',
 'Chad',
 'A.',
 'Readler',
 'Justice',
 'Department',
 'Civil',
 'Division',
 '\xa0 ',
 'work',
 'companies',
 'accept',
 'responsibility',
 'past',
 'compliance',
 'failures',
 'promptl