# News Cleaning From Lexis Nexis

<b> Search Terms: </b>

abortion OR legislative bodies OR abortion laws OR pregnancy & childbirth OR reproductive rights OR abortion rights OR reproductive health clinics

<b> Newspapers used: </b>

1. New York Times
2. USA Today and USA Today Online
3. Newsweek
4. The Politico
5. Chicago Daily Herald
6. Florida Politics
7. Tampa Bay Times

<b> Timeline: </b>
24 June 2022 - 24 July 2022

In [2]:
# Load libraries
from striprtf.striprtf import rtf_to_text
import pandas as pd
import re
import os

# Base directory for every data path built below. This is the process's
# working directory, so the notebook has to be run from the directory that
# contains the `data/` folder.
current_dir = os.getcwd()

In [35]:
# Function to clean articles
def process_file(file_path):
    """Read one RTF export and return its article text as a cleaned string.

    The RTF markup is converted to plain text, everything in front of the
    first 'Body' marker is discarded, a fixed set of boilerplate strings is
    removed, newlines are turned into spaces, and finally every character
    that is not an ASCII letter, digit or whitespace is deleted.

    Args:
        file_path: Path of the RTF file to read.

    Returns:
        The cleaned article text.
    """
    with open(file_path) as rtf_file:
        plain = rtf_to_text(rtf_file.read())

    # Cut the header: keep the text starting at the first 'Body' marker.
    # If the marker is missing the text is left untouched.
    plain = re.sub(r'^.*?Body', 'Body', plain, flags=re.DOTALL)

    # Applied strictly in this order. Newlines become spaces before the
    # Florida Politics footer is matched.
    # NOTE(review): 'Body' and 'Byline' are removed wherever they occur, so a
    # sentence starting with "Body ..." loses that word too - confirm this is
    # acceptable for the analysis.
    substitutions = (
        ('Link to Image', ''),
        ('\n', ' '),
        ('End of Document', ''),
        ('Byline', ''),
        ('Body', ''),
        ('( Florida Politics: http://floridapolitics.com Delivered by Newstex)', ''),
    )
    for needle, substitute in substitutions:
        plain = plain.replace(needle, substitute)

    # Punctuation is deleted (not replaced by a space), so hyphenated or
    # apostrophised words are fused, e.g. "10-year-old" -> "10yearold".
    return re.sub(r'[^a-zA-Z0-9\s]', '', plain)

### New York Times

In [13]:
# Folder holding the New York Times RTF exports
folder_path = os.path.join(current_dir, 'data', 'NYT')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
nyt_dta = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

nyt = pd.DataFrame(nyt_dta)

In [14]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
nyt['Publisher'] = 'New York Times'

In [15]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
nyt.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Here is how state abortion laws are changing,About half of all US states are expected to...,New York Times
1,An Abortion Rights Champion of the 1970s on Li...,Listen and follow The Daily Apple Podcasts ...,New York Times
2,'Disturbing' Views of Anti-Abortion Activists,Summary summary summary To the Editor Re...,New York Times
3,"How Mexico ’s Top Justice, Raised Catholic, Be...",Influenced by feminists close to him the ch...,New York Times
4,Where Does the Fight Over Abortion Rights Go A...,This article is part of the Debatable newsl...,New York Times


### USA Today/Online

In [16]:
# Folder holding the USA Today RTF exports
folder_path = os.path.join(current_dir, 'data', 'USA_Today')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
us_dta = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

usa_tday = pd.DataFrame(us_dta)

In [17]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
usa_tday['Publisher'] = 'USA Today'

In [18]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
usa_tday.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Abortion foes see much more work ahead For man...,Fifteenyearold Elizabeth Reed had already m...,USA Today
1,"Abortion rights must be line in sand as WNBA ,...",As American women seethe over the loss of t...,USA Today
2,"After Roe v. Wade overturned, battle over abor...",The Supreme Courts watershed decision to ov...,USA Today
3,As more companies cover abortion travel in hea...,Even before the Supreme Court overturned Ro...,USA Today
4,'This is not over'_ Harris warns Supreme Court...,WASHINGTON Vice President Kamala Harris wa...,USA Today


### Newsweek

In [19]:
# Folder holding the Newsweek RTF exports
folder_path = os.path.join(current_dir, 'data', 'Newsweek')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
newswk_dta = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

newsweek = pd.DataFrame(newswk_dta)

In [20]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
newsweek['Publisher'] = 'Newsweek'

In [21]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
newsweek.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Democrats Can Save National Abortion Right on ...,Democrat lawmakers outraged by the Supreme ...,Newsweek
1,Democrats Want Google to Hide Care from Women ...,One of the many tragic ironies of the Lefts...,Newsweek
2,"In Overturning Roe, the Supreme Court Has Enda...",Progressive women across America are rightf...,Newsweek
3,Tucker Carlson Fumes at Employers' Abortion Ai...,Fox News host Tucker Carlson is lashing out...,Newsweek
4,Who Is Lynn Fitch_ The Woman Who Helped Take D...,Mississippi Attorney General Lynn Fitch may...,Newsweek


### Politico

In [22]:
# Folder holding the Politico RTF exports
folder_path = os.path.join(current_dir, 'data', 'Politico')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
polit_dta = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

politico = pd.DataFrame(polit_dta)

In [23]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
politico['Publisher'] = 'The Politico'

In [24]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
politico.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Democrats launch organizing hub to channel res...,Top Democratic campaign committees are laun...,The Politico
1,Democrats bet on abortion in bid to oust pro-i...,HANFORD Calif Republican Rep David Valadao...,The Politico
2,National Right to Life official_ 10-year-old s...,The 10yearold Ohio girl who crossed state l...,The Politico
3,Opinion _ I Am a Man With a Genetic Condition....,In June when the Supreme Court voted to ove...,The Politico
4,What's next for virtual abortions post-Roe,Telehealth is set to play a crucial role fo...,The Politico


### Chicago Daily Herald

In [25]:
# Folder holding the Chicago Daily Herald RTF exports
folder_path = os.path.join(current_dir, 'data', 'Chicago_Herald')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
chi_dta = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

chicago_herald = pd.DataFrame(chi_dta)

In [26]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
chicago_herald['Publisher'] = 'Chicago Daily Herald'

In [27]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
chicago_herald.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Biden signs order on abortion access after Sup...,WASHINGTON President Joe Biden signed an e...,Chicago Daily Herald
1,Police at Arizona Capitol fire tear gas,PHOENIX Police fired tear gas to disperse ...,Chicago Daily Herald
2,Poll_ Most in U.S. want abortion legal,WASHINGTON A majority of Americans say Con...,Chicago Daily Herald
3,Doctor sets up abortion services in Rockford t...,ROCKFORD A Wisconsin doctor has purchased ...,Chicago Daily Herald
4,Illinois ' neighbors move quickly to ban abort...,Two of the five states bordering Illinois h...,Chicago Daily Herald


### Florida Politics

In [28]:
# Folder holding the Florida Politics RTF exports
folder_path = os.path.join(current_dir, 'data', 'Florida_Politics')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
flo_pol = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

florida_politics = pd.DataFrame(flo_pol)

In [29]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
florida_politics['Publisher'] = 'Florida Politics'

In [30]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
florida_politics.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Diagnosis for 7.18.22_ Checking the pulse of F...,Jul 18 2022 Welcome back to Diagnosis a ...,Florida Politics
1,'Emergency' Friday evening abortion rights ral...,Jun 24 2022 This is a major loss for gen...,Florida Politics
2,"Tampa Bay officials protest, react to Roe v. W...",Jun 25 2022 The Supreme Court1 has nulli...,Florida Politics
3,Ruth's List Florida backs Patricia Hawkins-Wil...,Jul 05 2022 Abortion rights group Ruths ...,Florida Politics
4,"AP-NORC poll_ Abortion, women's rights grow as...",Jul 02 2022 A new poll finds a growing p...,Florida Politics


### Tampa Bay Times

In [31]:
# Folder holding the Tampa Bay Times RTF exports
folder_path = os.path.join(current_dir, 'data', 'Tampa_Bay')

# One record per .RTF file: the file name without its extension serves as the
# article title, and the cleaned file content as the body.
tamp = [
    {
        'Article_Title': os.path.splitext(rtf_name)[0],
        'Body': process_file(os.path.join(folder_path, rtf_name)),
    }
    for rtf_name in os.listdir(folder_path)
    if rtf_name.endswith('.RTF')
]

tampa_bay = pd.DataFrame(tamp)

In [32]:
# Tag every row with its outlet so the source is still known once the
# per-publisher frames are concatenated.
tampa_bay['Publisher'] = 'Tampa Bay Times'

In [33]:
# Preview the first rows to spot-check the parsed titles and cleaned bodies
tampa_bay.head()

Unnamed: 0,Article_Title,Body,Publisher
0,Many states are looking toward abortion bans w...,Most of the states that have acted quickly ...,Tampa Bay Times
1,House making 1st attempt to protect abortion i...,WASHINGTON The House on Friday is expected...,Tampa Bay Times
2,What do Americans think about abortion policy_...,The political fallout from the Supreme Cour...,Tampa Bay Times
3,I wish my Catholic Church would support contra...,For 95 years my mother Shirley Clark was a ...,Tampa Bay Times
4,Florida reacts to Roe v. Wade overturn by Supr...,The US Supreme Court overturned Roe v Wade ...,Tampa Bay Times


In [34]:
# Stack the seven per-publisher frames into a single corpus; ignore_index
# discards each frame's own row labels and numbers the combined rows afresh.
publisher_frames = [
    nyt,
    usa_tday,
    newsweek,
    politico,
    chicago_herald,
    florida_politics,
    tampa_bay,
]
news = pd.concat(publisher_frames, ignore_index=True)

In [36]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import sys  
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manasi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# NLTK's English stopword list; tokens found in it are dropped during the
# text-cleaning pass further down.
all_stopwords = stopwords.words('english')

#create POS tagging and initialise lemmatizer
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) decides the result:
    J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag falls
    back to noun.
    """
    # Resolve all four constants up front, then pick by the tag's first letter
    adjective, noun, verb, adverb = (
        wordnet.ADJ,
        wordnet.NOUN,
        wordnet.VERB,
        wordnet.ADV,
    )
    initial = nltk.pos_tag([word])[0][1][0].upper()

    if initial == "J":
        return adjective
    if initial == "V":
        return verb
    if initial == "R":
        return adverb
    # Covers "N" as well as every tag outside the four handled letters
    return noun

# Single shared lemmatizer instance, reused for every token in the cleaning loop
lemmatizer = WordNetLemmatizer()

In [38]:
# Normalise every article body: lowercase, expand contractions, replace
# non-alphanumeric characters with spaces, tokenize, drop English stopwords
# and lemmatize each remaining token using its part-of-speech tag.

# Build the lookup once as a set: membership tests against the original list
# scan it linearly for every token of every article.
stopword_set = set(all_stopwords)

cleaned_text = []  # one cleaned string per row of `news`, in row order
for sent in news['Body']:
    sent = sent.lower()  # convert to lowercase
    sent = contractions.fix(sent)  # expand contractions
    # replace anything that is not an ASCII letter or digit with a space
    sent = re.sub(r'[^a-zA-Z0-9]', ' ', sent)

    words = nltk.word_tokenize(sent)  # split the string into tokens
    words = [word for word in words if word not in stopword_set]  # remove stopwords
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]  # lemmatize
    cleaned_text.append(' '.join(words))  # re-join tokens into one string

news['cleaned_text'] = cleaned_text