# Raw news texts - preprocessing #

In [14]:
import os
import pandas as pd

# make a list of the raw news text files collected into a directory
# ensuring that Mac's autogenertaed, hidden '.DS_Store' file is not inlcuded in this list
news_files = [f for f in os.listdir(
    "/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_final/data/NOW_raw")
         if not f.startswith('.') | f.startswith('N')]

In [15]:
news_files

['20_07-us1.txt',
 '20_06-us4.txt',
 '20_06-us5.txt',
 '20_07-us2.txt',
 '20_06-us7.txt',
 '20_06-us6.txt',
 '20_07-us3.txt',
 '20_07-us7.txt',
 '20_06-us2.txt',
 '20_06-us3.txt',
 '20_07-us6.txt',
 '20_07-us4.txt',
 '20_06-us1.txt',
 '20_07-us5.txt',
 '20-05-us5.txt',
 '20-05-us4.txt',
 '20-05-us1.txt',
 '20-05-us3.txt',
 '20-05-us2.txt',
 '20_07-us8.txt',
 '20_06-us8.txt']

In [16]:
# checking current directory
%pwd

'/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_repo/data/NOW_raw'

In [17]:
# changing directory to the news_file list's directory
%cd "/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_final/data/NOW_raw"

# importing multiple raw news txt files into 1 pandas df
news_df =  pd.concat([pd.read_csv(item, sep = '\t', names=["text"]) for item in news_files], axis=0, ignore_index=True)

/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_repo/data/NOW_raw


In [18]:
print(len(news_df))

457980


In [6]:
# extracting the ID number from the beginning of each entry


# putting the ID number into a new column called 'ID'
news_df['ID'] = news_df["text"].str.split(' ').str[0]

# dropping the @@ from the beginning of each ID, with the help of regex
# guide: https://stackoverflow.com/questions/44117326/how-can-i-remove-all-non-numeric-characters-from-all-the-values-in-a-particular)
news_df['ID'] = news_df['ID'].str.extract('(\d+)', expand = False)

# dropping first row without a proper ID - this was not needed after all in this specific file
news_df = news_df[pd.to_numeric(news_df['ID'], errors='coerce').notnull()]

#checking whether ID is numeric, if not, making it into 'int'
print(news_df.dtypes)
news_df["ID"] = news_df["ID"].astype(int)
print(news_df.dtypes)


text    object
ID      object
dtype: object
text    object
ID       int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df["ID"] = news_df["ID"].astype(int)


In [7]:
# checking how the final outcome of these modifications to the news_df dataframe look like
news_df

Unnamed: 0,text,ID
0,@@31930100 <h> 9 powerful Jewish designs by Mi...,31930100
1,@@31930101 <h> Jewish organizations are in dan...,31930101
2,@@31930102 <h> Stop &amp; Shop rewards benefit...,31930102
3,@@31930103 <p> Colorado Senate Primary : John ...,31930103
4,@@31930104 <h> Vials of the drug remdesivir . ...,31930104
...,...,...
457975,@@85309293 <p> The capital loan is expected to...,85309293
457976,"@@85309296 <p> Each year , Dole grows billions...",85309296
457977,@@85309297 <h> The Motion Picture Academy Surp...,85309297
457978,@@85309298 <h> Oklahoma voters narrowly approv...,85309298


### Text cleaning ###

In [8]:
# checking one random newstext, what kind of weird systematic symbols are in it?
news_df.iloc[7]['text']

'@@31930108 <p> We are entering a new surge of coronavirus cases -- and of dangerous disinformation coming from the White House . <p> It originally was President Trump who ignored the threat , claiming in February that the virus would disappear " like a miracle . " Now it \'s Vice President Mike Pence , the head of the administration \'s Coronavirus Task Force , repeatedly insisting that the surge in cases is due to more testing . <p> Let \'s be clear : The reason the nation , California and the Bay Area are experiencing significant increases in COVID-19 cases is not just because we \'re testing more people . It \'s also because a greater portion of people being tested have the virus . <p> Two weeks ago , an average of the prior seven days showed that 4.4% of the tests nationwide were coming back positive . Today , that positivity rate has shot up to 6.9% . Put another way , tests for the coronavirus are 57% more likely to be positive today than they were just two weeks ago . <p> We \'

In [9]:
# text cleaning of one specific news text

import re
text = news_df.iloc[7]['text']

# regexing into a clean text:
# 1. remove some special characters
# 2. remove <p>
# 3. remove <h>
# 4. remove 8 digit numbers (ID)
# 5. remove unneccessary space in front of .:,?!'

text_cleaned = re.sub(r"[@*#()&_^]|<p[^>]*>|<h[^>]*>|\d{8}|\s+(?=[.:,?!'])|\s+(?=[.:,?!'])|\\\\", "", text)

# 6. replace many spaces next to each other, replace with 1 space
text_cleaned = re.sub(r"\s\s+", " ", text_cleaned)


text_cleaned

' We are entering a new surge of coronavirus cases -- and of dangerous disinformation coming from the White House. It originally was President Trump who ignored the threat, claiming in February that the virus would disappear " like a miracle. " Now it\'s Vice President Mike Pence, the head of the administration\'s Coronavirus Task Force, repeatedly insisting that the surge in cases is due to more testing. Let\'s be clear: The reason the nation, California and the Bay Area are experiencing significant increases in COVID-19 cases is not just because we\'re testing more people. It\'s also because a greater portion of people being tested have the virus. Two weeks ago, an average of the prior seven days showed that 4.4% of the tests nationwide were coming back positive. Today, that positivity rate has shot up to 6.9%. Put another way, tests for the coronavirus are 57% more likely to be positive today than they were just two weeks ago. We\'re seeing the same trend in California, where the ra

In [10]:
# making succesful text cleaning into a function

def text_cleaner(text):
    text_cleaned = re.sub(r"[@*#()&_^]|<p[^>]*>|<h[^>]*>|\d{8}|\s+(?=[.:,?!'])|\s+(?=[.:,?!'])", "", text)
    text_cleaned = re.sub(r"\s\s+", " ", text_cleaned)   
    return text_cleaned


In [11]:
# testing text cleaner function

selected_text = news_df.iloc[7]['text']
#selected_text

example_cleaned = text_cleaner(selected_text)
print(example_cleaned)

 We are entering a new surge of coronavirus cases -- and of dangerous disinformation coming from the White House. It originally was President Trump who ignored the threat, claiming in February that the virus would disappear " like a miracle. " Now it's Vice President Mike Pence, the head of the administration's Coronavirus Task Force, repeatedly insisting that the surge in cases is due to more testing. Let's be clear: The reason the nation, California and the Bay Area are experiencing significant increases in COVID-19 cases is not just because we're testing more people. It's also because a greater portion of people being tested have the virus. Two weeks ago, an average of the prior seven days showed that 4.4% of the tests nationwide were coming back positive. Today, that positivity rate has shot up to 6.9%. Put another way, tests for the coronavirus are 57% more likely to be positive today than they were just two weeks ago. We're seeing the same trend in California, where the rate has 

In [12]:
# applying text cleaning function to news dataset

news_df["text"] = news_df.apply(lambda row:
                                text_cleaner(row["text"]), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df["text"] = news_df.apply(lambda row:


In [13]:
news_df

Unnamed: 0,text,ID
0,9 powerful Jewish designs by Milton Glaser Th...,31930100
1,Jewish organizations are in danger Our legacy...,31930101
2,Stop amp; Shop rewards benefit Nanuet shul Th...,31930102
3,Colorado Senate Primary: John Hickenlooper Fa...,31930103
4,"Vials of the drug remdesivir. Until now, Gile...",31930104
...,...,...
457975,The capital loan is expected to be repaid wit...,85309293
457976,"Each year, Dole grows billions of bananas -- ...",85309296
457977,The Motion Picture Academy Surpassed Its Dive...,85309297
457978,Oklahoma voters narrowly approve Medicaid exp...,85309298


In [14]:
 ## adding ".." to the end of each text, so that SVO extractor works later
news_df['text'] = news_df['text'].astype(str) + '..'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['text'] = news_df['text'].astype(str) + '..'


In [15]:
# looking at 1 news text to see final look

news_df.iloc[7]['text']

' We are entering a new surge of coronavirus cases -- and of dangerous disinformation coming from the White House. It originally was President Trump who ignored the threat, claiming in February that the virus would disappear " like a miracle. " Now it\'s Vice President Mike Pence, the head of the administration\'s Coronavirus Task Force, repeatedly insisting that the surge in cases is due to more testing. Let\'s be clear: The reason the nation, California and the Bay Area are experiencing significant increases in COVID-19 cases is not just because we\'re testing more people. It\'s also because a greater portion of people being tested have the virus. Two weeks ago, an average of the prior seven days showed that 4.4% of the tests nationwide were coming back positive. Today, that positivity rate has shot up to 6.9%. Put another way, tests for the coronavirus are 57% more likely to be positive today than they were just two weeks ago. We\'re seeing the same trend in California, where the ra

# Source files - preprocessing #

In [19]:
# make a list of the source files collected into a directory
# ensuring that Mac's autogenertaed, hidden '.DS_Store' file is not inlcuded in this list
source_files = [f for f in os.listdir(
    "/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_final/data/NOW_raw/NOW_source_files")
         if not f.startswith('.')]

In [20]:
source_files

['sources-20-05.txt', 'sources-20-06.txt', 'sources-20-07.txt']

In [21]:
# changing directory to the source_file list's directory
%cd "/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_final/data/NOW_raw/NOW_source_files"

# importing multiple source txt files into 1 pandas df
source_df =  pd.concat([pd.read_csv(item, sep='\t', names=["ID", "weird_id", "Date", "Country", "News_source_name", "link", "title"]) for item in source_files], axis=0, ignore_index=True)

/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_repo/data/NOW_raw/NOW_source_files


In [22]:
print(len(source_df))

1226381


In [20]:
# filtering news sources for US based news
US_source_df = source_df[source_df["Country"] == "US"]

In [21]:
US_source_df

Unnamed: 0,ID,weird_id,Date,Country,News_source_name,link,title
0,84774542,287,20-05-01,US,Pagosa Springs SUN,http://www.pagosasun.com/dust2-team-registrati...,DUST2 team registration to open May 1
1,84774549,391,20-05-01,US,Bleeding Cool,https://bleedingcool.com/tv/the-umbrella-acade...,The Umbrella Academy Star Justin Min Posts Har...
2,84774534,351,20-05-01,US,WhatTheyThink,http://whattheythink.com/video/100671-frank-ar...,Frank Is Artless
3,84774553,179,20-05-01,US,Deadline.com,https://deadline.com/2020/04/wga-amptp-agree-t...,WGA & AMPTP Agree To Extend Current Film & TV ...
4,84774556,759,20-05-01,US,FOX6 Milwaukee,https://fox6now.com/2020/05/01/archdiocese-of-...,Archdiocese of Milwaukee reveals guidelines fo...
...,...,...,...,...,...,...,...
999816,85555025,123,20-07-31,US,YAHOO!,https://www.yahoo.com/entertainment/apartment-...,An apartment in London has a literal swimming ...
999817,85551398,794,20-07-31,US,Sports Illustrated,https://www.si.com/gambling/2020/07/31/nba-bes...,"NBA Bubble Best Bets and DFS Plays: Friday, Ju..."
999818,85551415,414,20-07-31,US,CNBC on MSN.com,https://www.msn.com/en-us/health/medical/world...,World Health Organization reports largest ever...
999819,85547456,597,20-07-31,US,KQED,https://www.kqed.org/arts/13884126/paint-us-in...,'Paint Us in a Beautiful Light': Photographing...


In [22]:
# finding out how many and what kind of duplicates there are in the source df

print(US_source_df.ID.duplicated().sum())
print(US_source_df.weird_id.duplicated().sum())
print(US_source_df.link.duplicated().sum())
print(US_source_df.title.duplicated().sum())


0
565942
17875
55518


# Merging news dataset with sources dataset #

In [23]:
# merge the news dataframe with the sources dataframe, then dropping nan-s to filter for US news

news_sources_merged = news_df.merge(US_source_df, on='ID', how='left')

print(news_sources_merged.dtypes)

text                 object
ID                    int64
weird_id            float64
Date                 object
Country              object
News_source_name     object
link                 object
title                object
dtype: object


In [24]:
news_sources_merged

Unnamed: 0,text,ID,weird_id,Date,Country,News_source_name,link,title
0,9 powerful Jewish designs by Milton Glaser Th...,31930100,858.0,20-07-01,US,timesofisrael.com,https://jewishchronicle.timesofisrael.com/9-po...,9 powerful Jewish designs by Milton Glaser | T...
1,Jewish organizations are in danger Our legacy...,31930101,870.0,20-07-01,US,timesofisrael.com,https://jewishchronicle.timesofisrael.com/jewi...,Jewish organizations are in danger | The Pitts...
2,Stop amp; Shop rewards benefit Nanuet shul Th...,31930102,129.0,20-07-01,US,timesofisrael.com,https://jewishstandard.timesofisrael.com/stop-...,Stop &amp; Shop rewards benefit Nanuet shul | ...
3,Colorado Senate Primary: John Hickenlooper Fa...,31930103,1492.0,20-07-01,US,npr.org,https://www.npr.org/2020/06/28/883922991/one-o...,Colorado Senate Primary: John Hickenlooper Fac...
4,"Vials of the drug remdesivir. Until now, Gile...",31930104,1150.0,20-07-01,US,marketwatch.com,https://www.marketwatch.com/story/global-healt...,Global health experts criticize U.S. deal to h...
...,...,...,...,...,...,...,...,...
457968,The capital loan is expected to be repaid wit...,85309293,366.0,20-06-30,US,San Diego Union-Tribune on MSN.com,https://www.msn.com/en-us/finance/news/measure...,Measure A tax dollars will fund new ambulance ...
457969,"Each year, Dole grows billions of bananas -- ...",85309296,561.0,20-06-30,US,Fast Company on MSN.com,https://www.msn.com/en-us/foodanddrink/foodnew...,Banana leaf packaging and pineapple powder: Ho...
457970,The Motion Picture Academy Surpassed Its Dive...,85309297,576.0,20-06-30,US,InStyle on MSN.com,https://www.msn.com/en-us/movies/other/the-mot...,The Motion Picture Academy Surpassed Its Diver...
457971,Oklahoma voters narrowly approve Medicaid exp...,85309298,167.0,20-06-30,US,,https://www.msn.com/en-us/news/politics/oklaho...,Oklahoma voters narrowly approve Medicaid expa...


In [25]:
# checking again duplicate news
# we check now as earlier the text column still inlcided unique ID, so the news texts were all unique
# removing news texts that are duplicate in their text

news_sources_merged.text.duplicated().sum()

news_sources_merged = news_sources_merged.drop_duplicates("text").reset_index(drop=True)

In [26]:
news_sources_merged

Unnamed: 0,text,ID,weird_id,Date,Country,News_source_name,link,title
0,9 powerful Jewish designs by Milton Glaser Th...,31930100,858.0,20-07-01,US,timesofisrael.com,https://jewishchronicle.timesofisrael.com/9-po...,9 powerful Jewish designs by Milton Glaser | T...
1,Jewish organizations are in danger Our legacy...,31930101,870.0,20-07-01,US,timesofisrael.com,https://jewishchronicle.timesofisrael.com/jewi...,Jewish organizations are in danger | The Pitts...
2,Stop amp; Shop rewards benefit Nanuet shul Th...,31930102,129.0,20-07-01,US,timesofisrael.com,https://jewishstandard.timesofisrael.com/stop-...,Stop &amp; Shop rewards benefit Nanuet shul | ...
3,Colorado Senate Primary: John Hickenlooper Fa...,31930103,1492.0,20-07-01,US,npr.org,https://www.npr.org/2020/06/28/883922991/one-o...,Colorado Senate Primary: John Hickenlooper Fac...
4,"Vials of the drug remdesivir. Until now, Gile...",31930104,1150.0,20-07-01,US,marketwatch.com,https://www.marketwatch.com/story/global-healt...,Global health experts criticize U.S. deal to h...
...,...,...,...,...,...,...,...,...
450220,The capital loan is expected to be repaid wit...,85309293,366.0,20-06-30,US,San Diego Union-Tribune on MSN.com,https://www.msn.com/en-us/finance/news/measure...,Measure A tax dollars will fund new ambulance ...
450221,"Each year, Dole grows billions of bananas -- ...",85309296,561.0,20-06-30,US,Fast Company on MSN.com,https://www.msn.com/en-us/foodanddrink/foodnew...,Banana leaf packaging and pineapple powder: Ho...
450222,The Motion Picture Academy Surpassed Its Dive...,85309297,576.0,20-06-30,US,InStyle on MSN.com,https://www.msn.com/en-us/movies/other/the-mot...,The Motion Picture Academy Surpassed Its Diver...
450223,Oklahoma voters narrowly approve Medicaid exp...,85309298,167.0,20-06-30,US,,https://www.msn.com/en-us/news/politics/oklaho...,Oklahoma voters narrowly approve Medicaid expa...


## Filtering for BLM news ## 

In [28]:
# filtering for news that include BLM relevant terms

term_list = ["Breonna Taylor", "George Floyd", "George Zimmerman", "Black Lives Matter", "BLM", "#BlackLivesMatter", "National Guard", "Derek Chauvin", "Blue Lives Matter", "#BlueLivesMatter", "White Lives Matter", "#WhiteLivesMatter", "All Lives Matter", "#AllLivesMatter", "police brutality"]


term_list_df = news_sources_merged[news_sources_merged.apply(lambda r: any([kw in r[0] for kw in term_list]), axis=1)]
term_list_df = term_list_df.reset_index(level = None, drop = False)


term_list_df

Unnamed: 0,index,text,ID,weird_id,Date,Country,News_source_name,link,title
0,3,Colorado Senate Primary: John Hickenlooper Fa...,31930103,1492.0,20-07-01,US,npr.org,https://www.npr.org/2020/06/28/883922991/one-o...,Colorado Senate Primary: John Hickenlooper Fac...
1,13,SACRAMENTO -- Public access to police discipl...,31930303,775.0,20-07-01,US,mercurynews.com,https://www.mercurynews.com/2020/06/29/bill-wo...,Bill would broaden and speed up access to Cali...
2,18,"Dolores Attea Sapienza, 88, queen of special ...",31930400,1056.0,20-07-01,US,buffalonews.com,https://buffalonews.com/news/local/dolores-att...,"Dolores Attea Sapienza, 88, queen of special e..."
3,19,Roswell Park plans expansion of cancer care i...,31930401,1014.0,20-07-01,US,buffalonews.com,https://buffalonews.com/news/local/roswell-par...,Roswell Park plans expansion of cancer care in...
4,25,Subscribers to The Climate Crisis newsletter ...,31930409,2318.0,20-07-01,US,newyorker.com,https://www.newyorker.com/news/annals-of-a-war...,At the Core of the Climate Crisis | The New Yo...
...,...,...,...,...,...,...,...,...,...
41724,450142,Colorado Rockies shortstop Ian Desmond became...,85307989,403.0,20-06-30,US,UPI.com,https://www.upi.com/Sports_News/MLB/2020/06/30...,"All-Star SS Ian Desmond to skip MLB season, ci..."
41725,450144,""" I have very, very strict standards about me...",85307991,443.0,20-06-30,US,Us Weekly,https://www.usmagazine.com/celebrity-moms/news...,Why Phaedra Parks Hasn't Introduced Her 2 Kids...
41726,450153,"Comedy legend Carl Reiner, one of the earlies...",85308187,980.0,20-06-30,US,NBC News,https://www.nbcnews.com/news/us-news/comedy-le...,"Comedy legend Carl Reiner, of 'The Dick Van Dy..."
41727,450185,Jon Stewart and Rose Byrne want' Irresistible...,85308688,1086.0,20-06-30,US,Houston Chronicle,https://www.chron.com/entertainment/article/Jo...,Jon Stewart and Rose Byrne want 'Irresistible'...


In [32]:
# exporting to csv file on my computer

term_list_df.to_csv("/Users/veronikajuhasz/Documents/University/Cognitive_science_AU/Semester_5/Thesis/analysis/bachelor_project_final/data/NOW_preprocessed/BLM_filtered_preprocessed_news_sources_df.csv", index = False)