In [21]:
import os
import pickle

import requests
from bs4 import BeautifulSoup


def url_to_transcript(url):
    '''Return the paragraph texts of an article page on nytimes.com.

    Parameters
    ----------
    url : str
        Address of the article to download.

    Returns
    -------
    list of str
        Text of every <p> element found inside the article-body container,
        in document order.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    ValueError
        If the downloaded page contains no element with the expected
        article-body class.
    '''
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # find() returns None when nothing matches; check before chaining so the
    # failure names the offending URL instead of raising an AttributeError.
    body = soup.find(class_="meteredContent css-1r7ky0e")
    if body is None:
        raise ValueError("No article body found at " + url)
    text = [p.text for p in body.find_all('p')]
    # Progress indicator: one line per successfully scraped article.
    print(url)
    return text

# Opinion pieces to scrape, in the order in which they are numbered below.
urls = [
    'https://www.nytimes.com/2020/06/19/opinion/capitol-confederacy-statues.html/',
    'https://www.nytimes.com/2020/06/17/opinion/george-floyd-arab-muslims-racism.html/',
    'https://www.nytimes.com/2020/06/14/opinion/george-floyd-psalms-bible.html/',
    'https://www.nytimes.com/2020/06/01/opinion/george-floyd-protest-police.html/',
    'https://www.nytimes.com/2020/05/29/opinion/Minneapolis-police-George-Floyd.html/',
    'https://www.nytimes.com/2020/06/08/opinion/george-floyd-protests-race.html/',
    'https://www.nytimes.com/2020/06/02/opinion/george-floyd-protests-first-amendment.html/',
    'https://www.nytimes.com/2020/06/06/opinion/sunday/george-floyd-structural-racism.html/',
]

# One string label per article: '1' for the first URL, '2' for the second, ...
editorials = [str(position) for position, _ in enumerate(urls, start=1)]


In [23]:
# Download every article in `urls`, one request per URL (takes a few minutes to run)
transcripts = list(map(url_to_transcript, urls))

https://www.nytimes.com/2020/06/19/opinion/capitol-confederacy-statues.html/
https://www.nytimes.com/2020/06/17/opinion/george-floyd-arab-muslims-racism.html/
https://www.nytimes.com/2020/06/14/opinion/george-floyd-psalms-bible.html/
https://www.nytimes.com/2020/06/01/opinion/george-floyd-protest-police.html/
https://www.nytimes.com/2020/05/29/opinion/Minneapolis-police-George-Floyd.html/
https://www.nytimes.com/2020/06/08/opinion/george-floyd-protests-race.html/
https://www.nytimes.com/2020/06/02/opinion/george-floyd-protests-first-amendment.html/
https://www.nytimes.com/2020/06/06/opinion/sunday/george-floyd-structural-racism.html/


In [28]:
# Pickle each article's paragraph list for later use

# Make a new directory to hold the files. os.makedirs is portable across
# operating systems and, with exist_ok=True, does not complain when the
# directory is already there (e.g. when the notebook is re-run).
os.makedirs("transcripts", exist_ok=True)

# NOTE: the files contain pickled Python lists despite the ".txt" extension;
# the paths are kept unchanged so anything reading them keeps working.
for i, c in enumerate(editorials):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [30]:
# Read every pickled transcript back into a dict keyed by editorial label.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
data = {}
for label in editorials:
    with open("transcripts/" + label + ".txt", "rb") as handle:
        data[label] = pickle.load(handle)

In [31]:
# Sanity check: list the keys of `data` to confirm every editorial was loaded
data.keys()

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8'])

In [34]:
# Spot check: show the first four list items stored under key '1'
data['1'][:4]

['Confederate statues are being pulled down across the South — from Birmingham, Ala., to Decatur, Ga., to Richmond, Va., the Confederacy’s former capital. The U.S. Navy and the Marines have banned public displays of the Confederate battle flag — as has NASCAR.',
 'Now, Congress is taking its own halting steps forward. On Thursday, the House speaker, Nancy Pelosi, announced that portraits of four former House speakers who also served the Confederacy would be removed from display in the Capitol in observance of the Juneteenth holiday. (June 19 marks the day in 1865 when Union soldiers arrived in Galveston, Texas, with news of the end of slavery — two and a half years after the Emancipation Proclamation. It has come to be a more general celebration of liberation.)',
 'The portraits are of Robert M.T. Hunter of Virginia, who was speaker from 1839 to 1841 before serving in various high positions in the Confederacy, including secretary of state; Howell Cobb of Georgia, who was speaker from 1

In [35]:
# Let's take a look at our data again: peek at the first key of the dictionary
next(iter(data.keys()))

'1'

In [36]:
# Peek at the first value: the dictionary is currently in
# key: editorial, value: list of text format
next(iter(data.values()))

['Confederate statues are being pulled down across the South — from Birmingham, Ala., to Decatur, Ga., to Richmond, Va., the Confederacy’s former capital. The U.S. Navy and the Marines have banned public displays of the Confederate battle flag — as has NASCAR.',
 'Now, Congress is taking its own halting steps forward. On Thursday, the House speaker, Nancy Pelosi, announced that portraits of four former House speakers who also served the Confederacy would be removed from display in the Capitol in observance of the Juneteenth holiday. (June 19 marks the day in 1865 when Union soldiers arrived in Galveston, Texas, with news of the end of slavery — two and a half years after the Emancipation Proclamation. It has come to be a more general celebration of liberation.)',
 'The portraits are of Robert M.T. Hunter of Virginia, who was speaker from 1839 to 1841 before serving in various high positions in the Confederacy, including secretary of state; Howell Cobb of Georgia, who was speaker from 1

In [37]:
# Convert each value from a list of text pieces into a single string
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    return ' '.join(list_of_text)

In [38]:
# Build {label: [combined text]}; each value is a one-element list holding
# the whole article as a single string.
data_combined = {}
for label, paragraphs in data.items():
    data_combined[label] = [combine_text(paragraphs)]

In [39]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Show up to 150 characters of each cell when a frame is displayed
pd.set_option('max_colwidth', 150)

# from_dict yields one column per editorial label; flip it so that each
# label becomes a row, name the single column, and order rows by label.
articles_wide = pd.DataFrame.from_dict(data_combined)
data_df = articles_wide.T
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
1,"Confederate statues are being pulled down across the South — from Birmingham, Ala., to Decatur, Ga., to Richmond, Va., the Confederacy’s former ca..."
2,Ever since George Floyd was killed by Minneapolis police officers on May 25 after a grocery store reported that he had used a counterfeit $20 ther...
3,There are videos of Eric Garner and George Floyd being choked to death by police officers while pleading for their lives. There is a video of Tami...
4,"“Stop Killing Us!” Three words, scrawled on a sign held by a 3-year-old black boy at a Tampa protest against police brutality. Messages don’t get ..."
5,"A Minneapolis police officer, who was filmed kneeling on George Floyd’s neck for nearly nine minutes until the life left his body, has been fired,..."
6,"Our democracy hangs in the balance. This is not an overstatement. As protests, riots, and police violence roiled the nation last week, the preside..."
7,"When George Floyd died under the knee of a Minneapolis police officer, the scourge of police violence, festering for generations, became a rallyin..."
8,Imagine that no one had shot video of George Floyd being killed by the police in Minneapolis. There would have been a bland statement that he had ...


In [40]:
# Let's take a look at the full transcript stored under label '7'
data_df.transcript.loc['7']



In [41]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted, not replaced, so runs
        of spaces can remain where a token was dropped.
    '''
    text = text.lower()
    # Raw strings keep the regex backslashes intact and avoid the
    # "invalid escape sequence" warning that '\[' and '\w' trigger otherwise.
    text = re.sub(r'\[.*?\]', '', text)
    # string.punctuation covers ASCII punctuation only; characters such as
    # curly quotes or dashes pass through this step untouched.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop any run of word characters that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def round1(x):
    '''Run clean_text_round1 on x and return its result.'''
    return clean_text_round1(x)

In [42]:
# First cleaning pass: apply round1 to every transcript and wrap the
# resulting series in a one-column dataframe
transcripts_round1 = data_df.transcript.apply(round1)
data_clean = pd.DataFrame(transcripts_round1)
data_clean

Unnamed: 0,transcript
1,confederate statues are being pulled down across the south — from birmingham ala to decatur ga to richmond va the confederacy’s former capital the...
2,ever since george floyd was killed by minneapolis police officers on may after a grocery store reported that he had used a counterfeit there mus...
3,there are videos of eric garner and george floyd being choked to death by police officers while pleading for their lives there is a video of tamir...
4,“stop killing us” three words scrawled on a sign held by a black boy at a tampa protest against police brutality messages don’t get any clearer t...
5,a minneapolis police officer who was filmed kneeling on george floyd’s neck for nearly nine minutes until the life left his body has been fired ar...
6,our democracy hangs in the balance this is not an overstatement as protests riots and police violence roiled the nation last week the president vo...
7,when george floyd died under the knee of a minneapolis police officer the scourge of police violence festering for generations became a rallying p...
8,imagine that no one had shot video of george floyd being killed by the police in minneapolis there would have been a bland statement that he had d...


In [43]:
# Second round of cleaning
def clean_text_round2(text):
    '''Strip curly quotes, the ellipsis character and newline characters from text.'''
    # Each pattern is deleted outright (newlines are not replaced by a space).
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text

def round2(x):
    '''Run clean_text_round2 on x and return its result.'''
    return clean_text_round2(x)

In [44]:
# Second cleaning pass: apply round2 to the already-cleaned transcripts and
# wrap the resulting series in a one-column dataframe
transcripts_round2 = data_clean.transcript.apply(round2)
data_clean = pd.DataFrame(transcripts_round2)
data_clean

Unnamed: 0,transcript
1,confederate statues are being pulled down across the south — from birmingham ala to decatur ga to richmond va the confederacys former capital the ...
2,ever since george floyd was killed by minneapolis police officers on may after a grocery store reported that he had used a counterfeit there mus...
3,there are videos of eric garner and george floyd being choked to death by police officers while pleading for their lives there is a video of tamir...
4,stop killing us three words scrawled on a sign held by a black boy at a tampa protest against police brutality messages dont get any clearer than...
5,a minneapolis police officer who was filmed kneeling on george floyds neck for nearly nine minutes until the life left his body has been fired arr...
6,our democracy hangs in the balance this is not an overstatement as protests riots and police violence roiled the nation last week the president vo...
7,when george floyd died under the knee of a minneapolis police officer the scourge of police violence festering for generations became a rallying p...
8,imagine that no one had shot video of george floyd being killed by the police in minneapolis there would have been a bland statement that he had d...


In [45]:
# Let's take a look at our original dataframe (data_df, not the cleaned data_clean)
data_df

Unnamed: 0,transcript
1,"Confederate statues are being pulled down across the South — from Birmingham, Ala., to Decatur, Ga., to Richmond, Va., the Confederacy’s former ca..."
2,Ever since George Floyd was killed by Minneapolis police officers on May 25 after a grocery store reported that he had used a counterfeit $20 ther...
3,There are videos of Eric Garner and George Floyd being choked to death by police officers while pleading for their lives. There is a video of Tami...
4,"“Stop Killing Us!” Three words, scrawled on a sign held by a 3-year-old black boy at a Tampa protest against police brutality. Messages don’t get ..."
5,"A Minneapolis police officer, who was filmed kneeling on George Floyd’s neck for nearly nine minutes until the life left his body, has been fired,..."
6,"Our democracy hangs in the balance. This is not an overstatement. As protests, riots, and police violence roiled the nation last week, the preside..."
7,"When George Floyd died under the knee of a Minneapolis police officer, the scourge of police violence, festering for generations, became a rallyin..."
8,Imagine that no one had shot video of George Floyd being killed by the police in Minneapolis. There would have been a bland statement that he had ...


In [47]:
# Let's add the editorials' full names as well
# (for now these are just the string labels '1' through '8')
full_names = [str(number) for number in range(1, 9)]

# Values are assigned by position, one per row of data_df
data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
1,"Confederate statues are being pulled down across the South — from Birmingham, Ala., to Decatur, Ga., to Richmond, Va., the Confederacy’s former ca...",1
2,Ever since George Floyd was killed by Minneapolis police officers on May 25 after a grocery store reported that he had used a counterfeit $20 ther...,2
3,There are videos of Eric Garner and George Floyd being choked to death by police officers while pleading for their lives. There is a video of Tami...,3
4,"“Stop Killing Us!” Three words, scrawled on a sign held by a 3-year-old black boy at a Tampa protest against police brutality. Messages don’t get ...",4
5,"A Minneapolis police officer, who was filmed kneeling on George Floyd’s neck for nearly nine minutes until the life left his body, has been fired,...",5
6,"Our democracy hangs in the balance. This is not an overstatement. As protests, riots, and police violence roiled the nation last week, the preside...",6
7,"When George Floyd died under the knee of a Minneapolis police officer, the scourge of police violence, festering for generations, became a rallyin...",7
8,Imagine that no one had shot video of George Floyd being killed by the police in Minneapolis. There would have been a bland statement that he had ...,8


In [48]:
# Persist the corpus dataframe (data_df) to corpus.pkl for later use
data_df.to_pickle("corpus.pkl")

In [49]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Count word occurrences per document, ignoring common English stop words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per document, one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
# Rows come out in the order of data_clean.transcript, so reuse its index
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abatement,ablaze,able,abridge,absence,absorbed,abumayyaleh,abuse,abused,abusing,...,year,yearning,years,york,yorkers,yorks,young,younger,youre,zimmerman
1,0,0,0,0,0,0,0,0,0,0,...,0,0,2,1,0,0,1,0,0,0
2,7,0,0,0,0,0,2,0,0,0,...,0,0,0,4,2,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
5,0,1,0,0,1,0,0,2,1,0,...,3,0,1,1,0,0,1,0,0,0
6,0,0,0,0,0,0,0,2,0,0,...,1,0,2,1,0,0,3,1,1,1
7,0,0,0,1,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,2,0
8,0,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [50]:
# Persist the document-term matrix (data_dtm) to dtm.pkl for later use
data_dtm.to_pickle("dtm.pkl")

In [51]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# Use a context manager so the file is flushed and closed as soon as the
# dump finishes, instead of leaving an open handle behind.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)