In [6]:
# Retrieves the dataset from https://www.kaggle.com/jannalipenkova/covid19-public-media-dataset
# via the `kaggle` command-line tool. The tool skips the download when a more recently
# modified local copy of the zip already exists (add --force to download it again).
# NOTE(review): presumably requires the kaggle CLI to be installed and authenticated — confirm.
!kaggle datasets download -d jannalipenkova/covid19-public-media-dataset

covid19-public-media-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
from zipfile import ZipFile

# Open the downloaded archive read-only and unpack every member into ./datasets
with ZipFile('covid19-public-media-dataset.zip', mode='r') as archive:
    archive.extractall(path='datasets')

In [8]:
import pandas as pd
# Allow up to 150 characters per cell when frames are displayed
pd.options.display.max_colwidth = 150

# Load the article dump; the first CSV column is used as the row index
articles_csv = 'datasets/covid19_articles_20200504.csv'
data_df = pd.read_csv(articles_csv, index_col=0)

data_df

Unnamed: 0,title,date,domain,url,author,content,topic_area
0,My experience of surviving cancer twice,2020-01-03,medicalnewstoday,https://www.medicalnewstoday.com/articles/327373,Helen Ziatyk,"“Helen, I’m so sorry to tell you that you have stage 4 ovarian cancer.” I will never forget hearing those words. Cancer treatment was pretty gruel...",healthcare
1,Ginger: Health benefits and dietary tips,2020-01-03,medicalnewstoday,https://www.medicalnewstoday.com/articles/265990.php,Jenna Fletcher,"If you buy something through a link on this page, we may earn a small commission. How this works. People have used ginger in cooking and medicine ...",healthcare
2,China pneumonia outbreak may be caused by Sars-type virus: WHO | Science | The Guardian,2020-01-08,theguardian,https://www.theguardian.com/science/2020/jan/09/china-pneumonia-outbreak-may-be-caused-by-sars-type-virus-who,Reuters,A cluster of more than 50 pneumonia cases in the central Chinese city of Wuhan may be due to a newly emerging member of the family of viruses that...,general
3,New virus identified as likely cause of mystery illness in China,2020-01-08,nature,https://www.nature.com/articles/d41586-020-00020-9,nature,Passengers arriving at Hong Kong's international airport are being monitored for signs a mystery illness that emerged in central China. Credit: An...,science
4,China's Sars-like illness worries health experts | World news | The Guardian,2020-01-09,theguardian,https://www.theguardian.com/world/2020/jan/09/chinas-sars-like-illness-worries-health-experts,https://www.theguardian.com/profile/sarahboseley,"The finding that the outbreak of viral pneumonia in China that has struck 59 people may be caused by a coronavirus, the family of viruses behind S...",general
...,...,...,...,...,...,...,...
55532,'It was only advice' Dr Hilary rejects blanket elderly lockdown as he issues stark warning | TV & Radio | Showbiz & TV | Express.co.uk,2020-05-04,express,https://www.express.co.uk/showbiz/tv-radio/1277406/ITV-GMB-Dr-Hilary-Jones-coronavirus-over-70s-death-rate-COVID-19-latest,Aurora Bosotti,"Good Morning Britain's Dr Hilary Jones said the Government had only ever ""advised"" elderly Britons to self-isolate but conceded no blanket order ...",general
55533,Liverpool star Divock Origi provides update on his future amid Timo Werner transfer talk | Football | Sport | Express.co.uk,2020-05-04,express,https://www.express.co.uk/sport/football/1277408/Liverpool-Divock-Origi-Timo-Werner-transfer-news,Mikael McKenzie,Liverpool forward Divock Origi insists his future lies at Anfield despite talk that Jurgen Klopp is considering a swoop for RB Leipzig star Timo ...,general
55534,New York: US states join forces to buy vital medical equipment | World | News | Express.co.uk,2020-05-04,express,https://www.express.co.uk/news/world/1277388/new-york-us-states-ppe-medical-equipment-coronavirus-latest-donald-trump-news-cuomo,Melanie Kaidan,"The deal was agreed by Connecticut, Delaware, Massachusetts, New Jersey, New York, Pennsylvania and Rhode Island. The adjacent states have also a...",general
55535,Coronavirus US: Millions more make joblessness claims | City & Business | Finance | Express.co.uk,2020-05-04,express,https://www.express.co.uk/finance/city/1277374/jobs-unemployment-lockdown-claims-us-coronavirus,Dylan Donnelly,Businesses around the world have seen major losses and redundancies amid the pandemic’s mass shut down of public life. Figures released this week...,general


In [9]:
# Apply first round of text cleaning
import re
import string

def clean_text_round1(text):
    '''Normalise raw article text for bag-of-words processing.

    Steps, applied in this order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (shortest match, on one line);
      3. strip every character listed in ``string.punctuation``;
      4. remove every token that contains at least one digit.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave consecutive spaces behind.
    '''
    text = text.lower()
    # Raw strings: \[, \w and \d are regex escapes, not Python string escapes
    # (non-raw they are invalid escape sequences and warn on modern Python).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [10]:
# Run the first cleaning pass over every article body and keep it as a one-column frame
data_clean = data_df['content'].apply(round1).to_frame()
data_clean

Unnamed: 0,content
0,“helen i’m so sorry to tell you that you have stage ovarian cancer” i will never forget hearing those words cancer treatment was pretty grueling ...
1,if you buy something through a link on this page we may earn a small commission how this works people have used ginger in cooking and medicine sin...
2,a cluster of more than pneumonia cases in the central chinese city of wuhan may be due to a newly emerging member of the family of viruses that c...
3,passengers arriving at hong kongs international airport are being monitored for signs a mystery illness that emerged in central china credit andy ...
4,the finding that the outbreak of viral pneumonia in china that has struck people may be caused by a coronavirus the family of viruses behind sars...
...,...
55532,good morning britains dr hilary jones said the government had only ever advised elderly britons to selfisolate but conceded no blanket order was ...
55533,liverpool forward divock origi insists his future lies at anfield despite talk that jurgen klopp is considering a swoop for rb leipzig star timo ...
55534,the deal was agreed by connecticut delaware massachusetts new jersey new york pennsylvania and rhode island the adjacent states have also agreed ...
55535,businesses around the world have seen major losses and redundancies amid the pandemic’s mass shut down of public life figures released this week ...


In [11]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Remove typographic punctuation and line breaks left over after round 1.

    Curly single/double quotes and the ellipsis character are deleted.
    Newlines are replaced by a single space rather than deleted, so the last
    word of one line is not fused with the first word of the next line.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # A newline separates words; substituting '' would glue them together.
    text = re.sub('\n', ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [12]:
# Second cleaning pass, applied on top of the round-1 text
round2_content = data_clean['content'].apply(round2)
data_clean = pd.DataFrame(round2_content)
data_clean

Unnamed: 0,content
0,helen im so sorry to tell you that you have stage ovarian cancer i will never forget hearing those words cancer treatment was pretty grueling in ...
1,if you buy something through a link on this page we may earn a small commission how this works people have used ginger in cooking and medicine sin...
2,a cluster of more than pneumonia cases in the central chinese city of wuhan may be due to a newly emerging member of the family of viruses that c...
3,passengers arriving at hong kongs international airport are being monitored for signs a mystery illness that emerged in central china credit andy ...
4,the finding that the outbreak of viral pneumonia in china that has struck people may be caused by a coronavirus the family of viruses behind sars...
...,...
55532,good morning britains dr hilary jones said the government had only ever advised elderly britons to selfisolate but conceded no blanket order was ...
55533,liverpool forward divock origi insists his future lies at anfield despite talk that jurgen klopp is considering a swoop for rb leipzig star timo ...
55534,the deal was agreed by connecticut delaware massachusetts new jersey new york pennsylvania and rhode island the adjacent states have also agreed ...
55535,businesses around the world have seen major losses and redundancies amid the pandemics mass shut down of public life figures released this week s...


In [17]:
import nltk

_ENGLISH_WORDS = None  # lazily-built cache of the NLTK 'words' corpus


def _english_words():
    '''Return the NLTK 'words' corpus as a set, downloading it on first use if missing.'''
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        try:
            _ENGLISH_WORDS = set(nltk.corpus.words.words())
        except LookupError:
            # Corpus not installed locally: fetch it once, then load it.
            nltk.download('words')
            _ENGLISH_WORDS = set(nltk.corpus.words.words())
    return _ENGLISH_WORDS


def clean_text_round3(text):
    '''Drop alphabetic tokens that are not in the NLTK English word list.

    The text is split with ``nltk.wordpunct_tokenize``. A token is kept when its
    lowercase form is in the word list, or when it is not purely alphabetic.
    Kept tokens are joined with single spaces.

    The word list is built once and cached; rebuilding it for every article
    would dominate the runtime when this is applied to a whole column.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The filtered text.
    '''
    vocabulary = _english_words()
    return " ".join(w for w in nltk.wordpunct_tokenize(text)
                    if w.lower() in vocabulary or not w.isalpha())

round3 = lambda x: clean_text_round3(x)

In [19]:
# Round 3 (dictionary-word filtering) is currently disabled, so the frame saved
# below still holds the round-2 text despite the "_r3" suffix in the file name.
#data_clean = pd.DataFrame(data_clean.content.apply(round3))
data_clean.to_pickle('data_clean_r3.pkl')

In [22]:
# Reload the cleaned articles from the pickle file written by this notebook.
# Only unpickle files you created yourself: loading a pickle can execute arbitrary code.
data_clean = pd.read_pickle('data_clean_r3.pkl')

In [20]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per article, one column per vocabulary term,
# with scikit-learn's built-in English stop words excluded.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# NOTE: toarray() materialises the full dense (articles x vocabulary) matrix in memory.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)
data_dtm

Unnamed: 0,aa,aal,aam,aba,aback,abacus,abalone,abandon,abandoned,abandonment,...,zowie,zucchini,zwanziger,zygomatic,³nmart,½pp,î²carotene,î²thalassemia,î¼g,⅘currently
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55532,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55533,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55534,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55535,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# For every term (column) of the document-term matrix, collect the 30 articles with the
# highest count of that term, as (article index, count) pairs.
# NOTE(review): the original heading read "top 30 words mentioned in each article"; that
# would require iterating over rows (or transposing data_dtm first) — confirm the intent.
top_dict = {}
for term in data_dtm.columns:
    ranked = data_dtm[term].sort_values(ascending=False).iloc[:30]
    top_dict[term] = list(zip(ranked.index, ranked.values))

top_dict

{'aa': [(46841, 7),
  (46643, 7),
  (24645, 6),
  (52432, 6),
  (41813, 6),
  (43313, 5),
  (28051, 5),
  (45331, 5),
  (42225, 4),
  (20091, 4),
  (21777, 3),
  (3523, 3),
  (3928, 3),
  (13721, 3),
  (27663, 3),
  (21771, 3),
  (23774, 3),
  (51064, 2),
  (23770, 2),
  (48939, 2),
  (55000, 2),
  (8507, 2),
  (49848, 2),
  (35495, 2),
  (42160, 2),
  (9216, 2),
  (40605, 2),
  (28456, 2),
  (28224, 2),
  (37340, 1)],
 'aal': [(12671, 2),
  (9182, 2),
  (6132, 1),
  (14278, 1),
  (477, 1),
  (36828, 1),
  (9106, 1),
  (54564, 1),
  (211, 1),
  (31637, 1),
  (54604, 1),
  (9660, 1),
  (1302, 1),
  (37591, 1),
  (18516, 0),
  (18517, 0),
  (18515, 0),
  (18514, 0),
  (18518, 0),
  (18513, 0),
  (18512, 0),
  (18509, 0),
  (18511, 0),
  (18510, 0),
  (18520, 0),
  (18508, 0),
  (18507, 0),
  (18506, 0),
  (18505, 0),
  (18504, 0)],
 'aam': [(37206, 1),
  (25708, 1),
  (35044, 1),
  (55536, 0),
  (18508, 0),
  (18519, 0),
  (18518, 0),
  (18517, 0),
  (18516, 0),
  (18515, 0),
  (18514, 0