Let's start by importing the required libraries

In [1]:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd

Let's set the URL we'll scrape and fetch the page. We also define the search terms to match, as the three `cov_links` predicates.

In [2]:
url = "https://www.health.go.ke"
# A timeout keeps the notebook from hanging indefinitely if the site stalls.
r = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response rather than scraping an error page.
r.raise_for_status()
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so the parsed tree can differ
# between machines. "html.parser" ships with Python.
soup = BeautifulSoup(r.content, "html.parser")


def _anchor_text_contains(term):
    """Build a predicate that matches links whose text mentions ``term``.

    The returned callable takes one parsed element and is true only when the
    element is an ``<a>`` tag, has an ``href`` attribute, and its lower-cased
    text contains ``term``. Because the text is lower-cased before the
    comparison, ``term`` must be given in lower case.
    """
    def matches(tag):
        return (getattr(tag, 'name', None) == 'a' and
                'href' in tag.attrs and
                term in tag.get_text().lower())
    return matches


# One predicate per spelling of the topic that we search link text for.
cov_links = _anchor_text_contains('covid-19')
cov_links1 = _anchor_text_contains('covid 19')
cov_links2 = _anchor_text_contains('coronavirus')

We use Beautiful Soup to find all the links matching each search term and combine the matches into a single list.

In [3]:
# Run each predicate over the page in turn and collect the hits in predicate
# order. A link matched by more than one predicate is kept once per match.
results = []
for matcher in (cov_links, cov_links1, cov_links2):
    results.extend(soup.find_all(matcher))

results

[<a aria-current="page" href="https://www.health.go.ke/"><span>COVID-19</span></a>,
 <a class="teaser-title" href="https://www.health.go.ke/cs-ict-launches-covid-19-call-centre-for-health-care-workers/" title="CS ICT launches Covid-19 call centre for health care workers">CS ICT launches Covid-19 call centre for health care workers</a>,
 <a class="teaser-title" href="https://www.health.go.ke/nairobi-leads-in-covid-19-cases/" title="Nairobi leads in Covid-19 cases">Nairobi leads in Covid-19 cases</a>,
 <a class="teaser-title" href="https://www.health.go.ke/nairobi-and-mombasa-records-high-cases-of-covid-19-nairobi-wednesday-june-24-2020/" title="Nairobi and Mombasa records high cases of Covid-19     Nairobi , Wednesday June 24, 2020">Nairobi and Mombasa records high cases of Covid-19     Nairobi , Wednesday June 24, 2020</a>,
 <a class="link_image" href="https://www.health.go.ke/cs-ict-launches-covid-19-call-centre-for-health-care-workers/" title="Permalink to CS ICT launches Covid-19 ca

We extract the title text from each tag and store it, to be organized into a table later.

In [4]:
# Visible text of every matched link, with surrounding whitespace removed.
title = [anchor.text.strip() for anchor in results]
title

['COVID-19',
 'CS ICT launches Covid-19 call centre for health care workers',
 'Nairobi leads in Covid-19 cases',
 'Nairobi and Mombasa records high cases of Covid-19     Nairobi , Wednesday June 24, 2020',
 'CS ICT launches Covid-19 call centre for health care workers',
 'Nairobi leads in Covid-19 cases',
 'COVID-19 Protocols and Guidelines',
 'COVID-19 Videos',
 'COVID-19 Campaign Posters',
 'Covid-19 Situation Reports (SITREP)',
 'COVID-19 Protocols and Guidelines',
 'A Comprehensive Guide on Mental Health & Psychosocial Support during COVID-19 Pandemic',
 'Circular on Treatment of TB Patients during COVID-19 pandemic',
 'COVID-19 SOP for Counsellors and Psychologists.',
 'COVID-19 Guidance on Comprehensive HIV Service Delivery.',
 'COVID-19 Q&A',
 'COVID-19 Quarantine Protocols',
 'Circular on NCDs Clinics During COVID-19 Outbreak',
 'Case Definition for COVID-19 Circular, March 13, 2020',
 'COVID-19 Community Health Response Standard',
 'Case Definition for COVID-19, March 25, 202

We extract the URL from each tag and store it, to be organized into a table later.

In [5]:
# Resolve each link's href against the site URL so that relative and
# fragment-only hrefs become absolute URLs.
result_l = list(map(lambda anchor: urljoin(url, anchor['href']), results))
result_l

['https://www.health.go.ke/',
 'https://www.health.go.ke/cs-ict-launches-covid-19-call-centre-for-health-care-workers/',
 'https://www.health.go.ke/nairobi-leads-in-covid-19-cases/',
 'https://www.health.go.ke/nairobi-and-mombasa-records-high-cases-of-covid-19-nairobi-wednesday-june-24-2020/',
 'https://www.health.go.ke/cs-ict-launches-covid-19-call-centre-for-health-care-workers/',
 'https://www.health.go.ke/nairobi-leads-in-covid-19-cases/',
 'https://www.health.go.ke#1585137302557-b337f64d-c55873d1-981a',
 'https://www.health.go.ke#1589271239375-617f7f77-d8ee',
 'https://www.health.go.ke#1589273898229-e262c479-13b6',
 'https://www.health.go.ke#1591180376422-52af4c1e-256b',
 'https://www.health.go.ke#1585137302557-b337f64d-c55873d1-981a',
 'https://www.health.go.ke/wp-content/uploads/2020/05/GUIDE-ON-MENTAL-HEALTH-AND-PSYCHOSOCIAL-SUPPORT-DURING-THE-COVID-19-PANDEMIC-compressed.pdf',
 'https://www.health.go.ke/wp-content/uploads/2020/06/Adjustment-of-Scheduled-clinical-visits-for-TB-

We organize them into a table using pandas.

In [6]:
# Column name -> column values. Both lists were built from `results`, so
# they have one entry per matched link, in the same order.
ds = dict(Title=title, Link=result_l)
moh_l = pd.DataFrame(data=ds)
moh_l

Unnamed: 0,Title,Link
0,COVID-19,https://www.health.go.ke/
1,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...
2,Nairobi leads in Covid-19 cases,https://www.health.go.ke/nairobi-leads-in-covi...
3,Nairobi and Mombasa records high cases of Covi...,https://www.health.go.ke/nairobi-and-mombasa-r...
4,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...
...,...,...
71,COVID-19 Outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...
72,COVID-19 outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...
73,COVID-19 Outbreak in Kenya daily report-19th- ...,https://www.health.go.ke/wp-content/uploads/20...
74,COVID-19 Outbreak in Kenya Daily report- 1ST M...,https://www.health.go.ke/wp-content/uploads/20...


We send a request to each URL and record the content type it reports, to be added to the table.

In [7]:
content = []
# A single Session reuses the connection across the many requests that go
# to the same host.
with requests.Session() as session:
    for link in moh_l['Link']:
        try:
            # HEAD requests do not follow redirects unless asked to, so
            # without allow_redirects=True we could record the content type
            # of a redirect response instead of the real page. The timeout
            # keeps one stalled link from hanging the whole scan.
            h = session.head(link, allow_redirects=True, timeout=30)
        except requests.RequestException:
            # Record a placeholder so the list keeps one entry per table row
            # even when a link is unreachable.
            content.append(None)
            continue
        content.append(h.headers.get("content-type"))
content

['text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 'text/html; charset=

In [8]:
# Attach the scanned content types as a new column; `content` holds one
# entry per row of the table, in row order.
moh_l = moh_l.assign(Content=content)
moh_l

Unnamed: 0,Title,Link,Content
0,COVID-19,https://www.health.go.ke/,text/html; charset=UTF-8
1,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...,text/html; charset=UTF-8
2,Nairobi leads in Covid-19 cases,https://www.health.go.ke/nairobi-leads-in-covi...,text/html; charset=UTF-8
3,Nairobi and Mombasa records high cases of Covi...,https://www.health.go.ke/nairobi-and-mombasa-r...,text/html; charset=UTF-8
4,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...,text/html; charset=UTF-8
...,...,...,...
71,COVID-19 Outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...,application/pdf
72,COVID-19 outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...,application/pdf
73,COVID-19 Outbreak in Kenya daily report-19th- ...,https://www.health.go.ke/wp-content/uploads/20...,application/pdf
74,COVID-19 Outbreak in Kenya Daily report- 1ST M...,https://www.health.go.ke/wp-content/uploads/20...,application/pdf


Let's now improve the readability of the Content column by stripping the undesired substrings.

In [9]:
# Pass `regex` explicitly on both calls. The first pattern relies on the `$`
# anchor, and pandas 2.x treats the pattern as a literal string by default,
# so without regex=True the charset suffix would never be removed. The
# second replacement is a plain substring, so it is marked as literal.
moh_l['Content'] = (
    moh_l['Content']
    .str.replace(r"; charset=UTF-8$", "", regex=True)
    .str.replace("application/", "", regex=False)
)

In [10]:
# Display the table to check the rewritten Content column.
moh_l

Unnamed: 0,Title,Link,Content
0,COVID-19,https://www.health.go.ke/,text/html
1,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...,text/html
2,Nairobi leads in Covid-19 cases,https://www.health.go.ke/nairobi-leads-in-covi...,text/html
3,Nairobi and Mombasa records high cases of Covi...,https://www.health.go.ke/nairobi-and-mombasa-r...,text/html
4,CS ICT launches Covid-19 call centre for healt...,https://www.health.go.ke/cs-ict-launches-covid...,text/html
...,...,...,...
71,COVID-19 Outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...,pdf
72,COVID-19 outbreak in Kenya daily report SITREP...,https://www.health.go.ke/wp-content/uploads/20...,pdf
73,COVID-19 Outbreak in Kenya daily report-19th- ...,https://www.health.go.ke/wp-content/uploads/20...,pdf
74,COVID-19 Outbreak in Kenya Daily report- 1ST M...,https://www.health.go.ke/wp-content/uploads/20...,pdf


Now let's save to csv

In [11]:
# Write the table to a CSV file; index=False leaves the row index out.
moh_l.to_csv(path_or_buf='moh covid 19 links.csv', index=False)