This script was last run on August 16, 2021-- website structures likely do not look the same today and code will not output the same results if re-run. 

This script scrapes the WHO SARS-CoV-2 variants tracking website (https://www.who.int/en/activities/tracking-SARS-CoV-2-variants/) to collect list of variants of interest and concern and exports their names and VOI/VOC/VUM status as a .csv to a specified file.

In [1]:
#call packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime

In [2]:
#set url for WHO variants website
url="https://www.who.int/en/activities/tracking-SARS-CoV-2-variants/"

#retrieve HTML data with a get request
html_content = requests.get(url)
#print(html_content)

#use pandas read_html to convert tables to dataframes and replace breaks with ,
tables = pd.read_html(html_content.text.replace('<br />',' '))

In [3]:
#separate out variants of concern
#get dataframe
voc_df=tables[0]
voc_df

Unnamed: 0,WHO label,Pango lineage*,GISAID clade,Nextstrain clade,Additional amino acid changes monitored°,Earliest documented samples,Date of designation
0,Alpha,B.1.1.7 #,GRY,20I (V1),+S:484K +S:452R,"United Kingdom, Sep-2020",18-Dec-2020
1,Beta,B.1.351,GH/501Y.V2,20H (V2),+S:L18F,"South Africa, May-2020",18-Dec-2020
2,Gamma,P.1,GR/501Y.V3,20J (V3),+S:681H,"Brazil, Nov-2020",11-Jan-2021
3,Delta,B.1.617.2§,G/478K.V1,21A,+S:417N,"India, Oct-2020",VOI: 4-Apr-2021 VOC: 11-May-2021


In [7]:
#select lineages from voc df
voc_list=voc_df['Pango lineage*'].tolist()
voc_list

['B.1.1.7 #', 'B.1.351', 'P.1', 'B.1.617.2§']

In [18]:
#drop  extra symbols

nosymb=[]
for i in range(len(voc_list)):
    rmed=str(voc_list[i]).replace("#","")
    nosymb.append(rmed)

nosymb2=[]
for i in range(len(voc_list)):
    rmed=str(nosymb[i]).replace("§","")
    nosymb2.append(rmed)
    

#rename for next step
new_voc=nosymb2
new_voc

['B.1.1.7 ', 'B.1.351', 'P.1', 'B.1.617.2']

In [19]:
#make new VOC df with indicator column
voc_col=list(["VOC"]*len(new_voc))

new_voc_df=pd.DataFrame([new_voc,voc_col]).transpose()
new_voc_df

Unnamed: 0,0,1
0,B.1.1.7,VOC
1,B.1.351,VOC
2,P.1,VOC
3,B.1.617.2,VOC


In [21]:
#separate variants of interest
voi_df=tables[1]
voi_df

Unnamed: 0,WHO label,Pango lineage*,GISAID clade,Nextstrain clade,Earliest documented samples,Date of designation
0,Eta,B.1.525,G/484K.V3,21D,"Multiple countries, Dec-2020",17-Mar-2021
1,Iota,B.1.526,GH/253G.V1,21F,"United States of America, Nov-2020",24-Mar-2021
2,Kappa,B.1.617.1,G/452R.V3,21B,"India, Oct-2020",4-Apr-2021
3,Lambda,C.37,GR/452Q.V1,21G,"Peru, Dec-2020",14-Jun-2021
4,Mu,B.1.621,GH,21H,"Colombia, Jan-2021",30-Aug-2021


In [24]:
#select lineages from voi df
voi_list=voi_df['Pango lineage*'].tolist()

#doesn't require any cleaning! just rename for next piece
new_voi=voi_list
new_voi

['B.1.525', 'B.1.526', 'B.1.617.1', 'C.37', 'B.1.621']

In [25]:
#make VOI data frame with column VOI
voi_col=list(["VOI"]*len(new_voi))

new_voi_df=pd.DataFrame([new_voi,voi_col]).transpose()
new_voi_df

Unnamed: 0,0,1
0,B.1.525,VOI
1,B.1.526,VOI
2,B.1.617.1,VOI
3,C.37,VOI
4,B.1.621,VOI


In [26]:
#separate variants in alerts for further monitoring
vfm_df=tables[2]
vfm_df

Unnamed: 0,Pango lineage*,GISAID clade,Nextstrain clade,Earliest documented samples,Date of designation
0,B.1.427 B.1.429,GH/452R.V1,21C,"United States of America, Mar-2020",VOI: 5-Mar-2021 Alert: 6-Jul-2021
1,R.1,GR,-,"Multiple countries,Jan-2021",07-Apr-2021
2,B.1.466.2,GH,-,"Indonesia, Nov-2020",28-Apr-2021
3,B.1.1.318,GR,-,"Multiple countries, Jan-2021",02-Jun-2021
4,B.1.1.519,GR,20B/S.732A,"Multiple countries, Nov-2020",02-Jun-2021
5,C.36.3,GR,-,"Multiple countries, Jan-2021",16-Jun-2021
6,B.1.214.2,G,-,"Multiple countries, Nov-2020",30-Jun-2021
7,B.1.1.523,GR,-,"Multiple countries, May-2020",14-July-2021
8,B.1.619,G,20A/S.126A,"Multiple countries, May-2020",14-July-2021
9,B.1.620,G,-,"Multiple countries, November 2020",14-July-2021


In [27]:
#select lineages from voi df
vfm_list=vfm_df['Pango lineage*'].tolist()

vfm_list

['B.1.427 B.1.429',
 'R.1',
 'B.1.466.2',
 'B.1.1.318',
 'B.1.1.519',
 'C.36.3',
 'B.1.214.2',
 'B.1.1.523',
 'B.1.619',
 'B.1.620',
 'C.1.2']

In [30]:
#split B.1.427 B.1.429
vfm_list[0]

'B.1.427 B.1.429'

In [70]:
#replace original entry with new one
split_vfm[6]=C3C1
split_vfm

[['B.1.427', 'B.1.429'],
 ['R.1'],
 ['B.1.466.2'],
 ['B.1.621', 'B.1.621.1'],
 ['B.1.1.318'],
 ['B.1.1.519'],
 ['C.36.3', 'C.36.3.1'],
 ['B.1.214.2'],
 ['B.1.1.523'],
 ['B.1.619', 'B.1.619.1'],
 ['B.1.620']]

In [72]:
#flatten list
def unlist(e):
    if type(e) == list:
        for v2 in e:
            for v3 in unlist(v2):
                yield v3
    else:
        yield e
new_vfm=[]
for v in unlist(split_vfm):
    new_vfm.append(v.strip())
new_vfm

['B.1.427',
 'B.1.429',
 'R.1',
 'B.1.466.2',
 'B.1.621',
 'B.1.621.1',
 'B.1.1.318',
 'B.1.1.519',
 'C.36.3',
 'C.36.3.1',
 'B.1.214.2',
 'B.1.1.523',
 'B.1.619',
 'B.1.619.1',
 'B.1.620']

In [73]:
#make VFM data frame with column VFM
vfm_col=list(["VFM"]*len(new_vfm))

new_vfm_df=pd.DataFrame([new_vfm,vfm_col]).transpose()
new_vfm_df

Unnamed: 0,0,1
0,B.1.427,VFM
1,B.1.429,VFM
2,R.1,VFM
3,B.1.466.2,VFM
4,B.1.621,VFM
5,B.1.621.1,VFM
6,B.1.1.318,VFM
7,B.1.1.519,VFM
8,C.36.3,VFM
9,C.36.3.1,VFM


In [74]:
#combine to create one dataframe for export
two_df=new_voc_df.append(new_voi_df, ignore_index=True)

all_df=two_df.append(new_vfm_df, ignore_index=True)
all_df

Unnamed: 0,0,1
0,B.1.1.7,VOC
1,B.1.351,VOC
2,B.1.351.2,VOC
3,B.1.351.3,VOC
4,P.1,VOC
5,P.1.1,VOC
6,P.1.2,VOC
7,P.1.4,VOC
8,P.1.6,VOC
9,P.1.7,VOC


Date cell 1 was actually used for the project to record which day data was pulled. For purposes of this record, date was manually set to the date data was last pulled in Date cell 2, while Date cell 1 was commented out.

In [77]:
#Date cell 1
#get the date of data collection
#todayIs=datetime.date.today()
#dataDate = todayIs.strftime("%m%d%y")
#dataDate

'081621'

In [None]:
#Date cell 2
#Date cell 2
dataDate=("081621")
dataDate

In [None]:
#add date of data collection to get the file name before saving
#change <your_filepath> to actual desired file path
file_list = ["<your_filepath>",dataDate,".csv"]
filename="".join(file_list)
filename

In [80]:
#write file for later
all_df.to_csv(filename)