This script was last run on August 18, 2021. Website structures likely do not look the same today, so the code will not produce the same results if re-run. It scrapes the CDC SARS-CoV-2 variants tracking website (https://www.cdc.gov/coronavirus/2019-ncov/variants/variant-info.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fcases-updates%2Fvariant-surveillance%2Fvariant-info.html) and exports the variant names and their VOI/VOC/VUM status as a .csv to a specified file. The scraping portion is loosely based on this tutorial, https://www.pluralsight.com/guides/extracting-data-html-beautifulsoup/, with additional data wrangling written by myself.

In [7]:
#install packages to open and scrape websites - don't need to do each time
#!pip install requests 
#!pip install beautifulsoup4 
#!pip install pandas
#!pip install numpy
#!pip install lxml
#!pip install datetime  (not needed: datetime is part of Python's standard library)

In [1]:
#call packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml
import datetime

In [3]:
# URL of the CDC SARS-CoV-2 variants page to scrape.
url = "https://www.cdc.gov/coronavirus/2019-ncov/variants/variant-info.html?CDC_AA_refVal=https%3A%2F%2Fwww.cdc.gov%2Fcoronavirus%2F2019-ncov%2Fcases-updates%2Fvariant-surveillance%2Fvariant-info.html"

# Retrieve the page with a GET request.
# - timeout: without one, requests waits indefinitely on an unresponsive server.
# - raise_for_status(): stop on an HTTP error (4xx/5xx) instead of parsing an
#   error page as if it were the variants page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text
#print(html_content)

# Parse the HTML content with the lxml parser.
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify())  # print the parsed HTML

I want to extract the tables for just the VOIs and VOCs. Below are the HTML lines corresponding to these tables (delete the space after each "<" to get the real tags). Both tables are inside: < div class="col-md-12 splash-col">. VOI heading: < h3> Selected Characteristics of SARS-CoV-2 Variants of Interest< /h3>. VOI table (right under the heading): < div class="step-table m-3" role="table">. VOC heading: < h3> Selected Characteristics of SARS-CoV-2 Variants of Concern< /h3>. VOC table: < div class="step-table m-3" role="table">.

In [4]:
# Collect every <div> whose class attribute is "col-md-12 splash-col".
splash_col_attrs = {"class": "col-md-12 splash-col"}
table_obj = soup.find_all("div", attrs=splash_col_attrs)

# The match at index 2 is the section of interest: it contains both the
# VOI and the VOC tables.
tables = table_obj[2]
# Display the selected element for inspection.
tables

In [9]:
# VOI table: the first "rowgroup" <div> inside the selected section.
int_html = tables.find("div", attrs={"role": "rowgroup"})

# Whitespace-trimmed text of every variant-name <strong> element.
# The page now also puts WHO labels in these elements, so the list
# holds WHO-label entries alongside the Pango lineage entries.
voi_name_tags = int_html.find_all("strong", attrs={"class": "pt-3 variant-name"})
int_names1 = [tag.get_text().strip() for tag in voi_name_tags]

# Keep only the entries that mention "Pango"; this drops the WHO label lines.
int_names2 = [name for name in int_names1 if "Pango" in name]
int_names2

['Pango Lineage: B.1.525',
 'Pango Lineage: B.1.526',
 'Pango Lineage: B.1.617.1',
 'Pango Lineage: B.1.617.3']

In [10]:
# Split each "Pango Lineage: <name>" entry at the colon(s).
split_voi = [entry.split(":", -1) for entry in int_names2]

# The lineage is the piece right after the first colon, minus padding.
voi_new = [pieces[1].strip() for pieces in split_voi]
voi_new

['B.1.525', 'B.1.526', 'B.1.617.1', 'B.1.617.3']

In [11]:
# One "VOI" status label per lineage.
voi_col = ["VOI" for _ in voi_new]

# Build a two-row frame (lineages, labels) and flip it so each lineage
# becomes a row: column 0 = lineage, column 1 = status.
voi_df = pd.DataFrame([voi_new, voi_col]).T
voi_df

Unnamed: 0,0,1
0,B.1.525,VOI
1,B.1.526,VOI
2,B.1.617.1,VOI
3,B.1.617.3,VOI


In [12]:
# VOC table: the next "rowgroup" <div> following the VOI one.
con_html = int_html.find_next("div", attrs={"role": "rowgroup"})

# Whitespace-trimmed text of every variant-name <strong> element; the class
# filter accepts either of the two class values listed below.
voc_name_classes = ["pt-3 variant-name", "variant-name"]
voc_name_tags = con_html.find_all("strong", attrs={"class": voc_name_classes})
con_names1 = [tag.get_text().strip() for tag in voc_name_tags]
con_names1
# All variants are captured, but the entries still need further cleanup.


['Pango Lineage: B.1.1.7',
 'WHO Label: Beta',
 'Pango Lineage(s): B.1.351, B.1.351.2, B.1.351.3',
 'Pango Lineage: B.1.617.2, AY.1, AY.2, AY.3, AY.4, AY.5, AY.6, AY.7, AY.8, AY.9, AY.10, AY.11, AY.12',
 'WHO Label: Gamma',
 'Pango Lineage(s): P.1, P.1.1, P.1.2']

In [13]:
# Keep only the VOC entries that mention "Pango"; this drops the WHO label lines.
con_names2 = [name for name in con_names1 if "Pango" in name]
con_names2

['Pango Lineage: B.1.1.7',
 'Pango Lineage(s): B.1.351, B.1.351.2, B.1.351.3',
 'Pango Lineage: B.1.617.2, AY.1, AY.2, AY.3, AY.4, AY.5, AY.6, AY.7, AY.8, AY.9, AY.10, AY.11, AY.12',
 'Pango Lineage(s): P.1, P.1.1, P.1.2']

In [14]:
# Split each "Pango Lineage(s): ..." entry at the colon(s) to separate the
# descriptive prefix from the lineage text.
split_voc1 = [entry.split(":", -1) for entry in con_names2]

# Keep the piece right after the first colon (the lineage text), minus padding.
split_voc2 = [pieces[1].strip() for pieces in split_voc1]
split_voc2

['B.1.1.7',
 'B.1.351, B.1.351.2, B.1.351.3',
 'B.1.617.2, AY.1, AY.2, AY.3, AY.4, AY.5, AY.6, AY.7, AY.8, AY.9, AY.10, AY.11, AY.12',
 'P.1, P.1.1, P.1.2']

In [15]:
# Break each comma-separated lineage string apart on single spaces so that
# every lineage can end up in its own row. The tokens keep their trailing
# commas at this stage, which gives one list of tokens per original entry.
split_voc3 = [lineage_text.split(" ", -1) for lineage_text in split_voc2]
split_voc3

[['B.1.1.7'],
 ['B.1.351,', 'B.1.351.2,', 'B.1.351.3'],
 ['B.1.617.2,',
  'AY.1,',
  'AY.2,',
  'AY.3,',
  'AY.4,',
  'AY.5,',
  'AY.6,',
  'AY.7,',
  'AY.8,',
  'AY.9,',
  'AY.10,',
  'AY.11,',
  'AY.12'],
 ['P.1,', 'P.1.1,', 'P.1.2']]

In [16]:
#flatten list
def unlist(e):
    """Recursively flatten arbitrarily nested lists.

    Args:
        e: A list (which may contain further lists at any depth) or a
            single non-list value.

    Yields:
        Every non-list item found in ``e``, depth-first and left to right.
        If ``e`` itself is not a list, it is yielded unchanged.
    """
    # isinstance (rather than an exact type comparison) also flattens
    # instances of list subclasses instead of yielding them whole.
    if isinstance(e, list):
        for item in e:
            yield from unlist(item)
    else:
        yield e
# Flatten the nested token lists into a single list, trimming whitespace
# from each token.
voc_list = [token.strip() for token in unlist(split_voc3)]
voc_list

['B.1.1.7',
 'B.1.351,',
 'B.1.351.2,',
 'B.1.351.3',
 'B.1.617.2,',
 'AY.1,',
 'AY.2,',
 'AY.3,',
 'AY.4,',
 'AY.5,',
 'AY.6,',
 'AY.7,',
 'AY.8,',
 'AY.9,',
 'AY.10,',
 'AY.11,',
 'AY.12',
 'P.1,',
 'P.1.1,',
 'P.1.2']

In [17]:
# Remove the leftover commas: each comma becomes a space, and the
# surrounding whitespace is then trimmed away.
voc_new = [token.replace(",", " ").strip() for token in voc_list]
voc_new

['B.1.1.7',
 'B.1.351',
 'B.1.351.2',
 'B.1.351.3',
 'B.1.617.2',
 'AY.1',
 'AY.2',
 'AY.3',
 'AY.4',
 'AY.5',
 'AY.6',
 'AY.7',
 'AY.8',
 'AY.9',
 'AY.10',
 'AY.11',
 'AY.12',
 'P.1',
 'P.1.1',
 'P.1.2']

In [18]:
# One "VOC" status label per lineage.
voc_col = ["VOC" for _ in voc_new]

# Build a two-row frame (lineages, labels) and flip it so each lineage
# becomes a row: column 0 = lineage, column 1 = status.
voc_df = pd.DataFrame([voc_new, voc_col]).T
voc_df

Unnamed: 0,0,1
0,B.1.1.7,VOC
1,B.1.351,VOC
2,B.1.351.2,VOC
3,B.1.351.3,VOC
4,B.1.617.2,VOC
5,AY.1,VOC
6,AY.2,VOC
7,AY.3,VOC
8,AY.4,VOC
9,AY.5,VOC


In [19]:
# Stack the VOI rows on top of the VOC rows to create one dataframe for export.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same combined frame with a
# fresh 0..n-1 row index.
all_df = pd.concat([voi_df, voc_df], ignore_index=True)
all_df

Unnamed: 0,0,1
0,B.1.525,VOI
1,B.1.526,VOI
2,B.1.617.1,VOI
3,B.1.617.3,VOI
4,B.1.1.7,VOC
5,B.1.351,VOC
6,B.1.351.2,VOC
7,B.1.351.3,VOC
8,B.1.617.2,VOC
9,AY.1,VOC


Date cell 1 was actually used for the project to record which day data was pulled. For purposes of this record, date was manually set to the date data was last pulled in Date cell 2, while Date cell 1 was commented out.

In [20]:
#Date cell 1

#get the date of data collection
#todayIs=datetime.date.today()
#dataDate = todayIs.strftime("%m%d%y")
#dataDate

'081821'

In [5]:
#Date cell 2
# Data-collection date, fixed by hand to the day the data was last pulled.
dataDate = "081821"
dataDate

'081821'

In [6]:
# Build the output file name: path prefix + data-collection date + extension.
# Change <your_filepath> to the actual desired file path.
path_prefix, extension = "<your_filepath>", ".csv"
file_list = [path_prefix, dataDate, extension]
filename = "".join(file_list)
filename

'<your_filepath>081821.csv'

In [22]:
# Save the combined VOI/VOC dataframe as a CSV at the path built above.
all_df.to_csv(path_or_buf=filename)