# Main purpose
This Jupyter notebook shows you how to download the list of approved drugs by extracting the information from the cancer.gov website.

Import the required packages

In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
## 
# Page to scrape; switch the commented line to target a different cancer type.
url = "https://www.cancer.gov/about-cancer/treatment/drugs/leukemia#4"
# url = "https://www.cancer.gov/about-cancer/treatment/drugs/lung#4"
##

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of letting the cells below parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw HTML bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Sanity check: display the parsed page's <title> tag to confirm the expected page was fetched.
soup.title

<title>Drugs Approved for Leukemia - NCI</title>

In [4]:
## 
# Text searched for inside the page's <h2> headings.
# NOTE(review): the leading space in the alternative query presumably keeps
# "non-small cell lung cancer" headings from matching - confirm before removing it.
query = "acute myeloid leukemia"
# query = " small cell lung cancer"
##

# 1. Find all the subtypes based on the query
# The heading text is lower-cased before comparison, so the query is lower-cased
# as well; otherwise a query containing capitals could never match.
query_lower = query.lower()

# Maps each matching heading (lower-cased text) to the element that follows it.
found_info = {}
for each_subtype in soup.find_all(name="h2"):
    each_subtype_string = each_subtype.get_text().lower()
    if query_lower in each_subtype_string:
        found_info[each_subtype_string] = each_subtype.find_next_sibling()


In [5]:
# 2. Look at the matched subtypes
# The keys are the lower-cased <h2> heading texts that contained the query.
found_info.keys()

dict_keys(['drugs approved for acute myeloid leukemia (aml)', 'drug combinations used in acute myeloid leukemia (aml)'])

In [6]:
# 3. Extract the information and the drug list
# Keep only the headings for the "drugs approved" section (the query can also
# match other sections, e.g. drug combinations).
key_name = [each for each in found_info.keys() if 'drugs approved' in each]
if not key_name:
    # Fail with an explanatory message instead of a bare IndexError below.
    raise ValueError(
        f"No 'drugs approved' section found for query {query!r}; "
        f"matched headings: {list(found_info)}"
    )
print(key_name[0])

# Text of the element following the heading, one drug name per line.
approved_drugs = found_info[key_name[0]].get_text()

# Drop blank lines and surrounding whitespace rather than blindly discarding the
# last element: that would lose a real drug name whenever the text does not end
# with a newline, and would keep empty entries from leading/inner blank lines.
approved_drugs_list = [line.strip() for line in approved_drugs.split('\n') if line.strip()]
print(approved_drugs_list[0:5])

drugs approved for acute myeloid leukemia (aml)
['Arsenic Trioxide', 'Azacitidine', 'Cerubidine (Daunorubicin Hydrochloride)', 'Cyclophosphamide', 'Cytarabine']


In [7]:
# 4. Extract the URL for the drug information
# Collect one URL per <a> tag inside the approved-drugs section.
a_tags = found_info[key_name[0]].find_all(name="a")
root_url = "https://www.cancer.gov"
approved_drugs_info_url = []
for tag in a_tags:
    href = tag.get('href')
    if href is None:
        # Anchor without an href: keep a placeholder so there is still exactly
        # one entry per <a> tag, instead of crashing on None.
        approved_drugs_info_url.append(None)
    else:
        # urljoin resolves site-relative links against root_url and leaves
        # absolute URLs (on this or any other domain) unchanged, which plain
        # string concatenation gets wrong for anything not starting with "/".
        approved_drugs_info_url.append(urljoin(root_url, href))

In [8]:
# 5. Create a dataframe
# Names and URLs are paired by position, so they must have the same length.
# Fail loudly on a mismatch rather than silently padding the shorter list and
# pairing drug names with the wrong URLs.
if len(approved_drugs_list) != len(approved_drugs_info_url):
    raise ValueError(
        f"Found {len(approved_drugs_list)} drug names but "
        f"{len(approved_drugs_info_url)} URLs; cannot pair them by position."
    )

# Build the table column-wise and drop rows pointing at an already-seen URL
# (the first occurrence is kept). No in-place mutation, so the cell is safe to re-run.
approved_drugs_df = pd.DataFrame(
    {'drug_name': approved_drugs_list, 'resource_url': approved_drugs_info_url}
).drop_duplicates(subset="resource_url")
approved_drugs_df.head()

Unnamed: 0,drug_name,resource_url
0,Arsenic Trioxide,https://www.cancer.gov/about-cancer/treatment/...
1,Azacitidine,https://www.cancer.gov/about-cancer/treatment/...
2,Cerubidine (Daunorubicin Hydrochloride),https://www.cancer.gov/about-cancer/treatment/...
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...
4,Cytarabine,https://www.cancer.gov/about-cancer/treatment/...


In [9]:
# 6. Save the dataframe
# The output file is named after the query, with spaces replaced by underscores,
# and is written to the current working directory without the index column.
csv_name = query.replace(" ", "_") + "_approved_drugs.csv"
approved_drugs_df.to_csv(csv_name, index=False)