In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str, heading_tag: str = 'h3', timeout: float = 30):
    '''
    Fetches a page and returns the heading and paragraph text of each CDC text block.

    Only <div class="cdc-textblock"> elements that contain a `heading_tag`
    element are kept; blocks without one are skipped.

    Parameters:
        url: Address of the page to download.
        heading_tag: Name of the tag that holds each block's topic (default 'h3').
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of {'topic': str, 'content': str} dicts in page order, where
        'content' is the block's <p> texts joined with single spaces.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (raised by raise_for_status).
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = []
    for div in soup.find_all("div", attrs={"class": 'cdc-textblock'}):
        heading = div.find(heading_tag)
        if heading is None:
            # A text block without a heading has no topic to file it under.
            continue
        topic = heading.text.strip()
        text = ' '.join(p.text.strip() for p in div.find_all('p'))
        content.append({'topic': topic, 'content': text})
    return content


def extract_and_clean_content(urls, years):
    '''
    Scrapes every URL and returns the results as one DataFrame labelled by season.

    Parameters:
        urls: Iterable of page addresses to scrape.
        years: Iterable of season labels; years[i] is recorded for urls[i].

    Returns:
        A DataFrame with columns 'Year', 'Topic' and 'Content', one row per
        topic returned by scraping().

    Raises:
        ValueError: if urls and years differ in length. zip() would otherwise
            silently drop the unmatched entries.
    '''
    # Materialise both inputs so the lengths can be checked before any
    # network request is made.
    urls = list(urls)
    years = list(years)
    if len(urls) != len(years):
        raise ValueError(
            f"urls and years must have the same length "
            f"(got {len(urls)} URLs and {len(years)} years)"
        )

    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        for record in scraping(url):
            data['Year'].append(year)
            data['Topic'].append(record['topic'])
            data['Content'].append(record['content'])

    return pd.DataFrame(data)

# Season label -> FAQ page. Keeping each label next to its URL makes the
# pairing explicit; the two parallel lists are derived from this mapping.
season_pages = {
    '2018-2019': "https://www.cdc.gov/flu/about/season/flu-season-2018-2019.htm",
    '2019-2020': "https://www.cdc.gov/flu/season/faq-flu-season-2019-2020.htm",
    '2020-2021': "https://www.cdc.gov/flu/season/faq-flu-season-2020-2021.htm",
    '2021-2022': "https://www.cdc.gov/flu/season/faq-flu-season-2021-2022.htm",
    '2022-2023': "https://www.cdc.gov/flu/season/faq-flu-season-2022-2023.htm",
}
urls = list(season_pages.values())
years = list(season_pages.keys())

# One row per scraped topic, labelled with its season.
df = extract_and_clean_content(urls, years)
df



Unnamed: 0,Year,Topic,Content
0,2018-2019,What’s new this flu season?,A few things are new this season: For the 2018...
1,2018-2019,How does CDC estimate the burden of seasonal f...,The burden of influenza on the United States c...
2,2019-2020,What’s new this flu season?,A few things are new this season: For more inf...
3,2019-2020,When should I get vaccinated?,"Because the timing of the onset, peak and end ..."
4,2019-2020,Implications of Cell-Based Vaccines,For people who need only one dose for the seas...
5,2019-2020,Flu Activity,What sort of flu season is expected this year?...
6,2019-2020,Vaccine and Vaccination,How much flu vaccine will be available this se...
7,2019-2020,Flu Vaccine Effectiveness,How effective will flu vaccines be this season...
8,2019-2020,If You Get Sick,What happens in the body when someone has flu?...
9,2019-2020,Surveillance,How does CDC track flu activity? The Epidemiol...


In [19]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str, heading_tag: str = 'h2', timeout: float = 30):
    '''
    Fetches a page and returns the heading and paragraph text of each CDC text block.

    This variant looks for <h2> headings by default. Only
    <div class="cdc-textblock"> elements that contain a `heading_tag`
    element are kept; blocks without one are skipped.

    Parameters:
        url: Address of the page to download.
        heading_tag: Name of the tag that holds each block's topic (default 'h2').
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of {'topic': str, 'content': str} dicts in page order, where
        'content' is the block's <p> texts joined with single spaces.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (raised by raise_for_status).
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = []
    for div in soup.find_all("div", attrs={"class": 'cdc-textblock'}):
        heading = div.find(heading_tag)
        if heading is None:
            # A text block without a heading has no topic to file it under.
            continue
        topic = heading.text.strip()
        text = ' '.join(p.text.strip() for p in div.find_all('p'))
        content.append({'topic': topic, 'content': text})
    return content


def extract_and_clean_content(urls, years):
    '''
    Scrapes every URL and returns the results as one DataFrame labelled by season.

    Parameters:
        urls: Iterable of page addresses to scrape.
        years: Iterable of season labels; years[i] is recorded for urls[i].

    Returns:
        A DataFrame with columns 'Year', 'Topic' and 'Content', one row per
        topic returned by scraping().

    Raises:
        ValueError: if urls and years differ in length. zip() would otherwise
            silently drop the unmatched entries.
    '''
    # Materialise both inputs so the lengths can be checked before any
    # network request is made.
    urls = list(urls)
    years = list(years)
    if len(urls) != len(years):
        raise ValueError(
            f"urls and years must have the same length "
            f"(got {len(urls)} URLs and {len(years)} years)"
        )

    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        for record in scraping(url):
            data['Year'].append(year)
            data['Topic'].append(record['topic'])
            data['Content'].append(record['content'])

    return pd.DataFrame(data)

# 2023-2024 FAQ page, scraped with the <h2>-based scraping() defined in this cell.
years = ['2023-2024']
urls = [
    "https://www.cdc.gov/flu/season/faq-flu-season-2023-2024.htm",
]
df1 = extract_and_clean_content(urls, years)
df1



Unnamed: 0,Year,Topic,Content
0,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...
1,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ..."
2,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...
3,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...
4,2023-2024,"Flu, RSV, and COVID-19 Coinfection Data: 2023-...",One way CDC collects data on coinfections with...


In [20]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str, timeout: float = 30):
    '''
    Fetches a page and pairs each linked <h3> heading with the paragraphs of a
    following sibling wrapper <div>.

    For every <a> found inside an <h3>, the enclosing <h3> text is the topic.
    A record is emitted only when that <h3> has a later sibling <div> whose
    class is exactly the body-wrapper class string used below.

    NOTE(review): the recorded run of this cell produced an empty table, which
    suggests the sibling lookup does not match on the archived page; confirm
    before relying on this variant.

    Parameters:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of {'topic': str, 'content': str} dicts.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (raised by raise_for_status).
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Class string of the page's body wrapper, shared by both lookups below.
    wrapper_class = 'container d-flex flex-wrap body-wrapper bg-white'

    # Diagnostic dump of the wrapper's text. soup.find returns None when the
    # wrapper is absent, so guard it instead of failing with AttributeError.
    wrapper = soup.find("div", attrs={"class": wrapper_class})
    if wrapper is not None:
        print(wrapper.get_text(strip=True))

    content = []
    # CSS selector: <a> tags nested inside <h3> elements.
    for link in soup.select("h3 a"):
        h3_tag = link.find_parent('h3')
        if h3_tag is None:
            continue
        # The <h3> text includes the text of the <a> it contains.
        topic = h3_tag.get_text(strip=True)
        print(topic)

        next_div = h3_tag.find_next_sibling('div', attrs={"class": wrapper_class})
        if next_div is None:
            continue
        text = ' '.join(p.get_text(strip=True) for p in next_div.find_all('p'))
        content.append({'topic': topic, 'content': text})

    return content


def extract_and_clean_content(urls, years):
    '''
    Scrapes every URL and returns the results as one DataFrame labelled by season.

    Parameters:
        urls: Iterable of page addresses to scrape.
        years: Iterable of season labels; years[i] is recorded for urls[i].

    Returns:
        A DataFrame with columns 'Year', 'Topic' and 'Content', one row per
        topic returned by scraping().

    Raises:
        ValueError: if urls and years differ in length. zip() would otherwise
            silently drop the unmatched entries.
    '''
    # Materialise both inputs so the lengths can be checked before any
    # network request is made.
    urls = list(urls)
    years = list(years)
    if len(urls) != len(years):
        raise ValueError(
            f"urls and years must have the same length "
            f"(got {len(urls)} URLs and {len(years)} years)"
        )

    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        for record in scraping(url):
            data['Year'].append(year)
            data['Topic'].append(record['topic'])
            data['Content'].append(record['content'])

    return pd.DataFrame(data)

# Archived 2015-2016 season summary page, labelled manually with its season.
years = ['2015-2016']
urls = [
    "https://archive.cdc.gov/www_cdc_gov/flu/about/season/flu-season-2015-2016.htm",
]
df2 = extract_and_clean_content(urls, years)
df2



Summary of the 2015-2016 Influenza SeasonEspañol|Other LanguagesPrintMinusRelated PagesOn This PageWhat was the 2015-2016 flu season like?When did the 2015-2016 flu season peak?How many people died from flu during the 2015-2016 season?How many children died from the flu during the 2015-2016 season?What flu viruses circulated during the 2015-2016 season?Were infections with novel (non-human) influenza viruses detected during 2015-2016?How much flu vaccine was produced and distributed during the 2015-2016 season?How effective was the 2015-2016 flu vaccine?Was this season's vaccine a good match for circulating viruses?How many antiviral resistant viruses were detected during the 2015-2016 season?Did CDC do anything different in terms of virologic surveillance during the 2015-2016 season?PublicationsNote:SeeFrequently Asked Flu Questions 2017-2018 Influenza Seasonfor flu and flu vaccine information specific to the current flu season.Season Summary ReportsSeasonal Influenza Vaccine Effectiv

Unnamed: 0,Year,Topic,Content


In [53]:
def scraping(url: str, timeout: float = 30):
    '''
    Fetches a page and returns, for every <h3>, the <p> siblings that follow it.

    Each <h3> on the page becomes one record. Its content is the text of the
    <p> elements among its following siblings, up to (not including) the next
    sibling <h3>. Non-<p> siblings are skipped. An <h3> with no such
    paragraphs still yields a record with empty content.

    Parameters:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of {'topic': str, 'content': str} dicts in page order.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response (raised by raise_for_status).
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    content = []
    for h3 in soup.find_all("h3"):
        topic = h3.get_text(strip=True)

        # Walk the siblings after this heading until the next <h3> (or the
        # end of the parent), collecting paragraph text along the way.
        paragraph_texts = []
        sibling = h3.find_next_sibling()
        while sibling and sibling.name != 'h3':
            if sibling.name == 'p':
                paragraph_texts.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling()

        # Join once instead of growing a string with += in the loop; the
        # final strip drops edge spaces left by empty paragraphs.
        content.append({'topic': topic, 'content': ' '.join(paragraph_texts).strip()})

    return content

def extract_and_clean_content(urls):
    '''
    Scrapes each URL and returns one DataFrame, taking the season label from the URL itself.

    The first "YYYY-YYYY" pattern found in a URL is used as that page's
    'Year' value. A URL without such a pattern raises IndexError before the
    page is requested.

    Returns a DataFrame with columns 'Year', 'Topic' and 'Content', one row
    per topic returned by scraping().
    '''
    year_col, topic_col, content_col = [], [], []

    for url in urls:
        # First (start, end) pair of four-digit years in the URL.
        start_year, end_year = re.findall(r'(\d{4})-(\d{4})', url)[0]
        season = f"{start_year}-{end_year}"

        for record in scraping(url):
            year_col.append(season)
            topic_col.append(record['topic'])
            content_col.append(record['content'])

    return pd.DataFrame({'Year': year_col, 'Topic': topic_col, 'Content': content_col})

# Archived 2015-2016 season summary; the season label is parsed from the URL.
urls = [
    "https://archive.cdc.gov/www_cdc_gov/flu/about/season/flu-season-2015-2016.htm",
]
df2 = extract_and_clean_content(urls)
df2


Unnamed: 0,Year,Topic,Content
0,2015-2016,Season Summary Reports,
1,2015-2016,What was the 2015-2016 flu season like?,"Flu seasons can vary in their timing, severity..."
2,2015-2016,When did the 2015-2016 flu season peak?,The timing of flu is unpredictable and can var...
3,2015-2016,How many people died from flu during the 2015-...,CDC does not count how many people die from fl...
4,2015-2016,How many children died from the flu during the...,"As of September 15, 2016, a total of 85 labora..."
5,2015-2016,What flu viruses circulated during the 2015-20...,"Overall, influenza A(H1N1)pdm09 viruses predom..."
6,2015-2016,Were infections with novel (non-human) influen...,"As of September 15, 2016, seven human infectio..."
7,2015-2016,How much flu vaccine was produced and distribu...,Flu vaccine is produced by private manufacture...
8,2015-2016,How effective was the 2015-2016 flu vaccine?,CDC’s end-of-season influenza vaccine effectiv...
9,2015-2016,Was this season’s vaccine a good match for cir...,Yes. Laboratory data show that most of the cir...


In [54]:

# Archived 2016-2017 season page; the season label is parsed from the URL.
urls = [
    "https://archive.cdc.gov/www_cdc_gov/flu/about/season/flu-season-2016-2017.htm",
]
df3 = extract_and_clean_content(urls)
df3

Unnamed: 0,Year,Topic,Content
0,2016-2017,Season Summary Reports,
1,2016-2017,Information for 2016-2017,Getting an annual flu vaccine is the first and...
2,2016-2017,Flu Activity,As of a flu activity update published in theMo...
3,2016-2017,Protective Actions,CDC recommends a yearlyflu vaccinefor everyone...
4,2016-2017,Vaccine and Vaccination,Flu vaccine is produced by private manufacture...
5,2016-2017,If You Get Sick,Antiviral drugs are prescription drugs that ca...
6,2016-2017,Surveillance,The Epidemiology and Prevention Branch in the ...
7,2016-2017,Flu and Parotitis,Acute parotitis is recent swelling of one of t...
8,2016-2017,Publications,


In [55]:
# Archived 2017-2018 season summary; the season label is parsed from the URL.
urls = [
    "https://archive.cdc.gov/www_cdc_gov/flu/about/season/flu-season-2017-2018.htm",
]
df4 = extract_and_clean_content(urls)
df4

Unnamed: 0,Year,Topic,Content
0,2017-2018,Season Summary Reports,
1,2017-2018,What was the 2017-2018 flu season like?,The 2017-2018 influenza season was a high seve...
2,2017-2018,When did the 2017-2018 flu season peak?,"During the 2017-2018 season, influenza-like-il..."
3,2017-2018,How many people died from flu during the 2017-...,While flu deaths in children are reported to C...
4,2017-2018,How many children died from flu during the 201...,"As of April 19, 2019, a total of 186 pediatric..."
5,2017-2018,How many people were hospitalized from flu dur...,"From October 1, 2017 through April 28, 2018, 3..."
6,2017-2018,What flu viruses circulated during the 2017-20...,Influenza A(H3N2) viruses predominated overall...
7,2017-2018,How much flu vaccine was produced and distribu...,Flu vaccine is produced by private manufacture...
8,2017-2018,How effective was the 2017-2018 flu vaccine?,The overall vaccine effectiveness (VE) of the ...
9,2017-2018,Was this season’s flu vaccine a good match for...,Yes. The majority of the influenza viruses col...


In [56]:
# Stack the per-season frames in chronological order (2015-2016 through
# 2023-2024). ignore_index=True renumbers the rows 0..n-1; without it each
# source frame keeps its own labels and the combined index repeats.
combined_df = pd.concat([df2, df3, df4, df, df1], axis=0, ignore_index=True)
combined_df

Unnamed: 0,Year,Topic,Content
0,2015-2016,Season Summary Reports,
1,2015-2016,What was the 2015-2016 flu season like?,"Flu seasons can vary in their timing, severity..."
2,2015-2016,When did the 2015-2016 flu season peak?,The timing of flu is unpredictable and can var...
3,2015-2016,How many people died from flu during the 2015-...,CDC does not count how many people die from fl...
4,2015-2016,How many children died from the flu during the...,"As of September 15, 2016, a total of 85 labora..."
5,2015-2016,What flu viruses circulated during the 2015-20...,"Overall, influenza A(H1N1)pdm09 viruses predom..."
6,2015-2016,Were infections with novel (non-human) influen...,"As of September 15, 2016, seven human infectio..."
7,2015-2016,How much flu vaccine was produced and distribu...,Flu vaccine is produced by private manufacture...
8,2015-2016,How effective was the 2015-2016 flu vaccine?,CDC’s end-of-season influenza vaccine effectiv...
9,2015-2016,Was this season’s vaccine a good match for cir...,Yes. Laboratory data show that most of the cir...


In [67]:
# Persist the combined table; the row index is omitted from the file.
combined_df.to_csv(
    'flu_data_topic.csv',
    index=False,
)

In [66]:
from selenium import webdriver
from bs4 import BeautifulSoup

# Open the archived page in a Chrome browser session.
driver = webdriver.Chrome()
try:
    driver.get("https://archive.cdc.gov/#/details?url=https://www.cdc.gov/flu/pastseasons/1415season.htm")
    # NOTE(review): an implicit wait only affects element lookups
    # (find_element/find_elements); it does not delay page_source. The
    # recorded output shows only the archive banner text, which suggests the
    # page content had not been rendered yet. Consider an explicit
    # WebDriverWait on an element of the archived content. TODO confirm
    # which element to wait for.
    driver.implicitly_wait(10)

    # Grab the HTML of the page as currently rendered by the browser.
    html_content = driver.page_source
finally:
    # Always close the browser, even if loading the page fails, so no
    # Chrome/driver process is left running.
    driver.quit()

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html_content, "html.parser")

# Keep only the <p> tags.
paragraphs = soup.find_all('p')

# Print the text of each <p> tag.
for p in paragraphs:
    print(p.get_text())


This is archived content from the CDC website. The information here may be outdated and links may no longer function. Go to CDC Home for all other recent information.
CDC Archive


In [63]:
pip install selenium

Collecting selenium
  Using cached selenium-4.19.0-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.25.0-py3-none-any.whl.metadata (8.7 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting cffi>=1.14 (from trio~=0.17->selenium)
  Downloading cffi-1.16.0-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
 