In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str):
    '''
    Fetches content from a given URL and returns the relevant div and h3 content.
    '''
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content_divs = soup.find_all("div", attrs={"class": 'cdc-textblock'})
    content = []
    for div in content_divs:
        h3_tag = div.find('h3')
        if h3_tag:
            topic = h3_tag.text.strip()
            paragraphs = div.find_all('p')
            text = ' '.join([p.text.strip() for p in paragraphs])
            content.append({'topic': topic, 'content': text})
    return content


def extract_and_clean_content(urls, years):
    '''
    Iterates over a list of URLs, scrapes and cleans content from each, inputs the year manually,
    and returns a DataFrame with the results.
    '''
    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        contents = scraping(url)
        for content in contents:
            data['Year'].append(year)
            data['Topic'].append(content['topic'])
            data['Content'].append(content['content'])
    
    return pd.DataFrame(data)

# Example usage
urls = [ 
    "https://www.cdc.gov/flu/about/season/flu-season-2018-2019.htm",
    "https://www.cdc.gov/flu/season/faq-flu-season-2019-2020.htm",
    "https://www.cdc.gov/flu/season/faq-flu-season-2020-2021.htm",
    "https://www.cdc.gov/flu/season/faq-flu-season-2021-2022.htm", 
    "https://www.cdc.gov/flu/season/faq-flu-season-2022-2023.htm",
    
]
years = ['2018-2019','2019-2020', '2020-2021', '2021-2022', '2022-2023']
df = extract_and_clean_content(urls, years)
df



Unnamed: 0,Year,Topic,Content
0,2018-2019,What’s new this flu season?,A few things are new this season: For the 2018...
1,2018-2019,How does CDC estimate the burden of seasonal f...,The burden of influenza on the United States c...
2,2019-2020,What’s new this flu season?,A few things are new this season: For more inf...
3,2019-2020,When should I get vaccinated?,"Because the timing of the onset, peak and end ..."
4,2019-2020,Implications of Cell-Based Vaccines,For people who need only one dose for the seas...
5,2019-2020,Flu Activity,What sort of flu season is expected this year?...
6,2019-2020,Vaccine and Vaccination,How much flu vaccine will be available this se...
7,2019-2020,Flu Vaccine Effectiveness,How effective will flu vaccines be this season...
8,2019-2020,If You Get Sick,What happens in the body when someone has flu?...
9,2019-2020,Surveillance,How does CDC track flu activity? The Epidemiol...


In [19]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str):
    '''
    Fetches content from a given URL and returns the relevant div and h3 content.
    '''
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content_divs = soup.find_all("div", attrs={"class": 'cdc-textblock'})
    content = []
    for div in content_divs:
        h3_tag = div.find('h2')
        if h3_tag:
            topic = h3_tag.text.strip()
            paragraphs = div.find_all('p')
            text = ' '.join([p.text.strip() for p in paragraphs])
            content.append({'topic': topic, 'content': text})
    return content


def extract_and_clean_content(urls, years):
    '''
    Iterates over a list of URLs, scrapes and cleans content from each, inputs the year manually,
    and returns a DataFrame with the results.
    '''
    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        contents = scraping(url)
        for content in contents:
            data['Year'].append(year)
            data['Topic'].append(content['topic'])
            data['Content'].append(content['content'])
    
    return pd.DataFrame(data)

# Example usage
urls = [ 
    "https://www.cdc.gov/flu/season/faq-flu-season-2023-2024.htm"]
years = ['2023-2024']
df1 = extract_and_clean_content(urls, years)
df1



Unnamed: 0,Year,Topic,Content
0,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...
1,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ..."
2,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...
3,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...
4,2023-2024,"Flu, RSV, and COVID-19 Coinfection Data: 2023-...",One way CDC collects data on coinfections with...


In [16]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def scraping(url: str):
    '''
    Fetches content from a given URL and returns the relevant h3 (containing <a> tags) and div content.
    '''
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Using CSS selectors to target <a> tags within <h3> elements directly
    h3_links = soup.select("h3 a")
    
    content = []
    for link in h3_links:
        # Check if the <a> tag has a parent <h3> tag
        h3_tag = link.find_parent('h3')
        if h3_tag:
            topic = h3_tag.get_text(strip=True)  # Extract text from <h3>, which includes <a> tag text
            print(topic)
            # Find the next sibling <div> that contains the relevant content
            next_div = h3_tag.find_next_sibling('div', attrs={"class": "container d-flex flex-wrap body-wrapper bg-white"})
            
            if next_div:
                paragraphs = next_div.find_all('p')
                text = ' '.join(p.get_text(strip=True) for p in paragraphs)
                content.append({'topic': topic, 'content': text})
                
    return content


def extract_and_clean_content(urls, years):
    '''
    Iterates over a list of URLs, scrapes and cleans content from each, inputs the year manually,
    and returns a DataFrame with the results.
    '''
    data = {'Year': [], 'Topic': [], 'Content': []}
    for url, year in zip(urls, years):
        contents = scraping(url)
        for content in contents:
            data['Year'].append(year)
            data['Topic'].append(content['topic'])
            data['Content'].append(content['content'])
    
    return pd.DataFrame(data)

# Example usage
urls = [ 
    "https://archive.cdc.gov/www_cdc_gov/flu/about/season/flu-season-2015-2016.htm"]
years = ['2015-2016']
df2 = extract_and_clean_content(urls, years)
df2



What was the 2015-2016 flu season like?
When did the 2015-2016 flu season peak?
How many people died from flu during the 2015-2016 season?
How many children died from the flu during the 2015-2016 season?
What flu viruses circulated during the 2015-2016 season?
Were infections with novel (non-human) influenza viruses detected during 2015-2016?
How much flu vaccine was produced and distributed during the 2015-2016 season?
How effective was the 2015-2016 flu vaccine?
Was this season’s vaccine a good match for circulating viruses?
How many antiviral resistant viruses were detected during the 2015-2016 season?
Did CDC do anything different in terms of virologic surveillance during the 2015-2016 season?
Publications


Unnamed: 0,Year,Topic,Content


In [4]:
text = df2["Content"]

In [5]:
text

0    Note: See Frequently Asked Flu Questions 2017-...
Name: Content, dtype: object

In [20]:
combined_df = pd.concat([df, df1], axis=0)
combined_df

Unnamed: 0,Year,Topic,Content
0,2018-2019,Why is it significant that cell-grown vaccine ...,A few things are new this season: For the 2018...
1,2018-2019,How does CDC classify flu season severity?,The burden of influenza on the United States c...
2,2019-2020,What’s new this flu season?,A few things are new this season: For more inf...
3,2019-2020,When should I get vaccinated?,"Because the timing of the onset, peak and end ..."
4,2019-2020,Implications of Cell-Based Vaccines,For people who need only one dose for the seas...
5,2019-2020,Flu Activity,What sort of flu season is expected this year?...
6,2019-2020,Vaccine and Vaccination,How much flu vaccine will be available this se...
7,2019-2020,Flu Vaccine Effectiveness,How effective will flu vaccines be this season...
8,2019-2020,If You Get Sick,What happens in the body when someone has flu?...
9,2019-2020,Surveillance,How does CDC track flu activity? The Epidemiol...


In [21]:
df.to_csv('flu_data_topic.csv', index=False)