In [6]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [30]:
def scraping(url: str, timeout: float = 30):
    '''
    Downloads a page and returns its "cdc-textblock" div elements.

    Parameters:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up (default 30).
            Without a timeout, requests may wait forever on an unresponsive
            host and stall the whole notebook.

    Returns:
        The result of BeautifulSoup.find_all: every <div> whose class is
        "cdc-textblock" (an empty result when the page has none).

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by raise_for_status).
        requests.Timeout: if the server does not respond within `timeout`
            seconds.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find_all("div", attrs={"class": 'cdc-textblock'})
    return content

def clean_text(content):
    '''
    Merges the text of several HTML elements into one whitespace-normalized string.

    Parameters:
        content: Iterable of objects exposing a `.text` attribute
            (e.g. the elements returned by `scraping`).

    Returns:
        A single string in which every run of whitespace (spaces, tabs,
        newlines) is collapsed to one space and there is no leading or
        trailing whitespace. An empty iterable yields "".
    '''
    # Join with a space so the last word of one element never fuses with the
    # first word of the next; any doubled spaces are collapsed below.
    text = " ".join(element.text for element in content)
    # Compact whitespace runs into a single space, then trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

def extract_and_clean_content(urls, years):
    '''
    Scrapes and cleans each URL and pairs the result with its manually supplied year.

    Parameters:
        urls: Iterable of page addresses to scrape.
        years: Iterable of labels, one per URL, in the same order as `urls`.

    Returns:
        A pandas DataFrame with the columns 'Year' and 'Content', one row per
        URL. Empty inputs give an empty DataFrame with those two columns.

    Raises:
        ValueError: if `urls` and `years` do not contain the same number of items.
    '''
    urls = list(urls)
    years = list(years)
    # zip() stops at the shorter input, which would silently drop pages or
    # labels when the two hand-maintained lists drift apart; reject that early,
    # before any network request is made.
    if len(urls) != len(years):
        raise ValueError(
            "urls and years must have the same length (got {} urls and {} years)".format(
                len(urls), len(years)
            )
        )

    data = {'Year': [], 'Content': []}
    for url, year in zip(urls, years):
        content = scraping(url)
        cleaned_content = clean_text(content)
        data['Year'].append(year)
        data['Content'].append(cleaned_content)

    return pd.DataFrame(data)

In [31]:
# Pages to scrape: one CDC flu-season page per season, oldest season first.
# Each active entry is paired by position with an entry in `years`.
# NOTE(review): the archive.cdc.gov entries are disabled. Presumably this is because
# everything after '#' is a URL fragment, which is never sent to the server, so a
# plain HTTP request would only receive the archive landing page -- confirm before
# re-enabling them (and add the matching labels to `years` if you do).
urls = [
    # "https://archive.cdc.gov/#/details?url=https://www.cdc.gov/flu/pastseasons/1415season.htm",
    # "https://archive.cdc.gov/#/details?url=https://www.cdc.gov/flu/about/season/flu-season-2015-2016.htm",
    # "https://archive.cdc.gov/#/details?url=https://www.cdc.gov/flu/about/season/flu-season-2016-2017.htm", 
    # "https://archive.cdc.gov/#/details?url=https://www.cdc.gov/flu/about/season/flu-season-2017-2018.htm",
    "https://www.cdc.gov/flu/about/season/flu-season-2018-2019.htm", 
    "https://www.cdc.gov/flu/season/faq-flu-season-2019-2020.htm",
    "https://www.cdc.gov/flu/season/faq-flu-season-2020-2021.htm",
    "https://www.cdc.gov/flu/season/faq-flu-season-2021-2022.htm", 
    "https://www.cdc.gov/flu/season/faq-flu-season-2022-2023.htm"
]

In [32]:
# Season labels, entered by hand. Entry i labels urls[i]: the two lists are
# zipped together by extract_and_clean_content, so they must stay in the same
# order and have the same number of entries.
years = [
    "2018-2019",
    "2019-2020",
    "2020-2021",
    "2021-2022",
    "2022-2023"
]

In [33]:
# Scrape every page in `urls` and collect one labelled row per season.
df = extract_and_clean_content(urls=urls, years=years)

In [34]:
# A bare last expression lets Jupyter render the DataFrame as a table;
# print() would only emit the plain-text repr.
df

        Year                                            Content
0  2018-2019  What’s new this flu season? A few things are n...
1  2019-2020  What’s new this flu season? A few things are n...
2  2020-2021  2020-21 Flu Season Summary FAQ Summary What wa...
3  2021-2022  Summary What was the 2021-2022 flu season like...
4  2022-2023  What’s New for 2022-2023 A few things are diff...


## Save to CSV

In [15]:
# Write the scraped table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='flu_data.csv', index=False)