In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
def scraping(url: str):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find_all("div", attrs={"class": 'cdc-textblock'})
    return content

def clean_text(content):
    '''
    Only clean the text from html tags and extra spaces
    '''
    text = ""
    for i in content:
        text += i.text
    text = re.sub(r'\s+', ' ', text)
    return text

In [3]:
content = scraping("https://www.cdc.gov/flu/season/faq-flu-season-2020-2021.htm")

In [20]:
print(content)

[<div class="cdc-textblock"><h2>2020-21 Flu Season Summary FAQ</h2>
</div>, <div class="cdc-textblock"><h3><a class="onThisPageAnchor" id="anchor_1627000307956" title="Summary"></a>Summary</h3>
<h4>What was the 2020-2021 flu season like?</h4>
<p>Flu activity was unusually low throughout the 2020-2021 flu season both in the United States and globally, despite high levels of testing. During September 28, 2020–May 22, 2021 in the United States, 1,675 (0.2%) of 818,939 respiratory specimens tested by U.S. clinical laboratories were positive for an influenza virus. The low level of flu activity during this past season contributed to dramatically fewer flu illnesses, hospitalizations, and deaths compared with previous flu seasons. For comparison, during the last three seasons before the pandemic, the proportion of respiratory specimens testing positive for influenza peaked between 26.2% and 30.3%. In terms of hospitalizations, the cumulative rate of laboratory-confirmed influenza-associated 

In [19]:
content_stop = clean_text(content)
print(content_stop)

2020-21 Flu Season Summary FAQ Summary What was the 2020-2021 flu season like? Flu activity was unusually low throughout the 2020-2021 flu season both in the United States and globally, despite high levels of testing. During September 28, 2020–May 22, 2021 in the United States, 1,675 (0.2%) of 818,939 respiratory specimens tested by U.S. clinical laboratories were positive for an influenza virus. The low level of flu activity during this past season contributed to dramatically fewer flu illnesses, hospitalizations, and deaths compared with previous flu seasons. For comparison, during the last three seasons before the pandemic, the proportion of respiratory specimens testing positive for influenza peaked between 26.2% and 30.3%. In terms of hospitalizations, the cumulative rate of laboratory-confirmed influenza-associated hospitalizations in the 2020-2021 season was the lowest recorded since this type of data collection began in 2005. For pediatric deaths, CDC received one report of a p

## Save to csv

In [17]:
import pandas as pd

data = {
    'Year': [content_stop[:6]],
    'Content': [content_stop],
}

df = pd.DataFrame(data)

print(df)

df.to_csv('flu_data.csv', index=False)


     Year                                            Content
0  2020-2  2020-21 Flu Season Summary FAQ Summary What wa...


## Text Cleaning

### Stopwords removal

In [73]:
!pip install nltk



In [74]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tongfah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tongfah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [79]:
def clean_punctuation(text):
    punctuation_pattern = re.compile(r'[^\w\s]|_')
    cleaned_text = re.sub(punctuation_pattern, '', text)

    return cleaned_text

cleaned_text = clean_punctuation(content_stop)

In [81]:
print(content_stop)
print(cleaned_text)

2020-21 Flu Season Summary FAQ Summary What was the 2020-2021 flu season like? Flu activity was unusually low throughout the 2020-2021 flu season both in the United States and globally, despite high levels of testing. During September 28, 2020–May 22, 2021 in the United States, 1,675 (0.2%) of 818,939 respiratory specimens tested by U.S. clinical laboratories were positive for an influenza virus. The low level of flu activity during this past season contributed to dramatically fewer flu illnesses, hospitalizations, and deaths compared with previous flu seasons. For comparison, during the last three seasons before the pandemic, the proportion of respiratory specimens testing positive for influenza peaked between 26.2% and 30.3%. In terms of hospitalizations, the cumulative rate of laboratory-confirmed influenza-associated hospitalizations in the 2020-2021 season was the lowest recorded since this type of data collection began in 2005. For pediatric deaths, CDC received one report of a p

In [75]:
def stopwords_removal(text: str):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

text = stopwords_removal(content_stop)

In [76]:
print(content_stop)
print(text)

2020-21 Flu Season Summary FAQ Summary What was the 2020-2021 flu season like? Flu activity was unusually low throughout the 2020-2021 flu season both in the United States and globally, despite high levels of testing. During September 28, 2020–May 22, 2021 in the United States, 1,675 (0.2%) of 818,939 respiratory specimens tested by U.S. clinical laboratories were positive for an influenza virus. The low level of flu activity during this past season contributed to dramatically fewer flu illnesses, hospitalizations, and deaths compared with previous flu seasons. For comparison, during the last three seasons before the pandemic, the proportion of respiratory specimens testing positive for influenza peaked between 26.2% and 30.3%. In terms of hospitalizations, the cumulative rate of laboratory-confirmed influenza-associated hospitalizations in the 2020-2021 season was the lowest recorded since this type of data collection began in 2005. For pediatric deaths, CDC received one report of a p