In [1]:
import requests
import re
import nltk

In [2]:
def get_content(article_name):
    url = "https://en.wikipedia.org/w/api.php"
    
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "exsectionformat": "wiki",
        "titles": article_name
    }
    
    response = requests.get(url, params=params)
    response_json = response.json()
    
    page_id = next(iter(response_json["query"]["pages"]))
    article_text = response_json["query"]["pages"][page_id]["extract"]

    return article_text

In [3]:
data = get_content("Ozone_layer")
print(data)

<p class="mw-empty-elt">
</p>

<p>The <b>ozone layer</b> or <b>ozone shield</b> is a region of Earth's stratosphere that absorbs most of the Sun's ultraviolet  radiation. It contains a high concentration of ozone (O<sub>3</sub>) in relation to other parts of the atmosphere, although still small in relation to other gases in the stratosphere. The ozone layer contains less than 10 parts per million of ozone, while the average ozone concentration in Earth's atmosphere as a whole is about 0.3 parts per million. The ozone layer is mainly found in the lower portion of the stratosphere, from approximately 15 to 35 kilometers (9 to 22 mi) above Earth, although its thickness varies seasonally and geographically.</p><p>The ozone layer was discovered in 1913 by the French physicists Charles Fabry and Henri Buisson. Measurements of the sun showed that the radiation sent out from its surface and reaching the ground on Earth is usually consistent with the spectrum of a black body with a temperature 

In [4]:
def merge_contents(data):
    # Remove HTML tags and replace special characters
    clean_text = re.sub(r'<[^>]+>', '', data)
    clean_text = re.sub(r'\n+', '\n', clean_text)
    clean_text = clean_text.replace('\xa0', ' ')
    clean_text = clean_text.strip()

    # Remove table of contents and other extraneous information
    clean_text = re.sub(r'\n?\[edit\]\n?', '\n', clean_text)
    clean_text = re.sub(r'^.*Contents\n\n', '', clean_text)
    clean_text = re.sub(r'^.*See also\n\n', '', clean_text)
    clean_text = re.sub(r'^.*References\n\n', '', clean_text)
    clean_text = re.sub(r'^.*Further reading\n\n', '', clean_text)
    clean_text = re.sub(r'^.*External links\n\n', '', clean_text)

    return clean_text

In [5]:
data = get_content("Ozone_layer")
merge_content = merge_contents(data)
print(merge_content)

The ozone layer or ozone shield is a region of Earth's stratosphere that absorbs most of the Sun's ultraviolet  radiation. It contains a high concentration of ozone (O3) in relation to other parts of the atmosphere, although still small in relation to other gases in the stratosphere. The ozone layer contains less than 10 parts per million of ozone, while the average ozone concentration in Earth's atmosphere as a whole is about 0.3 parts per million. The ozone layer is mainly found in the lower portion of the stratosphere, from approximately 15 to 35 kilometers (9 to 22 mi) above Earth, although its thickness varies seasonally and geographically.The ozone layer was discovered in 1913 by the French physicists Charles Fabry and Henri Buisson. Measurements of the sun showed that the radiation sent out from its surface and reaching the ground on Earth is usually consistent with the spectrum of a black body with a temperature in the range of 5,500–6,000 K (5,230–5,730 °C), except that there 

In [9]:
def tokenize(content):
    nltk.download('punkt')
    tokens = nltk.word_tokenize(content)# Tokenize the text using NLTK's word_tokenize() function
    print(tokens) # Print the list of tokens
    
collection = tokenize(merge_content)

['The', 'ozone', 'layer', 'or', 'ozone', 'shield', 'is', 'a', 'region', 'of', 'Earth', "'s", 'stratosphere', 'that', 'absorbs', 'most', 'of', 'the', 'Sun', "'s", 'ultraviolet', 'radiation', '.', 'It', 'contains', 'a', 'high', 'concentration', 'of', 'ozone', '(', 'O3', ')', 'in', 'relation', 'to', 'other', 'parts', 'of', 'the', 'atmosphere', ',', 'although', 'still', 'small', 'in', 'relation', 'to', 'other', 'gases', 'in', 'the', 'stratosphere', '.', 'The', 'ozone', 'layer', 'contains', 'less', 'than', '10', 'parts', 'per', 'million', 'of', 'ozone', ',', 'while', 'the', 'average', 'ozone', 'concentration', 'in', 'Earth', "'s", 'atmosphere', 'as', 'a', 'whole', 'is', 'about', '0.3', 'parts', 'per', 'million', '.', 'The', 'ozone', 'layer', 'is', 'mainly', 'found', 'in', 'the', 'lower', 'portion', 'of', 'the', 'stratosphere', ',', 'from', 'approximately', '15', 'to', '35', 'kilometers', '(', '9', 'to', '22', 'mi', ')', 'above', 'Earth', ',', 'although', 'its', 'thickness', 'varies', 'seaso

[nltk_data] Downloading package punkt to /home/docode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def lower_collection(collection):
    if collection is not None:
        c = [i.lower() for i in collection]
        return c

print(lower_collection(collection))

None
