In [4]:
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [5]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
# Function to fetch Wikipedia articles based on a topic

def fetch_wikipedia_articles(topic, num_articles):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": topic,
        "srlimit": num_articles
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    articles = []
    for item in data['query']['search']:
        article_title = item['title']
        article_url = "https://en.wikipedia.org/wiki/" + article_title.replace(" ", "_")
        articles.append(article_url)
    return articles

In [11]:
# Example usage of the functions to fetch and preprocess articles

topic = "Geography" 
num_articles = 5  
articles = fetch_wikipedia_articles(topic, num_articles)
print(articles)

['https://en.wikipedia.org/wiki/Geography', 'https://en.wikipedia.org/wiki/Geographic_coordinate_system', 'https://en.wikipedia.org/wiki/Geography_of_India', 'https://en.wikipedia.org/wiki/Meridian_(geography)', 'https://en.wikipedia.org/wiki/National_Geographic']


In [8]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    # tokenize
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    # stem
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

preprocessed_articles = [preprocess_text(article) for article in articles]

In [9]:
for article in preprocessed_articles:
    print(article)


['http']
['http']
['http']
['http', 'geographi']
['http']
