In [24]:
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [25]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# function to fetch Wikipedia articles based on a topic

def fetch_wikipedia_articles(topic, num_articles):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": topic,
        "srlimit": num_articles
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    articles_content = []
    for item in data['query']['search']:
        article_title = item['title']
        article_url = "https://en.wikipedia.org/wiki/" + article_title.replace(" ", "_")
        article_page = requests.get(article_url)
        soup = BeautifulSoup(article_page.content, 'html.parser')
        text = soup.find('div', class_='mw-parser-output').get_text()
        articles_content.append(text)
    return articles_content

In [30]:
# example usage of the functions to fetch and preprocess articles

topic = "Geography"
num_articles = 5  
articles = fetch_wikipedia_articles(topic, num_articles)


In [39]:

def preprocess_text(text):
    tokens = word_tokenize(text)
    # tokenize
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    porter = PorterStemmer()
     # stem
    tokens = [porter.stem(word) for word in tokens]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens) 

preprocessed_articles = [preprocess_text(article) for article in articles]

In [41]:
labels = ['geographic', 'non-geographic', 'geographic', 'non-geographic', 'geographic']  

# convert text data into a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_articles)
y = labels

# aplit the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# predict the labels on the test set
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))


                precision    recall  f1-score   support

    geographic       0.50      1.00      0.67         1
non-geographic       0.00      0.00      0.00         1

      accuracy                           0.50         2
     macro avg       0.25      0.50      0.33         2
  weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
