<a href="https://colab.research.google.com/github/nad3261/AI-Application/blob/main/Fetching_data_from_Wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install requests



In [2]:
# %pip targets the running kernel's environment; the version is pinned to the
# release shown in this cell's recorded output so re-runs install the same code.
%pip install wikipedia-api==0.7.1

Collecting wikipedia-api
  Downloading wikipedia_api-0.7.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... done
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.7.1-py3-none-any.whl size=14347 sha256=6223b266a37a66603856f371fc6323ddafeb4f44dca8740b0e683796097b65b1
  Stored in directory: /root/.cache/pip/wheels/4c/96/18/b9201cc3e8b47b02b510460210cfd832ccf10c0c4dd0522962
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.7.1


In [3]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import wikipediaapi
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Fetch the NLTK data this notebook uses: the stop-word lists, the Punkt
# tokenizer models and WordNet.
# 'punkt_tab' is the Punkt data format that recent NLTK releases load for
# word_tokenize; 'punkt' is still fetched for older releases.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# English-language Wikipedia client, constructed with an explicit user agent.
user_agent = "Wikipedia searchign bot"
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

def fetch_articles_from_category(category_name, max_depth=1, current_depth=0, max_articles=20):
    """Collect up to ``max_articles`` articles from a Wikipedia category.

    Members in the main namespace are fetched directly. Members that are
    themselves categories are crawled recursively as long as
    ``current_depth < max_depth``; each recursive call receives only the
    part of the article budget that is still unused.

    Args:
        category_name: Category title without the ``"Category:"`` prefix.
        max_depth: Deepest sub-category level that may be entered.
        current_depth: Depth of this call (0 for the initial call).
        max_articles: Maximum number of articles to return from this call.

    Returns:
        A list of ``(title, text, category_name)`` tuples, where
        ``category_name`` is the category the article was listed in directly
        (a sub-category name for articles reached through recursion). The
        list is empty when the category page does not exist.
    """
    collected = []
    category_page = wiki_wiki.page("Category:" + category_name)

    if not category_page.exists():
        print(f"Category '{category_name}' does not exist.")
        return collected

    may_descend = current_depth < max_depth
    for member in category_page.categorymembers.values():
        remaining = max_articles - len(collected)
        if remaining <= 0:
            break

        namespace = member.ns
        if namespace == wikipediaapi.Namespace.MAIN:
            print(f"Fetching article: {member.title} from {category_name}")
            collected.append((member.title, member.text, category_name))
            continue

        if may_descend and namespace == wikipediaapi.Namespace.CATEGORY:
            # Drop the leading "Category:" (9 characters) before recursing.
            subcategory_name = member.title[9:]
            collected += fetch_articles_from_category(
                subcategory_name, max_depth, current_depth + 1, remaining
            )

    return collected


# Per-category article budget and the top-level categories to crawl.
max_articles_per_category = 100
categories = ["Mathematics", "Biology", "Geography", "History", "Science"]

# Crawl every category (entering one level of sub-categories) and pool the rows.
all_articles = []
for category in categories:
    articles = fetch_articles_from_category(
        category,
        max_depth=1,
        max_articles=max_articles_per_category,
    )
    all_articles += articles

# One row per article: title, full text, and the category it was listed in.
df = pd.DataFrame(data=all_articles, columns=["Title", "Content", "Category"])
df.to_csv("category_wikipedia_articles.csv", index=False)

article_count = len(all_articles)
print(f"{article_count} articles downloaded and saved to 'category_wikipedia_articles.csv'.")

In [7]:
class CategoryMapper:
    """Normalise category labels and clean article text held in a DataFrame.

    The DataFrame-level helpers (``map_categories``, ``process_content``,
    ``fix_metadata``, ``lemmatize_content``, ``tokenize_content_spacy``)
    overwrite the ``Category`` / ``Content`` column of the frame they are
    given in place and return that same frame.
    """

    def __init__(self):
        """Load the spaCy model and the NLTK resources used by the helpers."""
        self.nlp = spacy.load('en_core_web_sm')

        # Sub-category label -> top-level category label.
        self.category_mapping = {
            'Fields of mathematics': 'Mathematics',
            'Mathematics-related lists': 'Mathematics',
            'Branches of biology': 'Biology',
            'Organisms': 'Biology',
            'Biologists': 'Biology',
            'Biology-related lists': 'Biology',
            'Geographers': 'Geography',
            'Geography-related lists': 'Geography',
            'History by ethnic group': 'History',
            'History by period': 'History',
            'Fields of history': 'History',
            'Historiography': 'History',
            'Branches of science': 'Science',
            'Scientific disciplines': 'Science',
            'Scientists': 'Science',
            'Science in society': 'Science'
        }
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # NLTK tokenizer that splits on runs of word characters.
        self.tokenizer = RegexpTokenizer(r'\w+')

    def map_categories(self, df):
        """Replace mapped sub-category labels in ``df['Category']``.

        Labels that have no entry in ``category_mapping`` are left unchanged.
        The column is overwritten in place and ``df`` is returned.
        """
        df['Category'] = df['Category'].replace(self.category_mapping)
        return df

    def get_unique_categories(self, df):
        """Return the distinct values of ``df['Category']``."""
        return df['Category'].unique()

    def remove_stop_words(self, text):
        """Lower-case ``text`` and drop whitespace-separated stop words.

        The remaining words are re-joined with single spaces.
        """
        words = text.lower().split()
        kept_words = [word for word in words if word not in self.stop_words]
        return ' '.join(kept_words)

    def process_content(self, df):
        """Apply ``remove_stop_words`` to ``df['Content']`` in place; return ``df``."""
        df['Content'] = df['Content'].apply(self.remove_stop_words)
        return df

    def metadata_fix(self, text):
        """Strip markup, URLs and punctuation from ``text``.

        Steps, in order: extract the text content with BeautifulSoup's
        ``html.parser``; delete URL-like tokens (starting with ``http`` or
        ``www``); delete every character that is not an ASCII letter, a digit
        or whitespace; collapse whitespace runs to one space and trim the ends.
        """
        soup = BeautifulSoup(text, "html.parser")
        cleaned_text = soup.get_text()
        cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', cleaned_text)
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)
        # Normalise whitespace last: the deletions above can leave doubled,
        # leading or trailing spaces where a URL or punctuation mark stood.
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text

    def fix_metadata(self, df):
        """Apply ``metadata_fix`` to ``df['Content']`` in place; return ``df``."""
        df['Content'] = df['Content'].apply(self.metadata_fix)
        return df

    def lemmatize_words(self, text):
        """Tokenize ``text`` with NLTK and join the WordNet lemma of each token with spaces."""
        words = word_tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def lemmatize_content(self, df):
        """Apply ``lemmatize_words`` to ``df['Content']`` in place; return ``df``."""
        df['Content'] = df['Content'].apply(self.lemmatize_words)
        return df

    def tokenize_content_spacy(self, df):
        """Replace each ``Content`` string with the list of its spaCy token texts.

        Only the tokenizer is run (``nlp.make_doc``): the token texts do not
        depend on the later pipeline components, so running them would only
        add cost. The column holds lists rather than strings afterwards.
        """
        df['Content'] = df['Content'].apply(
            lambda text: [token.text for token in self.nlp.make_doc(text)]
        )
        return df


