# Data collection



## Import Python packages

In [20]:
pip show spacy

Name: spacy
Version: 3.7.2
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /opt/homebrew/Caskroom/miniforge/base/envs/jiyun-py39/lib/python3.9/site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, smart-open, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: en-core-web-sm, fr-core-news-sm
Note: you may need to restart the kernel to use updated packages.


In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import etree 
import os
import urllib.parse
import wikipediaapi
import concurrent.futures
import pandas as pd

## Keyword search for getting Wikipedia URLs 

In [None]:
def decode_url_encoded_string(urls):
    """Convert Wikipedia article hrefs into human-readable page titles.

    For every href the leading ``/wiki/`` path prefix is removed, underscores
    are turned into spaces and percent-escapes are decoded.

    Args:
        urls: Iterable of href strings such as ``'/wiki/Some_title'``.

    Returns:
        list[str]: One decoded title per input href, in the same order.
    """
    prefix = '/wiki/'
    titles = []
    for url in urls:
        # Strip the prefix only at the start: a blanket replace() would also
        # delete a '/wiki/' that legitimately appears inside a title.
        path = url[len(prefix):] if url.startswith(prefix) else url
        # Underscores are replaced before unquoting so that a percent-encoded
        # underscore (%5F) is not mistaken for a word separator.
        titles.append(urllib.parse.unquote(path.replace('_', ' ')))
    return titles

In [None]:
# Search configuration.
batch_size = 500            # value sent as the `limit` query parameter (results per page)
max_limit = 10000           # paging stops before this offset is reached
keyword_search = "protest"  # search term

# Decoded page titles collected from every result page.
urls = []

# Only the User-Agent is a meaningful *request* header. Access-Control-* headers
# are CORS response headers and have no effect when sent by a client, so they
# are not included.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

search_endpoint = "https://en.wikipedia.org/w/index.php"

for offset in range(0, max_limit, batch_size):
    # Let requests build (and percent-encode) the query string, so search terms
    # containing spaces or special characters are handled correctly.
    query = {
        "limit": batch_size,
        "offset": offset,
        "profile": "default",
        "search": keyword_search,
    }
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, so a positional dict ends up in the query
    # string instead of being sent as HTTP headers.
    response = requests.get(search_endpoint, params=query, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()

    dom = etree.HTML(str(BeautifulSoup(response.content, 'html.parser')))
    hrefs = dom.xpath('//div[@class="mw-search-result-heading"]/a/@href')
    if not hrefs:
        # An empty result page means the search is exhausted; stop paging
        # rather than issuing further requests that cannot add anything.
        break
    urls.extend(decode_url_encoded_string(hrefs))

In [None]:
# Number of page titles collected by the search loop.
len(urls)

In [None]:
# Peek at the first ten collected titles.
urls[:10] 

* Saving the page titles (decoded from the search-result URLs) to a text file

In [None]:
# Create the output folder if it is missing so the notebook runs from a fresh checkout.
os.makedirs('data', exist_ok=True)

# Write one title per line. The encoding is fixed to UTF-8 because titles can
# contain non-ASCII characters and the platform default encoding is not
# guaranteed to be able to represent them.
with open(os.path.join('data','wikipedia_titles.txt'), 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % url for url in urls)

## Getting Wikipedia pages content

* Loading the page titles

In [None]:
# Load the list of Wikipedia titles (one per line). The file is read as UTF-8
# so non-ASCII titles decode the same way on every platform, and blank lines
# are skipped so they do not turn into empty page titles.
with open(os.path.join('data','wikipedia_titles.txt'), 'r', encoding='utf-8') as f:
    urls = [line.strip() for line in f if line.strip()]

In [None]:
# Peek at the first ten titles read back from the text file.
urls[:10] 

In [None]:
def get_wiki_content(index, title, wiki_api):
    """Fetch the text of one Wikipedia page, tagged with its position.

    Args:
        index: Position of ``title`` in the caller's list; returned unchanged
            so results can be matched back to their titles.
        title: Page title to look up.
        wiki_api: Object exposing ``page(title)``; the returned page must
            provide ``exists()`` and a ``text`` attribute.

    Returns:
        tuple: ``(index, text)`` when the page exists,
        ``(index, "Page not found.")`` when it does not, and
        ``(index, None)`` if any exception is raised during the lookup
        (the error is printed, not re-raised).
    """
    try:
        wiki_page = wiki_api.page(title)
        # Guard clause: report missing pages with a fixed placeholder string.
        if not wiki_page.exists():
            return index, "Page not found."
        return index, wiki_page.text
    except Exception as err:
        print(f"Error retrieving page {title}: {err}")
        return index, None

In [None]:
wiki_api = wikipediaapi.Wikipedia('ProjectName (userAgent)', 'en')

# Pre-size the result list: tasks finish in arbitrary order, so each result is
# stored at the index of its title rather than appended.
all_contents = [None] * len(urls)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit one lookup per title; every task carries its own position.
    pending = [
        executor.submit(get_wiki_content, position, page_title, wiki_api)
        for position, page_title in enumerate(urls)
    ]
    for finished in concurrent.futures.as_completed(pending):
        position, text = finished.result()
        # A None or empty result is replaced by a fixed placeholder string.
        if not text:
            text = "No content or error"
        all_contents[position] = text

In [None]:
# Preview the first 200 characters of the first ten pages only; displaying the
# complete article texts would flood the notebook output.
[(text or "")[:200] for text in all_contents[:10]]

* Saving the dataset (page titles and their content) as a CSV file

In [None]:
# Pair every title with its fetched text and write the table to disk without
# the DataFrame index column.
wiki_table = pd.DataFrame({'title': urls, 'content': all_contents})
csv_path = os.path.join('data', 'wikipedia_content.csv')
wiki_table.to_csv(csv_path, index=False)