In [11]:
import requests
import pandas as pd
import concurrent.futures

# Function to fetch base page information
def fetch_wikipedia_page_info(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "info",
        "format": "json",
        "titles": title,
        "inprop": "url|id"
    }
    response = requests.get(url, params=params)
    data = response.json()
    pages = data['query']['pages']
    page_info = {}
    for page_id, page in pages.items():
        page_info['id'] = page['pageid']
        page_info['title'] = page['title']
        page_info['url'] = f"https://en.wikipedia.org/?curid={page['pageid']}"
    return page_info

# Function to fetch links from a page
def fetch_wikipedia_links(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "links",
        "format": "json",
        "titles": title,
        "pllimit": "max"
    }
    response = requests.get(url, params=params)
    data = response.json()
    pages = data['query']['pages']
    links = []
    for page_id, page in pages.items():
        link_list = page.get('links', [])
        for link in link_list:
            links.append({'title': link['title']})
    return links

# Function to fetch page ID for a given title
def fetch_page_id(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "info",
        "format": "json",
        "titles": title,
        "inprop": "id"
    }
    response = requests.get(url, params=params)
    data = response.json()
    pages = data['query']['pages']
    page_info = {}
    for page_id, page in pages.items():
        page_info['title'] = page['title']
        page_info['pageid'] = page.get('pageid', None)
    return page_info

# Function to fetch page IDs for all links in parallel
def fetch_all_link_ids(links):
    link_ids = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_link = {executor.submit(fetch_page_id, link['title']): link for link in links}
        for future in concurrent.futures.as_completed(future_to_link):
            link = future_to_link[future]
            try:
                page_info = future.result()
                if page_info['pageid']:
                    link_ids.append({'title': page_info['title'], 'pageid': page_info['pageid']})
            except Exception as exc:
                print(f'Error fetching page ID for {link["title"]}: {exc}')
    return link_ids

# Function to save data to CSV using pandas
def save_to_csv(base_page_info, links_with_ids):
    base_title = base_page_info['id']
    filename = str(base_page_info['id'])+".csv"

    data = []
    for link in links_with_ids:
        data.append([
            base_page_info['id'],
            base_page_info['title'],
            base_page_info['url'],
            link['title'],
            link['pageid']
        ])

    df = pd.DataFrame(data, columns=['Base Page ID', 'Base Page Title', 'Base Page URL', 'Link Title', 'Link Page ID'])
    df.to_csv(filename, index=False, encoding='utf-8')

    print(f"Data saved to {filename}")

# Function to read data from CSV using pandas
def read_from_csv(filename):
    df = pd.read_csv(filename)
    return df

# Main execution
if __name__ == "__main__":
    base_page_title = "Tunisia"
    base_page_info = fetch_wikipedia_page_info(base_page_title)
    links = fetch_wikipedia_links(base_page_title)
    links_with_ids = fetch_all_link_ids(links)
    save_to_csv(base_page_info, links_with_ids)

    # Optionally read and print the CSV data
    df = read_from_csv(str(base_page_info['id'])+".csv")
    print(df)


Data saved to 30188.csv
     Base Page ID Base Page Title                          Base Page URL  \
0           30188         Tunisia  https://en.wikipedia.org/?curid=30188   
1           30188         Tunisia  https://en.wikipedia.org/?curid=30188   
2           30188         Tunisia  https://en.wikipedia.org/?curid=30188   
3           30188         Tunisia  https://en.wikipedia.org/?curid=30188   
4           30188         Tunisia  https://en.wikipedia.org/?curid=30188   
..            ...             ...                                    ...   
489         30188         Tunisia  https://en.wikipedia.org/?curid=30188   
490         30188         Tunisia  https://en.wikipedia.org/?curid=30188   
491         30188         Tunisia  https://en.wikipedia.org/?curid=30188   
492         30188         Tunisia  https://en.wikipedia.org/?curid=30188   
493         30188         Tunisia  https://en.wikipedia.org/?curid=30188   

                                 Link Title  Link Page ID  
0  