In [1]:
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import filter_complex_metadata
from tqdm import tqdm  # Import tqdm for the progress bar

In [2]:
def fetch_docs(title, max_retries=2, retry_delay=1.0):
    """Fetch the Wikipedia page for ``title`` as a list of LangChain Documents.

    The function is best-effort: it never raises, so it is safe to run from a
    thread pool. Every failure is reported on stdout and yields ``[]``.

    Args:
        title: Wikipedia page title to look up. Anything that is not a
            non-blank string (e.g. a NaN coming from a missing CSV value) is
            skipped without contacting Wikipedia.
        max_retries: How many additional attempts to make after a
            network-level failure (``OSError`` and subclasses, which covers
            the ``requests`` connection errors). Other errors are not retried.
        retry_delay: Base pause in seconds between attempts; the pause grows
            linearly with the attempt number.

    Returns:
        The documents returned by the loader after metadata filtering, or an
        empty list when nothing was found or the fetch failed.
    """
    # Guard against missing/blank titles: pandas yields float NaN for empty
    # cells, which would otherwise be sent to Wikipedia as a search query.
    if not isinstance(title, str) or not title.strip():
        print(f"Warning: Skipping invalid title {title!r}.")
        return []

    total_attempts = max(0, int(max_retries)) + 1

    for attempt in range(1, total_attempts + 1):
        try:
            # Initialize the loader for the given Wikipedia title
            loader = WikipediaLoader(query=title, load_max_docs=1, doc_content_chars_max=999999999)

            # load() returns a list of Documents
            docs = filter_complex_metadata(loader.load())

        except OSError as e:
            # Network-level failure (DNS, connection reset, timeout): these are
            # usually transient, so back off and try again before giving up.
            if attempt < total_attempts:
                time.sleep(retry_delay * attempt)
                continue
            print(f"Error fetching '{title}': {e}")
            return []

        except Exception as e:
            # Anything else (page lookup errors, parsing problems, ...) will not
            # be fixed by retrying, so report it once and move on.
            print(f"Error fetching '{title}': {e}")
            return []

        if docs:
            return docs

        print(f"Warning: No documents found for title '{title}'.")
        return []

    return []

In [3]:
# QID -> Wikipedia title mapping; the CSV has columns: QID, Wikipedia_Title
csv_file = "../data/qid_to_titles.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

# Accumulators for the fetch step: every fetched Document, and the number of
# titles that produced at least one Document
all_docs, successful_fetches = [], 0

In [4]:
# Number of worker threads used for the concurrent page fetches.
# Adjust based on your requirements and Colab's capabilities.
MAX_WORKERS = 20

# Read the title column directly instead of materialising every row via iterrows()
titles = df["Wikipedia_Title"].tolist()

# One slot per CSV row, so results can be reassembled in row order even though
# the futures complete in arbitrary order
docs_per_title = [None] * len(titles)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks, remembering which row each future belongs to
    future_to_position = {
        executor.submit(fetch_docs, title): position
        for position, title in enumerate(titles)
    }

    # Use tqdm to display the progress bar while results arrive
    for future in tqdm(as_completed(future_to_position), total=len(future_to_position), desc="Fetching Wikipedia pages"):
        docs_per_title[future_to_position[future]] = future.result()

# Rebuild the accumulators from scratch so that re-running this cell does not
# append duplicates to all_docs or inflate the success counter
all_docs = []
successful_fetches = 0
for docs in docs_per_title:
    if docs:
        all_docs.extend(docs)
        successful_fetches += 1

print(f"Successfully fetched {successful_fetches} documents out of {len(df)}.")

Fetching Wikipedia pages:   4%|▍         | 584/13891 [00:50<19:15, 11.52it/s]

Error fetching 'Michaela Pejzlová': HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?prop=extracts&explaintext=&exintro=&titles=Michaela+Pejzlov%C3%A1&format=json&action=query (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x1396b1bb0>: Failed to resolve 'en.wikipedia.org' ([Errno 8] nodename nor servname provided, or not known)"))




  lis = BeautifulSoup(html).find_all('li')
Fetching Wikipedia pages:   6%|▌         | 805/13891 [01:20<18:58, 11.49it/s]



Fetching Wikipedia pages:   7%|▋         | 914/13891 [01:27<11:55, 18.15it/s]

Error fetching 'Matúš Vallo': HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?prop=extracts&explaintext=&exintro=&titles=Mat%C3%BA%C5%A1+Vallo&format=json&action=query (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x13972bdf0>: Failed to resolve 'en.wikipedia.org' ([Errno 8] nodename nor servname provided, or not known)"))


Fetching Wikipedia pages:   9%|▊         | 1205/13891 [01:49<18:07, 11.67it/s]



Fetching Wikipedia pages:  10%|█         | 1429/13891 [02:08<17:50, 11.64it/s]



Fetching Wikipedia pages:  13%|█▎        | 1814/13891 [02:40<16:33, 12.16it/s]



Fetching Wikipedia pages:  19%|█▊        | 2597/13891 [03:40<09:54, 19.00it/s]



Fetching Wikipedia pages:  21%|██        | 2891/13891 [04:01<13:02, 14.05it/s]



Fetching Wikipedia pages:  21%|██        | 2908/13891 [04:03<13:00, 14.08it/s]



Fetching Wikipedia pages:  28%|██▊       | 3947/13891 [05:20<10:31, 15.74it/s]



Fetching Wikipedia pages:  39%|███▉      | 5403/13891 [07:03<08:39, 16.33it/s]



Fetching Wikipedia pages:  43%|████▎     | 6024/13891 [07:47<08:23, 15.63it/s]



Fetching Wikipedia pages:  50%|█████     | 7003/13891 [08:53<08:18, 13.83it/s]



Fetching Wikipedia pages:  51%|█████     | 7087/13891 [08:59<08:05, 14.02it/s]



Fetching Wikipedia pages:  67%|██████▋   | 9363/13891 [11:38<05:50, 12.93it/s]



Fetching Wikipedia pages:  71%|███████   | 9839/13891 [12:11<04:31, 14.90it/s]



Fetching Wikipedia pages:  73%|███████▎  | 10096/13891 [12:30<04:55, 12.86it/s]



Fetching Wikipedia pages:  75%|███████▌  | 10424/13891 [12:53<04:05, 14.12it/s]



Fetching Wikipedia pages:  78%|███████▊  | 10846/13891 [13:22<02:44, 18.48it/s]



Fetching Wikipedia pages:  78%|███████▊  | 10886/13891 [13:25<03:50, 13.04it/s]



Fetching Wikipedia pages:  80%|███████▉  | 11073/13891 [13:38<03:49, 12.27it/s]

Error fetching 'Free University of Berlin': HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?prop=extracts%7Crevisions&explaintext=&rvprop=ids&titles=Free+University+of+Berlin&format=json&action=query (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x13ae1aa60>: Failed to resolve 'en.wikipedia.org' ([Errno 8] nodename nor servname provided, or not known)"))


Fetching Wikipedia pages:  84%|████████▎ | 11607/13891 [14:16<02:20, 16.29it/s]



Fetching Wikipedia pages:  84%|████████▎ | 11630/13891 [14:17<02:07, 17.69it/s]



Fetching Wikipedia pages:  85%|████████▌ | 11864/13891 [14:34<02:05, 16.12it/s]



Fetching Wikipedia pages:  86%|████████▌ | 11904/13891 [14:37<03:05, 10.74it/s]



Fetching Wikipedia pages:  89%|████████▉ | 12394/13891 [15:11<01:19, 18.87it/s]



Fetching Wikipedia pages:  92%|█████████▏| 12774/13891 [15:36<01:11, 15.53it/s]



Fetching Wikipedia pages:  99%|█████████▉| 13754/13891 [16:48<00:10, 13.33it/s]



Fetching Wikipedia pages: 100%|██████████| 13891/13891 [16:57<00:00, 13.65it/s]

Successfully fetched 13863 documents out of 13891.





In [5]:
# Persist the fetched documents so that later steps can reuse them without
# repeating the download. NOTE: pickle files must only be loaded from trusted
# sources, since unpickling can execute arbitrary code.
ALL_DOCS_PATH = "../data/all_docs.pkl"

with open(ALL_DOCS_PATH, "wb") as f:
    pickle.dump(all_docs, f)

# Report success only after the `with` block has closed (and flushed) the file
print(f"Successfully saved all_docs to {ALL_DOCS_PATH}")

Successfully saved all_docs to ../data/all_docs.pkl
