# Extracting and Embedding Wikipedia Data

## Overview
The parquet file holding the 10,000 Wikipedia articles contains 700,000+ rows of text data. This demo notebook embeds only a small random sample of those rows (1,000) using BERT, then runs a vector similarity search over the resulting embeddings using FAISS. A full BERT -> FAISS pipeline would additionally need to group the rows of text with their source articles and embed all of them efficiently.

In [1]:
import os
import time
import torch
from transformers import BertTokenizer, BertModel
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Fetch the pretrained uncased BERT checkpoint: tokenizer first, then the weights
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Prefer CUDA whenever torch reports a usable GPU; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model = model.to(device)

# Inference only: switch to evaluation mode.
# As the last expression of the cell, this also displays the model's structure.
model.eval()

cpu


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [3]:
def get_total_rows(parquet_file_path):
    """
    Return the number of rows stored in a parquet file.

    Only the file's metadata is read, so the count is obtained without
    loading any column data into memory.

    Args:
        parquet_file_path: Path to the parquet file.

    Returns:
        The total number of rows recorded in the file's metadata.
    """
    return pq.read_metadata(parquet_file_path).num_rows

def sample_parquet_data(parquet_file_path, sample_size, seed=None):
    """
    Load a parquet file and return a random sample of its rows as a DataFrame.

    The file is read exactly once; the row count is taken from the loaded
    table rather than from a separate read.

    Args:
        parquet_file_path: Path to the parquet file.
        sample_size: Number of rows to sample. If it is greater than or equal
            to the number of rows in the file, every row is returned.
        seed: Optional seed for reproducible sampling. When None (the
            default), NumPy's global random state is used.

    Returns:
        pandas.DataFrame with the sampled rows, kept in their original file
        order (or all rows when sample_size covers the whole file).
    """
    table = pq.read_table(parquet_file_path)
    total_rows = table.num_rows

    if sample_size >= total_rows:
        print(f"Sample size {sample_size} is >= total rows {total_rows}. Processing all data.")
        return table.to_pandas()

    print(f"Sampling {sample_size} rows from {total_rows} total rows...")
    # A dedicated generator gives repeatable samples when a seed is supplied;
    # without one, fall back to the global RNG.
    rng = np.random if seed is None else np.random.default_rng(seed)
    # Draw distinct row positions and sort them so the sample keeps file order
    sampled_indices = np.sort(rng.choice(total_rows, size=sample_size, replace=False))
    return table.take(sampled_indices).to_pandas()


In [4]:
def inspect_data(df):
    """
    Print a summary of a dataframe: shape, columns, dtypes, head and one sample value.

    The sample value is the cell in the first row of the first column, which
    is assumed to hold the text. An empty dataframe is reported instead of
    raising an IndexError.

    Args:
        df: pandas.DataFrame to describe.

    Returns:
        None. All information is written to standard output.
    """
    print("\nData Inspection:")
    print(f"Shape of the dataframe: {df.shape}")
    print("\nColumn names:")
    print(df.columns.tolist())
    print("\nData types:")
    print(df.dtypes)
    print("\nFirst few rows:")
    print(df.head())
    print("\nSample of text data:")
    n_rows, n_cols = df.shape
    if n_rows == 0 or n_cols == 0:
        # df.iloc[0, 0] would raise IndexError when there is no first cell
        print("(no data to sample)")
    else:
        print(df.iloc[0, 0])  # Assuming text is in the first column


In [5]:
def embed_text_batch(text_batch, batch_size=32):
    """
    Embed a list of texts with the notebook-level `tokenizer` and `model`.

    Texts are tokenized and run through the model in chunks of `batch_size`.
    For each text, the hidden state at sequence position 0 (the [CLS] token
    for the BERT tokenizer) is kept as its embedding.

    Args:
        text_batch: List of strings to embed.
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        numpy.ndarray of shape (len(text_batch), hidden_size). An empty input
        yields an array of shape (0, hidden_size).

    Raises:
        ValueError: If batch_size is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(text_batch) == 0:
        # np.concatenate rejects an empty list, so return a well-shaped empty result
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # The tokenizer produces CPU tensors; they must follow the model to its device
    target_device = model.device
    embeddings = []
    with torch.no_grad():
        for start in range(0, len(text_batch), batch_size):
            batch = text_batch[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Bring the result back to host memory before converting to NumPy
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embedding)
    return np.concatenate(embeddings)


In [6]:
def process_and_embed_data(parquet_file_path, output_file_path, sample_size=10000, embedding_batch_size=32, confirm=True):
    """
    Sample rows from a parquet file, embed their text and save the embeddings.

    Steps: sample the data, print an inspection summary, optionally ask the
    user to confirm, embed the first column's values batch by batch, then
    write the stacked embeddings to disk with np.save.

    Args:
        parquet_file_path: Path to the parquet file to sample from.
        output_file_path: Destination passed to np.save for the embeddings.
        sample_size: Number of rows to sample.
        embedding_batch_size: Number of texts embedded per batch. Must be >= 1.
        confirm: When True (the default), prompt on stdin before embedding.
            Pass False for unattended runs such as "Restart & Run All".

    Returns:
        None. Nothing is saved if the user declines or the sample holds no texts.

    Raises:
        ValueError: If embedding_batch_size is less than 1.
    """
    # Fail before any expensive work rather than after the prompt
    if embedding_batch_size < 1:
        raise ValueError(f"embedding_batch_size must be a positive integer, got {embedding_batch_size}")

    print("Sampling and loading data...")
    start_time = time.time()
    sampled_data = sample_parquet_data(parquet_file_path, sample_size)
    print(f"Data sampled and loaded in {time.time() - start_time:.2f} seconds")

    inspect_data(sampled_data)

    if confirm:
        user_input = input("\nDo you want to continue with the embedding process? (yes/no): ").strip().lower()
        if user_input not in ('yes', 'y'):
            print("Embedding process cancelled.")
            return

    texts = sampled_data.iloc[:, 0].tolist()  # Assuming text is in the first column
    if not texts:
        # np.concatenate below cannot handle an empty list of batches
        print("No texts to embed; nothing was saved.")
        return

    all_embeddings = []
    print(f"Embedding {len(texts)} texts...")
    for start in tqdm(range(0, len(texts), embedding_batch_size), desc="Processing batches"):
        batch = texts[start:start + embedding_batch_size]
        all_embeddings.append(embed_text_batch(batch, batch_size=embedding_batch_size))

    print("Concatenating all embeddings...")
    final_embeddings = np.concatenate(all_embeddings)

    print(f"Saving embeddings to {output_file_path}...")
    np.save(output_file_path, final_embeddings)
    print(f"Embeddings saved. Shape of embeddings: {final_embeddings.shape}")


In [7]:
# Usage
# Input is resolved to an absolute path; the embeddings go to a .npy file in the working directory
parquet_file_path = os.path.abspath('../../src/data/simpleWikiData.parquet')
output_file_path = 'wikipedia_embeddings_sample.npy'
print(f"Starting process with parquet file: {parquet_file_path}")

# Tunable parameters
sample_size = 1000  # how many rows to sample and embed
embedding_batch_size = 32  # how many texts are embedded per batch

process_and_embed_data(
    parquet_file_path,
    output_file_path,
    sample_size=sample_size,
    embedding_batch_size=embedding_batch_size,
)


Starting process with parquet file: c:\Users\chand\ACME10-HE-RAGApp\src\data\simpleWikiData.parquet
Sampling and loading data...
Sampling 1000 rows from 769764 total rows...
Data sampled and loaded in 0.16 seconds

Data Inspection:
Shape of the dataframe: (1000, 1)

Column names:
['text']

Data types:
text    object
dtype: object

First few rows:
                                                text
0  This kind of intensive agriculture comes with ...
1  Farmers select plants with better yield, taste...
2  As of 2004, there are thirty-four provinces. E...
3  E2 users create pages called "nodes" and add s...
4  Not all paradoxes are true logical paradoxes, ...

Sample of text data:
This kind of intensive agriculture comes with its own set of problems. Farmers use a lot of chemical fertilizers, pesticides (chemicals that kill bugs), and herbicides (chemicals that kill weeds). These chemicals can pollute the soil or the water. They can also create bugs and weeds that are more resistant to 

Processing batches: 100%|██████████| 32/32 [01:26<00:00,  2.70s/it]

Concatenating all embeddings...
Saving embeddings to wikipedia_embeddings_sample.npy...
Embeddings saved. Shape of embeddings: (1000, 768)





### Simple Similarity Search Example

In [2]:
import numpy as np

# Read back the embedding matrix written by the embedding step
output_file_path = 'wikipedia_embeddings_sample.npy'
embeddings = np.load(output_file_path)

# Report the loaded array's shape
print(f"Loaded embeddings shape: {embeddings.shape}")

Loaded embeddings shape: (1000, 768)


In [3]:
import faiss

# Embeddings are expected as (num_vectors, dim); the last axis is the vector dimensionality
dim = embeddings.shape[-1]

# Flat FAISS index that compares vectors by L2 distance
index = faiss.IndexFlatL2(dim)

# Add every embedding to the index
index.add(embeddings)

In [4]:
# Use the first embedding as the query, kept two-dimensional with shape (1, dim)
query_vector = embeddings[0:1]

# How many nearest neighbors to retrieve (increase to see more)
k = 5

# Search the index for the k closest vectors to the query
distances, indices = index.search(query_vector, k)

# Show which vectors matched and how far away they are
print("Nearest neighbors (indices):", indices)
print("Distances to nearest neighbors:", distances)


Nearest neighbors (indices): [[ 0  1 58  7 30]]
Distances to nearest neighbors: [[ 0.      71.1909  71.89008 76.17949 76.93776]]
