In [1]:
#Load all chunks into a dataframe, text as one column, id is the other
#Create a variable with all the things broken up by page number in pdf view
#Search chunk through the text variable (can either break by list or just raw text variable)
#Label the chunk to the pages it covers
#When extract top 5 ids, run it through the pages and print them out at the end

In [47]:
from pinecone import Pinecone
import os
import openai
from dotenv import load_dotenv
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
import pandas as pd


# Initialize environment and API clients
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")



In [5]:
# Define the path to your PDF file
pdf_path = 'allAsha.pdf'
output_text_path = 'output_text.txt'

# Open the PDF file
with open(pdf_path, 'rb') as file:
    # Create PDF Reader object
    reader = PyPDF2.PdfReader(file)
    
    # Open an output text file to save the extracted text
    with open(output_text_path, 'w', encoding='utf-8') as output_file:
        # Iterate through each page in the PDF
        for page_number in range(len(reader.pages)):
            try:
                # Extract the text from each page
                page = reader.pages[page_number]
                text = page.extract_text()
                
                # Write the extracted text to the output file
                if text:
                    output_file.write(text.strip() + "\n\n")
            except Exception as e:
                print(f"Error reading page {page_number + 1}: {e}")
            
            # Write the page indicator in the desired format
            output_file.write(f'PAGE {page_number + 1}\n\n')

print("Text extraction complete. The output has been saved in", output_text_path)

Text extraction complete. The output has been saved in output_text.txt


In [27]:
def getIndex():
    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    index = pc.Index("final-asha")
    return index

def get_embedding(text):
    response = openai.Embedding.create(input=text, engine="text-embedding-3-small")
    return response['data'][0]['embedding']

def getRes(query_embedding, index):
    res = index.query(vector=query_embedding, top_k=211, include_metadata=True)
    return res

def vectorQuotes(query_embedding, index):
    similarity = getRes(query_embedding, index)
    for entry in similarity['matches']:
        print(entry['id'])
    return [{"text": match['metadata']['text'], "id": match['id']} for match in similarity['matches']]

In [None]:
statement = "How is your day"
embedding = get_embedding(statement)
index = getIndex()
quotes = vectorQuotes(embedding, index)
print(quotes)


In [30]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 211}},
 'total_vector_count': 211}

In [11]:

# Create a pandas DataFrame from the quotes list
df = pd.DataFrame(quotes)

df.head()

Unnamed: 0,text,id
0,day must contribute to your overall goals in s...,114
1,"your needs after that of other people, to lear...",113
2,Hospital/Private Hospital 4 a) Nature of deliv...,184
3,Birth_________________________________2) Pre-t...,185
4,and do not interrupt the woman while she is sp...,152


In [22]:

# Load the extracted text file
with open(output_text_path, 'r', encoding='utf-8') as file:
    full_text = file.read()

# Split the text into pages using the page markers
pages = re.split(r'PAGE \d+\n\n', full_text)

# Function to find the pages each quote spans using cosine similarity
def find_citations(quote_text, pages):
    citations = []
    vectorizer = TfidfVectorizer().fit_transform([quote_text] + pages)
    cosine_similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
    
    for i, similarity in enumerate(cosine_similarities):
        if similarity > 0.8:  # Threshold for considering text as similar
            citations.append(i + 1)
    return citations

# Add a new column 'citations' to the DataFrame
df['citations'] = df['text'].apply(lambda x: find_citations(x, pages))

df.head()

Unnamed: 0,text,id,citations
0,day must contribute to your overall goals in s...,114,[]
1,"your needs after that of other people, to lear...",113,[219]
2,Hospital/Private Hospital 4 a) Nature of deliv...,184,"[337, 339]"
3,Birth_________________________________2) Pre-t...,185,"[339, 341]"
4,and do not interrupt the woman while she is sp...,152,[285]


In [21]:
print(df['citations'])

0         []
1         []
2      [339]
3      [341]
4         []
       ...  
206       []
207       []
208       []
209       []
210       []
Name: citations, Length: 211, dtype: object


In [31]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv("ALTERNATE_PINECONE"))
index = pc.Index("asha-done")

In [33]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [40]:
import re
from typing import List, Dict
import math
from tqdm import tqdm

def process_and_chunk_text(file_path: str, tokens_per_chunk: int = 500, overlap_percent: float = 0.1) -> List[Dict[str, str]]:
    """
    Read entire text file and chunk it into segments with overlap.
    
    Args:
        file_path (str): Path to the text file
        tokens_per_chunk (int): Number of tokens per chunk
        overlap_percent (float): Percentage of overlap between chunks
        
    Returns:
        List[Dict[str, str]]: List of dictionaries containing chunks and their IDs
    """
    print(f"Reading file: {file_path}")
    # Read entire file content
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            full_text = file.read()
        print(f"Successfully read file. Total characters: {len(full_text)}")
        
        # Print the entire text
        print("\n========== FULL TEXT CONTENT ==========")
        print(full_text)
        print("======================================\n")
        
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return []

    # Calculate words per chunk (assuming 0.75 tokens per word)
    words_per_chunk = math.ceil(tokens_per_chunk / 0.75)
    overlap_words = math.ceil(words_per_chunk * overlap_percent)
    
    # Split text into words while preserving newlines
    print("Processing text into words...")
    words = []
    lines = full_text.split('\n')
    
    with tqdm(total=len(lines), desc="Processing lines") as pbar:
        for line in lines:
            if not line.strip():
                words.append('\n')
            else:
                words.extend(line.split())
                words.append('\n')
            pbar.update(1)
    
    # Calculate total expected chunks
    total_words = len(words)
    expected_chunks = math.ceil(total_words / (words_per_chunk - overlap_words))
    
    print(f"\nTotal words: {total_words}")
    print(f"Estimated chunks to create: {expected_chunks}")
    print("Creating chunks...")
    
    chunks = []
    current_pos = 0
    chunk_id = 1
    
    # Process chunks with progress bar
    with tqdm(total=expected_chunks, 
             desc="Generating chunks", 
             unit='chunk',
             bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') as pbar:
        while current_pos < len(words):
            # Calculate end position for current chunk
            end_pos = min(current_pos + words_per_chunk, len(words))
            
            # Look ahead to check if we're splitting a "PAGE X" sequence
            look_ahead = 10
            for i in range(end_pos, min(end_pos + look_ahead, len(words))):
                if i > 0 and words[i-1] == "PAGE":
                    # If we find "PAGE", include its number in the current chunk
                    end_pos = i + 1
                    break
            
            # Look backwards if we're about to split "PAGE X"
            if end_pos < len(words) and words[end_pos-1] == "PAGE":
                end_pos -= 1
            
            # Create chunk text
            chunk_text = ' '.join(words[current_pos:end_pos])
            
            # Clean up formatting
            chunk_text = re.sub(r'\n\s*\n', '\n\n', chunk_text)
            chunk_text = re.sub(r' +', ' ', chunk_text)
            chunk_text = chunk_text.replace(' \n', '\n').replace('\n ', '\n')
            
            # Add chunk to results
            chunks.append({
                "text": chunk_text.strip(),
                "id": chunk_id
            })
            
            # Move position for next chunk, accounting for overlap
            current_pos = end_pos - overlap_words
            chunk_id += 1
            
            # Update progress bar with detailed information
            pbar.set_postfix({
                'Chunk': f'{chunk_id-1}/{expected_chunks}',
                'Words': f'{min(end_pos, len(words))}/{total_words}'
            }, refresh=True)
            pbar.update(1)
    
    print(f"\nChunking complete. Total chunks created: {len(chunks)}")
    return chunks, full_text  # Now also returning the full text

# Example usage:
if __name__ == "__main__":
    file_path = "output_text.txt"
    chunks, full_text = process_and_chunk_text(file_path)
    
    # Save full text to a separate file
    print("\nSaving full text to 'full_text_content.txt'...")
    with open('full_text_content.txt', 'w', encoding='utf-8') as f:
        f.write(full_text)
    
    # Optionally print chunks
    if chunks:
        print("\nWould you like to print the individual chunks? (y/n)")
        if input().lower() == 'y':
            print("\nPrinting chunks...")
            for chunk in tqdm(chunks, desc="Printing chunks", unit='chunk'):
                print(f"\nChunk {chunk['id']}:")
                print("-" * 50)
                print(chunk['text'])
                print("-" * 50)

Reading file: output_text.txt
Successfully read file. Total characters: 536680

MINISTRY OF HEALTH AND FAMILY WELFARE
GOVERNMENT OF INDIAMINISTRY OF HEALTH AND FAMILY WELFARE
GOVERNMENT OF INDIA
Book No-1Book No-1
                  
      forASHAASHAReading
Material

PAGE 1

S.No. Chapter Name Pages
Preface 2
Acknowledgement 4
1. National Rural Health Mission–The Vision 5
2. Introduction: The Asha Learning Programme 6
3. ASHA: My Eight Tasks 3
4. Staying Healthy 9
5. Personal Hygiene 12
6. Water Safety at Home 15
7. Disposal of Waste Water 17
8. Our Health Depends on Food We Eat! 19
9. Body Mapping and Overview of Organs 22
10. Women and Health 25
11. Organising a Group Meeting 27
12. Know Health Services 29
13. Anganwadi Centre 32
14. Village Birth Attendant or Dai 34
15. Illness and Healing 36

PAGE 2

S.No. Chapter Name Pages
16. Using Remedies 38
17. Home Remedies 40
18. Preventing Unwanted Pregnancies 43
19. Condoms: An Option for Men 45
20. Registration of Pregnant Women 46
21. J

Processing lines: 100%|██████████| 16458/16458 [00:00<00:00, 497777.95it/s]



Total words: 96489
Estimated chunks to create: 161
Creating chunks...


Generating chunks: |                    | 3543231/? [46:34<00:00, 1268.12chunk/s, Chunk=3543232/161, Words=96489/96489]


KeyboardInterrupt: 

In [42]:
def file_to_string(filename):
    """
    Reads a text file and converts its contents into a single string.
    
    Args:
        filename (str): Path to the text file to be read
        
    Returns:
        str: Contents of the file as a single string
        
    Raises:
        FileNotFoundError: If the specified file doesn't exist
        IOError: If there's an error reading the file
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # Read entire file and store as string
            content_string = file.read()
        return content_string
    except FileNotFoundError:
        raise FileNotFoundError(f"The file '{filename}' was not found")
    except IOError as e:
        raise IOError(f"Error reading the file: {str(e)}")

# Example usage
if __name__ == "__main__":
    try:
        result = file_to_string('output_text.txt')
        print("File contents:", result)
    except Exception as e:
        print(f"An error occurred: {str(e)}")

File contents: MINISTRY OF HEALTH AND FAMILY WELFARE
GOVERNMENT OF INDIAMINISTRY OF HEALTH AND FAMILY WELFARE
GOVERNMENT OF INDIA
Book No-1Book No-1
                  
      forASHAASHAReading
Material

PAGE 1

S.No. Chapter Name Pages
Preface 2
Acknowledgement 4
1. National Rural Health Mission–The Vision 5
2. Introduction: The Asha Learning Programme 6
3. ASHA: My Eight Tasks 3
4. Staying Healthy 9
5. Personal Hygiene 12
6. Water Safety at Home 15
7. Disposal of Waste Water 17
8. Our Health Depends on Food We Eat! 19
9. Body Mapping and Overview of Organs 22
10. Women and Health 25
11. Organising a Group Meeting 27
12. Know Health Services 29
13. Anganwadi Centre 32
14. Village Birth Attendant or Dai 34
15. Illness and Healing 36

PAGE 2

S.No. Chapter Name Pages
16. Using Remedies 38
17. Home Remedies 40
18. Preventing Unwanted Pregnancies 43
19. Condoms: An Option for Men 45
20. Registration of Pregnant Women 46
21. Janani Suraksha Yojana 48
22. HIV and AIDS 49
23. Breast Feeding 5

In [43]:
from tqdm import tqdm

def create_chunks_with_overlap(text, chunk_size=375, overlap_percentage=10):
    """
    Creates chunks of text with specified overlap, ensuring "PAGE X" stays together.
    Shows progress using tqdm.
    
    Args:
        text (str): Input text to be chunked
        chunk_size (int): Number of words per chunk (default: 375)
        overlap_percentage (int): Percentage of overlap between chunks (default: 10)
        
    Returns:
        list: List of text chunks with specified overlap
    """
    # Split the text into words
    words = text.split()
    
    # Calculate overlap size
    overlap_size = int(chunk_size * (overlap_percentage / 100))
    stride = chunk_size - overlap_size
    
    # Calculate total number of chunks (approximate)
    total_chunks = max(1, len(words) // stride)
    
    chunks = []
    start_idx = 0
    
    # Create progress bar
    with tqdm(total=total_chunks, desc="Creating chunks", unit="chunk") as pbar:
        while start_idx < len(words):
            # Calculate initial end index for this chunk
            end_idx = min(start_idx + chunk_size, len(words))
            
            # Check if we need to adjust for "PAGE" keyword
            if end_idx < len(words):
                # Look at the last few words of the current chunk
                for i in range(max(end_idx - 5, start_idx), end_idx):
                    if i + 1 < len(words) and words[i] == "PAGE" and words[i + 1].isdigit():
                        # If "PAGE" would be split, adjust end_idx to include both words
                        if i == end_idx - 1:
                            end_idx = i + 2
                        # If "PAGE" would be split between chunks, move it to next chunk
                        elif i == end_idx - 2:
                            end_idx = i
                        break
            
            # Create the chunk
            chunk = " ".join(words[start_idx:end_idx])
            chunks.append(chunk)
            
            # Move start_idx for next chunk, ensuring we don't split "PAGE X"
            start_idx += stride
            
            # If we're near "PAGE", adjust start_idx
            if start_idx < len(words) - 1:
                while start_idx < len(words) - 1 and words[start_idx - 1] == "PAGE" and words[start_idx].isdigit():
                    start_idx += 1
            
            # Update progress bar
            pbar.update(1)
    
    return chunks

# Example usage
if __name__ == "__main__":
    # Example text with PAGE numbers
    sample_text = result
    
    try:
        # Create chunks from the sample text
        chunks = create_chunks_with_overlap(sample_text)
        
        # Print chunks with their lengths
        for i, chunk in enumerate(chunks):
            print(f"\nChunk {i + 1} ({len(chunk.split())} words):")
            print(chunk)
            print("-" * 80)
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Creating chunks: 237chunk [00:00, 29620.08chunk/s]         


Chunk 1 (375 words):
MINISTRY OF HEALTH AND FAMILY WELFARE GOVERNMENT OF INDIAMINISTRY OF HEALTH AND FAMILY WELFARE GOVERNMENT OF INDIA Book No-1Book No-1 forASHAASHAReading Material PAGE 1 S.No. Chapter Name Pages Preface 2 Acknowledgement 4 1. National Rural Health Mission–The Vision 5 2. Introduction: The Asha Learning Programme 6 3. ASHA: My Eight Tasks 3 4. Staying Healthy 9 5. Personal Hygiene 12 6. Water Safety at Home 15 7. Disposal of Waste Water 17 8. Our Health Depends on Food We Eat! 19 9. Body Mapping and Overview of Organs 22 10. Women and Health 25 11. Organising a Group Meeting 27 12. Know Health Services 29 13. Anganwadi Centre 32 14. Village Birth Attendant or Dai 34 15. Illness and Healing 36 PAGE 2 S.No. Chapter Name Pages 16. Using Remedies 38 17. Home Remedies 40 18. Preventing Unwanted Pregnancies 43 19. Condoms: An Option for Men 45 20. Registration of Pregnant Women 46 21. Janani Suraksha Yojana 48 22. HIV and AIDS 49 23. Breast Feeding 51 24. Infant Nutrition




In [44]:
print(chunks)



In [45]:
import pandas as pd
from tqdm import tqdm

def create_data_structures(chunks):
    """
    Converts a list of text chunks into both a pandas DataFrame and a list of dictionaries.
    
    Args:
        chunks (list): List of text strings (chunks)
        
    Returns:
        tuple: (pandas DataFrame, list of dictionaries)
    """
    # Create the list of dictionaries with progress bar
    data = [
        {"id": i + 1, "text": chunk}
        for i, chunk in tqdm(enumerate(chunks), total=len(chunks), desc="Creating data structures")
    ]
    
    # Create pandas DataFrame from the data
    df = pd.DataFrame(data)
    
    return df, data

if __name__ == "__main__":
    # Example usage with sample chunks
    sample_chunks = chunks
    
    try:
        # Create both data structures
        df, data = create_data_structures(sample_chunks)
        
        # Display the first few rows of the DataFrame
        print("\nFirst few rows of the DataFrame:")
        print(df.head())
        
        # Display the first few items in the data list
        print("\nFirst few items in the data list:")
        for item in data[:2]:
            print(item)
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Creating data structures: 100%|██████████| 237/237 [00:00<?, ?it/s]


First few rows of the DataFrame:
   id                                               text
0   1  MINISTRY OF HEALTH AND FAMILY WELFARE GOVERNME...
1   2  for about two days once every alternate month....
2   3  organising thereading material. Dr. Pramod Sam...
3   4  health like sanitation & hygiene, nutrition an...
4   5  of seven days, you will have acquired prelimin...

First few items in the data list:
{'id': 1, 'text': 'MINISTRY OF HEALTH AND FAMILY WELFARE GOVERNMENT OF INDIAMINISTRY OF HEALTH AND FAMILY WELFARE GOVERNMENT OF INDIA Book No-1Book No-1 forASHAASHAReading Material PAGE 1 S.No. Chapter Name Pages Preface 2 Acknowledgement 4 1. National Rural Health Mission–The Vision 5 2. Introduction: The Asha Learning Programme 6 3. ASHA: My Eight Tasks 3 4. Staying Healthy 9 5. Personal Hygiene 12 6. Water Safety at Home 15 7. Disposal of Waste Water 17 8. Our Health Depends on Food We Eat! 19 9. Body Mapping and Overview of Organs 22 10. Women and Health 25 11. Organising a Grou




In [52]:
print(data)





In [54]:
import pinecone
import openai
import time
import os
from tqdm import tqdm
from pinecone import Pinecone

def get_embeddings(texts, model="text-embedding-3-small", batch_size=100):
    """
    Get embeddings from OpenAI API in batches.
    """
    embeddings = []
    total_texts = len(texts)
    
    total_batches = (total_texts + batch_size - 1) // batch_size
    with tqdm(total=total_texts, desc="Creating embeddings", unit="texts") as pbar:
        for i in range(0, total_texts, batch_size):
            batch = texts[i:min(i + batch_size, total_texts)]
            batch_size_actual = len(batch)
            
            try:
                response = openai.Embedding.create(
                    input=batch,
                    model=model
                )
                batch_embeddings = [item.embedding for item in response.data]
                embeddings.extend(batch_embeddings)
            except Exception as e:
                print(f"\nError creating embeddings for batch {i//batch_size + 1}/{total_batches}: {str(e)}")
                embeddings.extend([None] * batch_size_actual)
            
            pbar.update(batch_size_actual)
            time.sleep(0.1)  # Rate limit protection
    
    return embeddings

def upsert_to_pinecone(data, index, embedding_model="text-embedding-3-small", batch_size=100):
    """
    Upserts text chunks to Pinecone index with OpenAI embeddings.
    """
    # Create main progress bar for overall process
    with tqdm(total=3, desc="Overall progress", position=0) as main_pbar:
        
        # Step 1: Prepare embeddings
        main_pbar.set_description("Creating embeddings")
        texts = [d['text'] for d in data]
        embeddings = get_embeddings(texts, model=embedding_model, batch_size=batch_size)
        main_pbar.update(1)
        
        # Step 2: Prepare vectors
        main_pbar.set_description("Preparing vectors")
        valid_vectors = []
        for d, emb in tqdm(zip(data, embeddings), 
                          total=len(data), 
                          desc="Processing vectors", 
                          position=1, 
                          leave=False):
            if emb is not None:
                valid_vectors.append({
                    "id": str(d['id']),
                    "values": emb,
                    "metadata": {'text': d['text']}
                })
        main_pbar.update(1)
        
        if not valid_vectors:
            raise ValueError("No valid embeddings were created")
        
        # Step 3: Upsert to Pinecone
        main_pbar.set_description("Upserting to Pinecone")
        total_vectors = len(valid_vectors)
        failed_batches = 0
        
        print("\nChecking index status...")
        # Removed index status check as it's not needed in new Pinecone client
        
        with tqdm(total=total_vectors, 
                 desc="Upserting vectors", 
                 unit="vectors",
                 position=1,
                 leave=False) as upsert_pbar:
            
            for i in range(0, total_vectors, batch_size):
                batch = valid_vectors[i:i + batch_size]
                batch_size_actual = len(batch)
                
                try:
                    index.upsert(
                        vectors=batch,
                        namespace="ns1"
                    )
                except Exception as e:
                    failed_batches += 1
                    print(f"\nError upserting batch {i//batch_size + 1}: {str(e)}")
                    continue
                finally:
                    upsert_pbar.update(batch_size_actual)
        
        main_pbar.update(1)
    
    # Final statistics
    print("\nUpsert Summary:")
    print(f"Total vectors processed: {total_vectors}")
    print(f"Successfully upserted: {total_vectors - (failed_batches * batch_size)}")
    if failed_batches > 0:
        print(f"Failed batches: {failed_batches}")
    if len(valid_vectors) < len(data):
        print(f"Skipped vectors due to failed embeddings: {len(data) - len(valid_vectors)}")

def main(data):
    # Initialize Pinecone with new client
    print("Initializing Pinecone...")
    pc = Pinecone(api_key=os.getenv("ALTERNATE_PINECONE"))
    index = pc.Index("asha-done")
    
    try:
        upsert_to_pinecone(data, index)
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")

if __name__ == "__main__":
    data = data  # Your data should be defined before this point
    main(data)

Initializing Pinecone...


Creating embeddings: 100%|██████████| 237/237 [00:05<00:00, 40.55texts/s]
Upserting to Pinecone:  67%|██████▋   | 2/3 [00:05<00:05,  5.85s/it]


Checking index status...


Upserting to Pinecone: 100%|██████████| 3/3 [00:10<00:00,  3.38s/it]


Upsert Summary:
Total vectors processed: 237
Successfully upserted: 237





In [55]:
import pandas as pd
import re
from tqdm import tqdm

def extract_page_numbers(text):
    """
    Extracts all page numbers from a text chunk that follow the pattern 'PAGE X'
    
    Args:
        text (str): The text chunk to analyze
        
    Returns:
        list: List of page numbers found in the chunk
    """
    # Find all instances of "PAGE" followed by a number
    pattern = r'PAGE\s+(\d+)'
    matches = re.finditer(pattern, text)
    page_numbers = [int(match.group(1)) for match in matches]
    return page_numbers

def add_page_numbers_column(df):
    """
    Adds a new column to the DataFrame containing page numbers found in each chunk
    
    Args:
        df (pandas.DataFrame): DataFrame with 'text' column containing chunks
        
    Returns:
        pandas.DataFrame: DataFrame with new 'page_numbers' column
    """
    # Create the new column with progress bar
    print("Extracting page numbers from chunks...")
    tqdm.pandas()
    df['page_numbers'] = df['text'].progress_apply(extract_page_numbers)
    
    # Print summary statistics
    total_chunks = len(df)
    chunks_with_pages = len(df[df['page_numbers'].str.len() > 0])
    total_page_numbers = sum(len(pages) for pages in df['page_numbers'])
    
    print("\nPage Number Extraction Summary:")
    print(f"Total chunks processed: {total_chunks}")
    print(f"Chunks containing page numbers: {chunks_with_pages}")
    print(f"Total page numbers found: {total_page_numbers}")
    
    return df

if __name__ == "__main__":
    # Assuming df is your DataFrame with 'id' and 'text' columns
    try:
        # Process the DataFrame
        df_with_pages = add_page_numbers_column(df)
        
        # Display sample results
        print("\nSample results:")
        sample_rows = df_with_pages[['id', 'page_numbers']].head()
        print(sample_rows)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Extracting page numbers from chunks...


100%|██████████| 237/237 [00:00<00:00, 47426.05it/s]


Page Number Extraction Summary:
Total chunks processed: 237
Chunks containing page numbers: 236
Total page numbers found: 473

Sample results:
   id  page_numbers
0   1  [1, 2, 3, 4]
1   2     [4, 5, 6]
2   3        [7, 8]
3   4       [9, 10]
4   5          [11]





In [56]:
df_with_pages.head()

Unnamed: 0,id,text,page_numbers
0,1,MINISTRY OF HEALTH AND FAMILY WELFARE GOVERNME...,"[1, 2, 3, 4]"
1,2,for about two days once every alternate month....,"[4, 5, 6]"
2,3,organising thereading material. Dr. Pramod Sam...,"[7, 8]"
3,4,"health like sanitation & hygiene, nutrition an...","[9, 10]"
4,5,"of seven days, you will have acquired prelimin...",[11]


In [57]:
df = df_with_pages

In [58]:
# Method 1: Using mean with list comprehension
average_length = sum(len(x) for x in df['page_numbers']) / len(df)

# Method 2: Using pandas apply
average_length = df['page_numbers'].apply(len).mean()

print(f"Average number of page numbers per chunk: {average_length:.2f}")

# To also get additional statistics:
page_number_lengths = df['page_numbers'].apply(len)
print("\nPage Numbers per Chunk Statistics:")
print(f"Mean: {page_number_lengths.mean():.2f}")
print(f"Median: {page_number_lengths.median():.2f}")
print(f"Max: {page_number_lengths.max()}")
print(f"Min: {page_number_lengths.min()}")

Average number of page numbers per chunk: 2.00

Page Numbers per Chunk Statistics:
Mean: 2.00
Median: 2.00
Max: 11
Min: 0


In [59]:
# Method 1: Using list comprehension and flatten the lists
max_page = max(page_num for sublist in df['page_numbers'] for page_num in sublist)

# Method 2: Using pandas explode (more pandas-like approach)
max_page = df['page_numbers'].explode().max()

# Print both max page and some context
print(f"Maximum page number found: {max_page}")

# To get more details about the distribution:
all_pages = df['page_numbers'].explode()
print("\nPage Number Statistics:")
print(f"Max page: {all_pages.max()}")
print(f"Min page: {all_pages.min()}")
print(f"Unique pages found: {len(all_pages.unique())}")

Maximum page number found: 430

Page Number Statistics:
Max page: 430
Min page: 1
Unique pages found: 431


In [61]:
# First, let's see where this nan appears in our DataFrame
print("Chunks with nan page numbers:")
nan_chunks = df[df['page_numbers'].apply(lambda x: any(pd.isna(num) for num in x))]
print(nan_chunks[['id', 'text', 'page_numbers']])

# Clean up the page_numbers column by removing nans
def clean_page_numbers(page_list):
    return [x for x in page_list if pd.notna(x)]

# Apply the cleaning
df['page_numbers'] = df['page_numbers'].apply(clean_page_numbers)

# Verify the cleanup
print("\nAfter cleaning:")
unique_pages = sorted(df['page_numbers'].explode().unique())
print(f"Number of unique pages after cleaning: {len(unique_pages)}")
print(f"Maximum page number: {max(unique_pages)}")
print(f"Minimum page number: {min(unique_pages)}")

# Double check there are no more nans
nan_check = df[df['page_numbers'].apply(lambda x: any(pd.isna(num) for num in x))]
print(f"\nRemaining chunks with nan values: {len(nan_check)}")

Chunks with nan page numbers:
Empty DataFrame
Columns: [id, text, page_numbers]
Index: []

After cleaning:
Number of unique pages after cleaning: 431
Maximum page number: 430
Minimum page number: 1

Remaining chunks with nan values: 0


In [62]:
# First, let's find where nan appears in the ordered list
page_series = df['page_numbers'].explode()
page_series = pd.Series(page_series).reset_index(drop=True)

# Find the index of nan
nan_index = page_series[page_series.isna()].index[0]

# Get some context around the nan
context_range = 5  # Show 5 numbers before and after
start_idx = max(0, nan_index - context_range)
end_idx = min(len(page_series), nan_index + context_range + 1)

print(f"Context around nan (showing {context_range} numbers before and after):")
print("\nSequence:")
for idx in range(start_idx, end_idx):
    marker = "**" if idx == nan_index else "  "
    print(f"{marker}{idx}: {page_series[idx]}{marker}")

# Also find the chunks where these page numbers appear
print("\nChunks containing this sequence:")
relevant_pages = list(page_series[start_idx:end_idx].dropna())
chunks_with_sequence = df[df['page_numbers'].apply(lambda x: any(page in x for page in relevant_pages))]
print(chunks_with_sequence[['id', 'page_numbers']].to_string())

# Optionally, print the text of these chunks to see the full context
print("\nText from relevant chunks:")
for idx, row in chunks_with_sequence.iterrows():
    print(f"\nChunk {row['id']} (Pages {row['page_numbers']}):")
    print(row['text'][:200] + "...")  # Print first 200 chars

Context around nan (showing 5 numbers before and after):

Sequence:
  246: 221  
  247: 222  
  248: 223  
  249: 224  
  250: 225  
**251: nan**
  252: 226  
  253: 227  
  254: 228  
  255: 229  
  256: 230  

Chunks containing this sequence:
      id page_numbers
127  128   [220, 221]
128  129   [221, 222]
129  130        [223]
130  131        [224]
131  132        [225]
133  134   [226, 227]
134  135        [228]
135  136        [229]
136  137        [230]

Text from relevant chunks:

Chunk 128 (Pages [220, 221]):
way of working, etc.) zWhat did I do today for which I am not happy? Why? zWhat did I do today which has made me happy? Remember , each day must contribute to your overall goals in some way. Each day ...

Chunk 129 (Pages [221, 222]):
plan and follow the same steps . PAGE 221 9Some Tips to Achieve your Goal zBe passionate about your goal. Passion is the energy that drives us. Let your passion carry you, because it will carry you fa...

Chunk 130 (Pages [223]):
your point 

In [64]:
# Get the specific chunks around this transition
print("Text ending page 225:")
chunk_225 = df[df['page_numbers'].apply(lambda x: 225 in x)]['text'].iloc[0]
print(chunk_225[-200:], "\n")  # Last 200 chars of chunk containing page 225

print("Text starting page 226:")
chunk_226 = df[df['page_numbers'].apply(lambda x: 226 in x)]['text'].iloc[0]
print(chunk_226[:200])  # First 200 chars of chunk containing page 226

# Also check if there's any chunk between these that might contain malformed page numbers
chunks_between = df[
    (df.index > df[df['page_numbers'].apply(lambda x: 225 in x)].index[0]) &
    (df.index < df[df['page_numbers'].apply(lambda x: 226 in x)].index[0])
]
if not chunks_between.empty:
    print("\nChunks between page 225 and 226:")
    print(chunks_between['text'])

Text ending page 225:
and women had to slog evenmore for their fuel, fodder and water. All in all, it was the women whowere the main victims of India's deforestation policies. Against these harmful deforestation policies a 

Text starting page 226:
of the authoritiesthrough letters, appeals or direct dialogue. For instance, you can meet someone of importance concerning the issue, like the Sarpanch, the ANM, the schoolteacher, the Talati and disc

Chunks between page 225 and 226:
132    houses too were destroyed, and women had to sl...
Name: text, dtype: object


In [65]:
# Clean the page_numbers column by removing nan values
df['page_numbers'] = df['page_numbers'].apply(lambda x: [p for p in x if pd.notna(p)])

# Verify the cleanup
unique_pages = sorted(df['page_numbers'].explode().unique())
print("After cleaning:")
print(f"Number of unique pages: {len(unique_pages)}")
print(f"Maximum page number: {max(unique_pages)}")
print(f"Minimum page number: {min(unique_pages)}")

# Double check there are no more nans
nan_check = df[df['page_numbers'].apply(lambda x: any(pd.isna(num) for num in x))]
print(f"\nRemaining chunks with nan values: {len(nan_check)}")

After cleaning:
Number of unique pages: 431
Maximum page number: 430
Minimum page number: 1

Remaining chunks with nan values: 0


In [66]:
# Get all unique values and sort them to see exactly what we have
unique_sorted = sorted(df['page_numbers'].explode().unique())
print("All unique page numbers in order:")
print(unique_sorted)

# Count frequency of each page number
value_counts = df['page_numbers'].explode().value_counts().sort_index()
print("\nFrequency of each page number:")
print(value_counts)

# Check for any non-integer values
non_integers = df['page_numbers'].explode()[~df['page_numbers'].explode().astype(str).str.match(r'^\d+$')]
if not non_integers.empty:
    print("\nNon-integer values found:")
    print(non_integers)

All unique page numbers in order:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215

In [67]:
# Remove any empty lists or nans from the sorted unique values
unique_sorted = [x for x in unique_sorted if pd.notna(x)]

# Verify the cleanup
print("After removing nan:")
print(f"Number of unique pages: {len(unique_sorted)}")
print(f"Maximum page number: {max(unique_sorted)}")
print(f"Minimum page number: {min(unique_sorted)}")

# Verify the sequence is complete from 1 to 430
expected_sequence = set(range(1, 431))
actual_sequence = set(unique_sorted)
if expected_sequence == actual_sequence:
    print("\nSequence is complete and clean: pages 1-430")
else:
    missing = expected_sequence - actual_sequence
    extra = actual_sequence - expected_sequence
    if missing:
        print(f"\nMissing pages: {sorted(missing)}")
    if extra:
        print(f"\nExtra pages: {sorted(extra)}")

After removing nan:
Number of unique pages: 430
Maximum page number: 430
Minimum page number: 1

Sequence is complete and clean: pages 1-430


In [68]:
# Clean the page_numbers column to ensure no nans
df['page_numbers'] = df['page_numbers'].apply(lambda x: [int(p) for p in x if pd.notna(p)])

# Save to CSV
# Note: since page_numbers is a list, we'll need to convert it to string for CSV storage
df_to_save = df.copy()
df_to_save['page_numbers'] = df_to_save['page_numbers'].apply(str)
df_to_save.to_csv('chunks_with_pages.csv', index=False)

# Print verification
print("DataFrame saved. Quick verification:")
print(f"Total rows: {len(df)}")
print("\nFirst few rows:")
print(df[['id', 'page_numbers']].head())
print("\nLast few rows:")
print(df[['id', 'page_numbers']].tail())

# Verify unique pages again
all_pages = sorted(set(page for pages in df['page_numbers'] for page in pages))
print(f"\nNumber of unique pages in cleaned data: {len(all_pages)}")
print(f"Page range: {min(all_pages)} to {max(all_pages)}")

DataFrame saved. Quick verification:
Total rows: 237

First few rows:
   id  page_numbers
0   1  [1, 2, 3, 4]
1   2     [4, 5, 6]
2   3        [7, 8]
3   4       [9, 10]
4   5          [11]

Last few rows:
      id          page_numbers
232  233                 [421]
233  234                 [422]
234  235            [423, 424]
235  236            [425, 426]
236  237  [427, 428, 429, 430]

Number of unique pages in cleaned data: 430
Page range: 1 to 430


In [69]:
import pandas as pd
import ast

def count_empty_page_numbers(file_path):
    """
    Count rows where the page numbers array is empty in a CSV file.
    
    Parameters:
    file_path (str): Path to the CSV file
    
    Returns:
    tuple: (count of empty arrays, total rows, percentage empty)
    """
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Convert string representations of arrays to actual arrays
    def parse_array(x):
        try:
            # Handle NaN/empty values
            if pd.isna(x):
                return []
            # Parse string representation of array
            return ast.literal_eval(x)
        except:
            return []
    
    df['page_numbers'] = df['page_numbers'].apply(parse_array)
    
    # Count empty arrays
    empty_count = sum(len(arr) == 0 for arr in df['page_numbers'])
    total_rows = len(df)
    percentage_empty = (empty_count / total_rows) * 100
    
    return empty_count, total_rows, percentage_empty

# Example usage
if __name__ == "__main__":
    file_path = "HealthAssistantBackend\chunks_with_pages.csv"
    empty_count, total_rows, percentage = count_empty_page_numbers(file_path)
    
    print(f"Number of rows with empty page numbers: {empty_count}")
    print(f"Total number of rows: {total_rows}")
    print(f"Percentage of rows with empty page numbers: {percentage:.2f}%")

Number of rows with empty page numbers: 1
Total number of rows: 237
Percentage of rows with empty page numbers: 0.42%


In [71]:
import pandas as pd
import ast

# Read the CSV file
df = pd.read_csv('HealthAssistantBackend\chunks_with_pages.csv')

# Convert and check for empty page numbers
df['page_numbers'] = df['page_numbers'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
print(df[df['page_numbers'].apply(len) == 0])

      id                                               text page_numbers
132  133  houses too were destroyed, and women had to sl...           []
