In [3]:
import pandas as pd
import json
import uuid
import re

# --- Configuration ---
# Path to the Excel workbook; adjust it if the file lives elsewhere.
file_path = 'DUOLIFE Ingredients.xlsx'

# --- Load all sheets from the Excel file ---
# data_frames maps each sanitized sheet name to its DataFrame. It is bound to an
# empty dict before loading so that, if the workbook cannot be read, the cells
# below still run (they simply find no sheets) instead of failing with NameError.
data_frames = {}
try:
    # The context manager closes the workbook handle once every sheet has been read.
    with pd.ExcelFile(file_path) as xls:
        sheet_names = xls.sheet_names
        # Sanitize sheet names to make them easier to use as dictionary keys
        sanitized_sheet_names = {sheet: sheet.replace('.csv', '').strip() for sheet in sheet_names}
        data_frames = {sanitized_sheet_names[sheet]: pd.read_excel(xls, sheet_name=sheet) for sheet in sheet_names}

    print(f"Successfully loaded {len(sheet_names)} sheets from '{file_path}'.")
    print("\nAvailable data frames (with sanitized names):")
    for name in data_frames.keys():
        print(f"- {name}")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path and try again.")
except Exception as e:
    # Best-effort load: report the problem and continue with an empty data_frames.
    print(f"An error occurred while loading the Excel file: {e}")

# This master list will hold all our final structured data chunks
all_chunks = []

Successfully loaded 18 sheets from 'DUOLIFE Ingredients.xlsx'.

Available data frames (with sanitized names):
- Sheet1
- Sheet2
- Sheet3
- Sheet4
- Import Duolife products
- Import Duolife Ingredients
- Products
- Ingredients
- Cosmetics
- Ingredients v2
- Old Business Q&A
- Club Ranks
- Career Structure
- Compensatory Bonus
- Rentier Bonus
- MB Bonus
- Sheet17
- Incentive Program


In [4]:
def clean_text(text):
    """Normalize the whitespace of a string.

    Each run of whitespace (spaces, tabs, newlines) is collapsed into a single
    space and the result is stripped at both ends. Any value that is not a
    ``str`` (None, NaN, numbers, ...) yields an empty string.
    """
    if isinstance(text, str):
        # Collapse every whitespace run first, then trim the edges
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def create_chunk(text_to_embed, metadata):
    """Build the standard chunk dictionary for one piece of text.

    Args:
        text_to_embed: Raw text for the chunk; it is normalized with clean_text.
        metadata: Dictionary stored unchanged under the "metadata" key.

    Returns:
        A dict with "chunk_id" (a fresh UUID4 string), "text_to_embed" (the
        cleaned text) and "metadata"; or None when the cleaned text is empty,
        so callers can skip it with ``if chunk:``.
    """
    cleaned_text = clean_text(text_to_embed)
    # We only create a chunk if there is meaningful text to embed
    if not cleaned_text:
        return None

    # Generate a unique ID for each chunk
    chunk_id = str(uuid.uuid4())

    return {
        "chunk_id": chunk_id,
        "text_to_embed": cleaned_text,
        "metadata": metadata
    }

In [5]:
def process_products_and_ingredients(data_frames, all_chunks):
    """Turn the 'Products' and 'Ingredients' sheets into chunks.

    For every product row with a non-empty 'Product Name', one chunk is created
    per non-blank text column (Description, Intended Use, ...). For every
    ingredient row with a non-blank 'Full Description', one chunk is created.

    Args:
        data_frames: Mapping of sheet name to DataFrame; a sheet that is absent
            is skipped.
        all_chunks: List that receives the new chunks (modified in place).

    Returns:
        None. Progress is reported with print().
    """
    print("\n--- Processing Domain: Products & Ingredients ---")

    def _entity_id(value):
        # str() of a blank cell would give the literal "nan"/"None"; store "" instead
        return "" if pd.isna(value) else str(value)

    # --- Process Products ---
    df_products = data_frames.get('Products')
    if df_products is not None:
        product_count = 0
        # Define the columns that contain text we want to chunk separately
        product_columns_to_chunk = [
            'Description', 'Intended Use', 'Health Effects',
            'Formulation Advantages', 'How to Use', 'Contraindications', 'Interactions'
        ]

        for index, row in df_products.iterrows():
            product_name = clean_text(row.get('Product Name'))
            if not product_name:
                continue

            product_count += 1
            base_metadata = {
                "domain": "Products",
                "source_file": "Products.csv",
                "primary_entity_id": _entity_id(row.get('Product ID', '')),
                "primary_entity_name": product_name,
                "type": clean_text(row.get('Type', '')),
                "link": clean_text(row.get('Link', ''))
            }

            for col in product_columns_to_chunk:
                text = row.get(col)
                if pd.notna(text):
                    # Each section gets its own copy so chunks never share a metadata dict
                    metadata = base_metadata.copy()
                    metadata['section'] = col.replace(' ', '_')  # spaces -> underscores, case kept
                    chunk = create_chunk(str(text), metadata)
                    if chunk:
                        all_chunks.append(chunk)
        print(f"Processed {product_count} products into multiple chunks.")

    # --- Process Ingredients ---
    df_ingredients = data_frames.get('Ingredients')
    if df_ingredients is not None:
        for index, row in df_ingredients.iterrows():
            text = row.get('Full Description')
            if pd.notna(text):
                metadata = {
                    "domain": "Ingredients",
                    "source_file": "Ingredients.csv",
                    "primary_entity_id": _entity_id(row.get('Ingredient ID', '')),
                    "primary_entity_name": clean_text(row.get('Ingredient Name', '')),
                    "section": "Description"
                }
                chunk = create_chunk(str(text), metadata)
                if chunk:
                    all_chunks.append(chunk)
        # Reports the number of rows in the sheet, including rows without a description
        print(f"Processed {len(df_ingredients)} ingredients.")

# Execute the function: chunks are appended to the shared all_chunks list, which is
# only reset in the loading cell, so re-running just this cell appends them again.
process_products_and_ingredients(data_frames, all_chunks)


--- Processing Domain: Products & Ingredients ---
Processed 57 products into multiple chunks.
Processed 281 ingredients.


In [6]:
def process_cosmetics(data_frames, all_chunks):
    """Turn the 'Cosmetics' and 'Ingredients v2' sheets into chunks.

    Each cosmetics row with a product name and a description becomes one chunk
    whose metadata lists the values of every column containing
    'Ingredient IDs'. Each 'Ingredients v2' row with a description becomes one
    chunk.

    Args:
        data_frames: Mapping of sheet name to DataFrame; a sheet that is absent
            is skipped. The column labels of the 'Cosmetics' frame are stripped
            in place.
        all_chunks: List that receives the new chunks (modified in place).

    Returns:
        None. Progress is reported with print().
    """
    print("\n--- Processing Domain: Cosmetics ---")

    def _entity_id(value):
        # str() of a blank cell would give the literal "nan"/"None"; store "" instead
        return "" if pd.isna(value) else str(value)

    # --- Process Cosmetics Products ---
    df_cosmetics = data_frames.get('Cosmetics')
    if df_cosmetics is not None:
        # Header cells may carry stray spaces; non-string labels are left untouched
        df_cosmetics.columns = [
            col.strip() if isinstance(col, str) else col for col in df_cosmetics.columns
        ]
        # The set of ingredient columns is the same for every row, so find it once
        ingredient_columns = [
            col for col in df_cosmetics.columns
            if isinstance(col, str) and 'Ingredient IDs' in col
        ]

        for index, row in df_cosmetics.iterrows():
            text = row.get('Description')
            product_name = clean_text(row.get('Product Name'))
            if not product_name or pd.isna(text):
                continue

            # Gather the values of all ingredient-related columns for this row
            ingredients_list = []
            for col_name in ingredient_columns:
                value = row[col_name]
                if pd.notna(value):
                    # clean_text returns "" for non-strings, so convert numeric IDs first
                    ingredients_list.append(
                        clean_text(value if isinstance(value, str) else str(value))
                    )

            metadata = {
                "domain": "Cosmetics",
                "source_file": "Cosmetics .csv",
                "primary_entity_id": _entity_id(row.get('Product ID', '')),
                "primary_entity_name": product_name,
                "section": "Description",
                "type": clean_text(row.get('Type', '')),
                "ingredients_list": ingredients_list
            }
            chunk = create_chunk(str(text), metadata)
            if chunk:
                all_chunks.append(chunk)
        # Reports the number of rows in the sheet, including skipped rows
        print(f"Processed {len(df_cosmetics)} cosmetic products.")

    # --- Process Cosmetic Ingredients (v2) ---
    df_ingredients_v2 = data_frames.get('Ingredients v2')
    if df_ingredients_v2 is not None:
        for index, row in df_ingredients_v2.iterrows():
            text = row.get('Description')
            if pd.notna(text):
                metadata = {
                    "domain": "Ingredients",
                    "source_file": "Ingredients v2.csv",
                    "primary_entity_id": _entity_id(row.get('Ingredient ID', '')),
                    "primary_entity_name": clean_text(row.get('Ingredient Name', '')),
                    "section": "Description"
                }
                chunk = create_chunk(str(text), metadata)
                if chunk:
                    all_chunks.append(chunk)
        print(f"Processed {len(df_ingredients_v2)} cosmetic ingredients.")

# Execute the function: chunks are appended to the shared all_chunks list, which is
# only reset in the loading cell, so re-running just this cell appends them again.
process_cosmetics(data_frames, all_chunks)


--- Processing Domain: Cosmetics ---
Processed 20 cosmetic products.
Processed 35 cosmetic ingredients.


In [7]:
def process_business_model(data_frames, all_chunks):
    """Turn the 'Old Business Q&A' and 'Incentive Program' sheets into chunks.

    Each Q&A row with both a question and an answer becomes one chunk whose
    text is "Question: ... Answer: ..." and whose metadata carries the
    comma-separated 'Keywords' as a list of tags. Each incentive row with a
    program name and a non-blank description becomes one chunk.

    Args:
        data_frames: Mapping of sheet name to DataFrame; a sheet that is absent
            is skipped.
        all_chunks: List that receives the new chunks (modified in place).

    Returns:
        None. Progress is reported with print().
    """
    print("\n--- Processing Domain: Business Model ---")

    # --- Process Business Q&A ---
    df_qa = data_frames.get('Old Business Q&A')
    if df_qa is not None:
        for index, row in df_qa.iterrows():
            question = clean_text(row.get('Question', ''))
            answer = clean_text(row.get('Answer', ''))

            if not question or not answer:
                continue

            text_to_embed = f"Question: {question} Answer: {answer}"

            # A blank Keywords cell must give no tags; str() of it would give ['nan']
            raw_keywords = row.get('Keywords')
            if pd.isna(raw_keywords):
                keywords = []
            else:
                keywords = [kw.strip() for kw in str(raw_keywords).split(',') if kw.strip()]

            metadata = {
                "domain": "Business_Model",
                "source_file": "Old Business Q&A.csv",
                "primary_entity_name": question,
                "section": "Q&A",
                "tags": keywords
            }
            chunk = create_chunk(text_to_embed, metadata)
            if chunk:
                all_chunks.append(chunk)
        # Reports the number of rows in the sheet, including skipped rows
        print(f"Processed {len(df_qa)} Q&A entries.")

    # --- Process Incentive Programs ---
    df_incentives = data_frames.get('Incentive Program')
    if df_incentives is not None:
        # Note the typo "Discription" in the original column name.
        # Non-string column labels are skipped so the substring test cannot raise.
        text_col_name = next(
            (col for col in df_incentives.columns
             if isinstance(col, str) and 'Discription' in col),
            None,
        )
        if text_col_name:
            for index, row in df_incentives.iterrows():
                text = row.get(text_col_name)
                program_name = clean_text(row.get('Program', ''))
                if pd.notna(text) and program_name:
                    metadata = {
                        "domain": "Business_Model",
                        "source_file": "Incentive Program.csv",
                        "primary_entity_name": program_name,
                        "section": "Program_Details"
                    }
                    chunk = create_chunk(str(text), metadata)
                    if chunk:
                        all_chunks.append(chunk)
            print(f"Processed {len(df_incentives)} incentive programs.")

# Execute the function: chunks are appended to the shared all_chunks list, which is
# only reset in the loading cell, so re-running just this cell appends them again.
process_business_model(data_frames, all_chunks)


--- Processing Domain: Business Model ---
Processed 39 Q&A entries.
Processed 4 incentive programs.


In [8]:
def process_ranks_and_compensation(data_frames, all_chunks):
    """Turn the 'Club Ranks' and 'Career Structure' sheets into chunks.

    Each row is rendered as a natural-language sentence for embedding, and all
    of its cells are copied into the metadata under normalized keys (spaces to
    underscores, lower-cased; '%' spelled out as 'percent' for career rows).
    Blank cells appear as "N/A" (or "None" for the rank bonus) in the sentence
    and as None in the metadata, so the saved JSON holds null rather than a
    bare NaN token.

    Args:
        data_frames: Mapping of sheet name to DataFrame; a sheet that is absent
            is skipped. Rows that are entirely empty are dropped from both
            frames in place.
        all_chunks: List that receives the new chunks (modified in place).

    Returns:
        None. Progress is reported with print().
    """
    print("\n--- Processing Domain: Ranks & Compensation ---")

    def _field(row, column, fallback):
        # row.get's default only covers a missing column; a blank cell comes back as NaN
        value = row.get(column, fallback)
        return fallback if pd.isna(value) else value

    def _percent(value):
        # Render a fraction such as 0.05 as "5.00%"; blank or non-numeric cells give "N/A"
        if pd.isna(value):
            return "N/A"
        try:
            return f"{float(value) * 100:.2f}%"
        except (TypeError, ValueError):
            return "N/A"

    def _json_safe(value):
        # NaN is serialized as a bare NaN token, which strict JSON parsers reject
        return None if pd.isna(value) else value

    # --- Process Club Ranks ---
    df_ranks = data_frames.get('Club Ranks')
    if df_ranks is not None:
        df_ranks.dropna(how='all', inplace=True)
        for index, row in df_ranks.iterrows():
            eng_position = clean_text(row.get('ENG Position', ''))
            if not eng_position:
                continue

            # Create a natural language sentence for semantic search
            text_to_embed = (
                f"The rank of {eng_position} (shortcut: {_field(row, 'Shortcut', 'N/A')}) requires a minimum monthly activity of {_field(row, 'Minimum Monthly Activity (P)', 'N/A')}P. "
                f"It requires {_field(row, 'Minimum Number of Active Lines', 'N/A')} active lines and a total activity of {_field(row, 'Total Activity', 'N/A')}P. "
                f"Average earnings range from {_field(row, 'Base Earnings (in P)', 'N/A')} to {_field(row, 'Max Earnings (in P)', 'N/A')} PP. "
                f"Associated Bonus: {_field(row, 'Bonus', 'None')}."
            )

            # Store the structured data in metadata for precise filtering
            metadata = {
                str(key).replace(' ', '_').lower(): _json_safe(val)
                for key, val in row.items()
            }
            metadata['domain'] = "Ranks"
            metadata['source_file'] = "Club Ranks.csv"
            metadata['primary_entity_name'] = eng_position
            metadata['section'] = "Rank_Requirements"

            chunk = create_chunk(text_to_embed, metadata)
            if chunk:
                all_chunks.append(chunk)
        print(f"Processed {len(df_ranks)} club ranks.")

    # --- Process Career Structure ---
    df_career = data_frames.get('Career Structure')
    if df_career is not None:
        df_career.dropna(how='all', inplace=True)
        for index, row in df_career.iterrows():
            position = clean_text(row.get('Position', ''))
            if not position:
                continue

            text_to_embed = (
                f"For the {position} career position, the minimum points turnover is {_field(row, 'Minimum Points Turnover', 'N/A')}. "
                f"The structure commission is {_percent(row.get('Structure Commission', 0))}. "
                "This level includes a share of the global points turnover and may include a Compensatory Bonus and Rentier Bonus."
            )

            metadata = {
                str(key).replace(' ', '_').lower().replace('%', 'percent'): _json_safe(val)
                for key, val in row.items()
            }
            metadata['domain'] = "Compensation"
            metadata['source_file'] = "Career Structure.csv"
            metadata['primary_entity_name'] = position
            metadata['section'] = "Career_Details"

            chunk = create_chunk(text_to_embed, metadata)
            if chunk:
                all_chunks.append(chunk)
        print(f"Processed {len(df_career)} career structure levels.")

# Execute the function: chunks are appended to the shared all_chunks list, which is
# only reset in the loading cell, so re-running just this cell appends them again.
process_ranks_and_compensation(data_frames, all_chunks)


--- Processing Domain: Ranks & Compensation ---
Processed 20 club ranks.
Processed 14 career structure levels.


In [9]:
# --- Final Summary ---
# Report how many chunks the processing cells produced in total.
print("\n--- PROCESSING COMPLETE ---")
print(f"Total number of structured data chunks created: {len(all_chunks)}")

# --- Display a few examples to verify the output ---
print("\n--- EXAMPLE CHUNKS ---")
if not all_chunks:
    print("No chunks were created. Please check the input files and code.")
else:
    # Walk the chunks in order and print the first one met for each new domain
    domains_seen = set()
    for chunk in all_chunks:
        domain = chunk['metadata']['domain']
        if domain in domains_seen:
            continue
        print(f"\n--- Example from Domain: {domain} ---")
        print(json.dumps(chunk, indent=2))
        domains_seen.add(domain)
        # The preview ends once four distinct domains have been shown
        if len(domains_seen) >= 4:
            break

# --- Save the final data to a JSON file ---
# The file is written as UTF-8 with non-ASCII characters kept as-is (ensure_ascii=False).
output_filename = 'structured_rag_data.json'
try:
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json.dump(all_chunks, json_file, indent=4, ensure_ascii=False)
    print(f"\nSuccessfully saved all {len(all_chunks)} structured data chunks to '{output_filename}'")
except Exception as err:
    # Any failure while opening or serializing is reported instead of raised
    print(f"\nAn error occurred while saving the file: {err}")


--- PROCESSING COMPLETE ---
Total number of structured data chunks created: 757

--- EXAMPLE CHUNKS ---

--- Example from Domain: Products ---
{
  "chunk_id": "76db44c9-3fa9-4bd1-af61-15fb9a511f48",
  "text_to_embed": "100% natural dietary supplements, created for people wishing to stay in a good physical and mental shape. Additional energy for the whole day of intense work and valuable support for calming down and regeneration of the body at night. A combination of as many as 26 extracts and fruit juices, to support functions of the cardiovascular system, the gastrointestinal tract, and the immune system, detoxification of the body, and brain, liver and kidney functions. Hundreds of active substances, including valuable antioxidants, amino acids, vitamins and minerals, to make not only each day, but also each night special.",
  "metadata": {
    "domain": "Products",
    "source_file": "Products.csv",
    "primary_entity_id": "D1",
    "primary_entity_name": "DuoLife Day and Night",
