In [6]:
"""
================================================================================
TRIOLOAD49V01 - TRIO FAMILY LOADER AND VALIDATOR
================================================================================

OVERVIEW:
This notebook loads and validates trio family data from 1000 Genomes Project 
pedigree files for nonparental haplotype insertion analysis. It identifies 
complete family trios (father-mother-child) and prepares them for genomic 
analysis of alien DNA insertions.

SCIENTIFIC CONTEXT:
- Analyzes trio families from 1000 Genomes Project for nonparental alleles
- Normal families show ~0.001% nonparental allele rate
- Families with alien DNA insertions show >0.5% rate in specific regions
- This tool prepares the family data for downstream genomic scanning

REQUIRED FOLDER STRUCTURE:
Your project must be organized exactly like this:

C:/Users/[username]/00XG1py/20250528Trios1k/
├── programs/           <- Run ALL scripts from here
├── downloaded/         <- Contains VCF files and pedigree data
│   ├── 20130606_g1k.ped     <- Main pedigree file (152.2KB)
│   ├── nygc_chr3_3202samples.vcf  <- Chromosome VCF files
│   └── [other genomic files]
└── outputs/           <- All results saved here automatically

USAGE INSTRUCTIONS:
1. Download this notebook to your programs/ folder
2. Ensure pedigree file exists in downloaded/ folder
3. Run all cells in order
4. Check outputs/ folder for validated trio families

INPUT FILES (searches for these automatically):
- downloaded/20130606_g1k.ped (primary)
- downloaded/nygc_pedigree.txt (backup)
- 20130606_g1k.ped (root folder backup)
- nygc_pedigree.txt (root folder backup)

OUTPUT FILES:
- outputs/trio_families_[timestamp].txt <- Validated trio data
- Console output with diagnostic information

EXPECTED RESULTS:
- Should load ~602 validated trio families
- Each trio contains: FamilyID, ChildID, FatherID, MotherID
- Automatic error checking for missing or malformed data

ERROR HANDLING:
- Automatically searches multiple file locations
- Shows exactly which files were checked if errors occur
- Validates data format and reports parsing issues
- Creates outputs directory if missing

NEXT STEPS AFTER RUNNING:
1. Use NPASearch tool to scan for nonparental alleles
2. Apply WindowRank tool for sliding window analysis  
3. Run ClusterFind to identify significant genomic regions
4. Execute AlienHunt for detailed alien DNA mapping

TROUBLESHOOTING:
- If "FileNotFoundError": Check that pedigree file exists in downloaded/
- If "No trios found": Verify pedigree file format (tab-separated)
- If permission errors: Ensure outputs/ folder is writable
- For other issues: Check console output for specific error messages

VERSION HISTORY:
- v01: Initial implementation with robust error handling

AUTHOR: Genomics Analysis Pipeline
CREATED: 2025-05-29
================================================================================
"""

# TrioLoad49v01 - Load and validate trio families from pedigree data
import os
import sys
from datetime import datetime

print(f"TrioLoad49v01 - LOAD AND VALIDATE TRIO FAMILIES - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Directory setup - mandatory for every script
current_dir = os.getcwd()
if 'programs' in current_dir:
    project_root = os.path.dirname(current_dir)
    os.chdir(project_root)
    print("Changed from programs/ to project root")
print(f"Working directory: {os.getcwd()}")

# Immediate diagnostic messages
print("Loading data files...")
print("Validating input files...")

# Check for pedigree file with multiple possible locations
pedigree_file = None
possible_files = [
    "downloaded/20130606_g1k.ped",
    "downloaded/nygc_pedigree.txt",
    "20130606_g1k.ped",
    "nygc_pedigree.txt"
]

for possible_file in possible_files:
    if os.path.exists(possible_file):
        pedigree_file = possible_file
        print(f"Found pedigree file: {possible_file}")
        break

if not pedigree_file:
    print("ERROR: No pedigree file found. Looked for:")
    for file in possible_files:
        print(f"  - {file}")
    print("Current directory contents:")
    try:
        print("Root:", os.listdir('.'))
        if os.path.exists('downloaded'):
            print("Downloaded folder:", os.listdir('downloaded'))
    except Exception as e:
        print(f"Cannot list directory contents: {e}")
    sys.exit(1)

# Load and validate trio data
print("Setup complete - starting analysis")
print("Reading pedigree data...")

trios = []
try:
    with open(pedigree_file, 'r') as f:
        lines = f.readlines()
    
    print(f"Found {len(lines)} lines in pedigree file")
    
    # Process each line (skip header if present)
    start_line = 1 if lines[0].startswith('#') or 'FamilyID' in lines[0] else 0
    
    for i, line in enumerate(lines[start_line:], 1):
        try:
            parts = line.strip().split()
            if len(parts) >= 6:  # Standard PED format: FamID IndID PatID MatID Sex Pheno
                family_id = parts[0]
                individual_id = parts[1]
                paternal_id = parts[2]
                maternal_id = parts[3]
                
                # Skip if this is a parent (has children but no parents listed)
                if paternal_id != '0' and maternal_id != '0':
                    trio = {
                        'family': family_id,
                        'child': individual_id,
                        'father': paternal_id,
                        'mother': maternal_id
                    }
                    trios.append(trio)
                    
                    if len(trios) % 100 == 0:
                        print(f"Processed {len(trios)} trios...")
                        
        except Exception as e:
            print(f"Warning: Could not parse line {i}: {line.strip()}")
            continue
    
    print(f"Successfully loaded {len(trios)} trio families")
    
    # Save results to outputs folder
    if not os.path.exists('outputs'):
        os.makedirs('outputs')
        print("Created outputs directory")
    
    # Save trio data with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"outputs/trio_families_{timestamp}.txt"
    
    print("Saving validated trio data...")
    with open(output_file, 'w') as f:
        f.write("FamilyID\tChildID\tFatherID\tMotherID\n")
        for trio in trios:
            f.write(f"{trio['family']}\t{trio['child']}\t{trio['father']}\t{trio['mother']}\n")
    
    print(f"Trio data saved to: {output_file}")
    
    # Summary statistics
    print("\n--- SUMMARY ---")
    print(f"Total trio families loaded: {len(trios)}")
    print(f"Source file: {pedigree_file}")
    print(f"Output file: {output_file}")
    
    # Show first few examples
    print("\nFirst 5 trio families:")
    for i, trio in enumerate(trios[:5]):
        print(f"  {i+1}. Family {trio['family']}: Child {trio['child']} (Father: {trio['father']}, Mother: {trio['mother']})")
    
    print(f"\nTrioLoad49v01 completed successfully at {datetime.now().strftime('%H:%M:%S')}")

except Exception as e:
    print(f"ERROR: Failed to process pedigree file: {e}")
    print(f"File: {pedigree_file}")
    sys.exit(1)

TrioLoad49v01 - LOAD AND VALIDATE TRIO FAMILIES - 2025-05-29 13:21:03
Working directory: C:\Users\mremp\00XG1py\20250528Trios1k\downloaded
Loading data files...
Validating input files...
Found pedigree file: 20130606_g1k.ped
Setup complete - starting analysis
Reading pedigree data...
Found 3502 lines in pedigree file
Processed 100 trios...
Processed 200 trios...
Processed 300 trios...
Processed 400 trios...
Processed 500 trios...
Processed 600 trios...
Successfully loaded 642 trio families
Saving validated trio data...
Trio data saved to: outputs/trio_families_20250529_132103.txt

--- SUMMARY ---
Total trio families loaded: 642
Source file: 20130606_g1k.ped
Output file: outputs/trio_families_20250529_132103.txt

First 5 trio families:
  1. Family Family: Child ID (Father: Individual, Mother: ID)
  2. Family BB01: Child HG01881 (Father: HG01879, Mother: HG01880)
  3. Family BB02: Child HG01888 (Father: HG01882, Mother: HG01883)
  4. Family BB03: Child HG01884 (Father: HG01885, Mother: H

In [2]:
import os

# Get current working directory
cwd = os.getcwd()
folder_name = os.path.basename(cwd)

if folder_name.lower() != 'downloaded':
    # Go up one level
    parent_dir = os.path.dirname(cwd)
    os.chdir(parent_dir)
    # List folders in parent
    folders = [f for f in os.listdir() if os.path.isdir(f)]
    # Find 'downloaded' folder (case-insensitive)
    downloaded_folder = [f for f in folders if f.lower() == 'downloaded']
    if downloaded_folder:
        os.chdir(downloaded_folder[0])
    else:
        print("'downloaded' folder not found.")
else:
    print("Already in 'downloaded' folder.")

print("Current directory:", os.getcwd())


Current directory: C:\Users\mremp\00XG1py\20250528Trios1k\downloaded


In [2]:
import os

# Get current working directory
cwd = os.getcwd()
folder_name = os.path.basename(cwd)

if folder_name.lower() != 'downloaded':
    # Go up one level
    parent_dir = os.path.dirname(cwd)
    os.chdir(parent_dir)
    # List folders in parent
    folders = [f for f in os.listdir() if os.path.isdir(f)]
    # Find 'downloaded' folder (case-insensitive)
    downloaded_folder = [f for f in folders if f.lower() == 'downloaded']
    if downloaded_folder:
        os.chdir(downloaded_folder[0])
    else:
        print("'downloaded' folder not found.")
else:
    print("Already in 'downloaded' folder.")

print("Current directory:", os.getcwd())


Current directory: C:\Users\mremp\00XG1py\20250528Trios1k\downloaded


In [4]:
# TrioLoad49v01 - Load and validate trio families from pedigree data
import os
import sys
from datetime import datetime

print(f"TrioLoad49v01 - LOAD AND VALIDATE TRIO FAMILIES - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Directory setup - mandatory for every script
current_dir = os.getcwd()
if 'programs' in current_dir:
    project_root = os.path.dirname(current_dir)
    os.chdir(project_root)
    print("Changed from programs/ to project root")
print(f"Working directory: {os.getcwd()}")

# Immediate diagnostic messages
print("Loading data files...")
print("Validating input files...")

# Check for pedigree file with multiple possible locations
pedigree_file = None
possible_files = [
    "downloaded/20130606_g1k.ped",
    "downloaded/nygc_pedigree.txt",
    "20130606_g1k.ped",
    "nygc_pedigree.txt"
]

for possible_file in possible_files:
    if os.path.exists(possible_file):
        pedigree_file = possible_file
        print(f"Found pedigree file: {possible_file}")
        break

if not pedigree_file:
    print("ERROR: No pedigree file found. Looked for:")
    for file in possible_files:
        print(f"  - {file}")
    print("Current directory contents:")
    try:
        print("Root:", os.listdir('.'))
        if os.path.exists('downloaded'):
            print("Downloaded folder:", os.listdir('downloaded'))
    except Exception as e:
        print(f"Cannot list directory contents: {e}")
    sys.exit(1)

# Load and validate trio data
print("Setup complete - starting analysis")
print("Reading pedigree data...")

trios = []
try:
    with open(pedigree_file, 'r') as f:
        lines = f.readlines()
    
    print(f"Found {len(lines)} lines in pedigree file")
    
    # Process each line (skip header if present)
    start_line = 1 if lines[0].startswith('#') or 'FamilyID' in lines[0] else 0
    
    for i, line in enumerate(lines[start_line:], 1):
        try:
            parts = line.strip().split()
            if len(parts) >= 6:  # Standard PED format: FamID IndID PatID MatID Sex Pheno
                family_id = parts[0]
                individual_id = parts[1]
                paternal_id = parts[2]
                maternal_id = parts[3]
                
                # Skip if this is a parent (has children but no parents listed)
                if paternal_id != '0' and maternal_id != '0':
                    trio = {
                        'family': family_id,
                        'child': individual_id,
                        'father': paternal_id,
                        'mother': maternal_id
                    }
                    trios.append(trio)
                    
                    if len(trios) % 100 == 0:
                        print(f"Processed {len(trios)} trios...")
                        
        except Exception as e:
            print(f"Warning: Could not parse line {i}: {line.strip()}")
            continue
    
    print(f"Successfully loaded {len(trios)} trio families")
    
    # Save results to outputs folder
    if not os.path.exists('outputs'):
        os.makedirs('outputs')
        print("Created outputs directory")
    
    # Save trio data with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"outputs/trio_families_{timestamp}.txt"
    
    print("Saving validated trio data...")
    with open(output_file, 'w') as f:
        f.write("FamilyID\tChildID\tFatherID\tMotherID\n")
        for trio in trios:
            f.write(f"{trio['family']}\t{trio['child']}\t{trio['father']}\t{trio['mother']}\n")
    
    print(f"Trio data saved to: {output_file}")
    
    # Summary statistics
    print("\n--- SUMMARY ---")
    print(f"Total trio families loaded: {len(trios)}")
    print(f"Source file: {pedigree_file}")
    print(f"Output file: {output_file}")
    
    # Show first few examples
    print("\nFirst 5 trio families:")
    for i, trio in enumerate(trios[:5]):
        print(f"  {i+1}. Family {trio['family']}: Child {trio['child']} (Father: {trio['father']}, Mother: {trio['mother']})")
    
    print(f"\nTrioLoad49v01 completed successfully at {datetime.now().strftime('%H:%M:%S')}")

except Exception as e:
    print(f"ERROR: Failed to process pedigree file: {e}")
    print(f"File: {pedigree_file}")
    sys.exit(1)

TrioLoad49v01 - LOAD AND VALIDATE TRIO FAMILIES - 2025-05-29 13:17:43
Working directory: C:\Users\mremp\00XG1py\20250528Trios1k\downloaded
Loading data files...
Validating input files...
Found pedigree file: 20130606_g1k.ped
Setup complete - starting analysis
Reading pedigree data...
Found 3502 lines in pedigree file
Processed 100 trios...
Processed 200 trios...
Processed 300 trios...
Processed 400 trios...
Processed 500 trios...
Processed 600 trios...
Successfully loaded 642 trio families
Created outputs directory
Saving validated trio data...
Trio data saved to: outputs/trio_families_20250529_131743.txt

--- SUMMARY ---
Total trio families loaded: 642
Source file: 20130606_g1k.ped
Output file: outputs/trio_families_20250529_131743.txt

First 5 trio families:
  1. Family Family: Child ID (Father: Individual, Mother: ID)
  2. Family BB01: Child HG01881 (Father: HG01879, Mother: HG01880)
  3. Family BB02: Child HG01888 (Father: HG01882, Mother: HG01883)
  4. Family BB03: Child HG01884 (