In [1]:
import os

# Paths to the directory and the input file
bam_dir = "/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/bams/"
input_file = "./steps_inputs/step1/ENCODE_snatac_pseudobulk_replacements_v2_experiemnt_id_bam_url.txt"

# Folder names in the BAM directory. os.listdir() also returns plain files,
# so keep only real sub-directories; otherwise a stray file would be counted
# and reported as a "folder" missing from the input file.
bam_folders = {
    entry
    for entry in os.listdir(bam_dir)
    if os.path.isdir(os.path.join(bam_dir, entry))
}

# First whitespace-separated column of every non-blank line in the input file
with open(input_file, "r") as file:
    file_ids = {line.split()[0] for line in file if line.strip()}

# BAM folders that have no matching ID in the input file
folders_not_in_file = bam_folders - file_ids

# Print counts
print(f"Number of folders in the BAM directory: {len(bam_folders)}")
print(f"Number of IDs in the input file: {len(file_ids)}")
print(f"Number of folders not in the input file: {len(folders_not_in_file)}")

# Output the results (sorted so repeated runs print in the same order)
print("\nFolders available in the BAM directory but not in the input file:")
for folder in sorted(folders_not_in_file):
    print(folder)


Number of folders in the BAM directory: 1490
Number of IDs in the input file: 1490
Number of folders not in the input file: 0

Folders available in the BAM directory but not in the input file:


In [1]:
import os
import glob

# Paths to the directory and the input file
bam_dir = "/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/bams/"
input_file = "./steps_inputs/step1/ENCODE_snatac_pseudobulk_replacements_v2_experiemnt_id_bam_url.txt"
output_bam_subfile = "./steps_inputs/step1/missing_bam_files.txt"  # Input-file rows whose sorted BAM is missing
output_subfile = "./steps_inputs/step2/missing_bam_unsorted_files.txt"  # Expected unsorted BAM paths for those rows

# Get folder names in the BAM directory (subdirectories are ENCSR_IDs)
bam_folders = {folder for folder in os.listdir(bam_dir) if os.path.isdir(os.path.join(bam_dir, folder))}

# Map each ENCSR_ID (first column) to its ENCFF_ID (taken from the URL in the
# second column) and to the original row text.
# NOTE(review): the mapping is keyed by ENCSR_ID, so if an ID appears on several
# rows only the last row is kept - confirm IDs are unique in the input file.
input_data = {}
with open(input_file, "r") as file:
    for line in file:
        columns = line.split()
        if not columns:
            continue  # skip blank lines
        encsr_id = columns[0]
        encff_url = columns[1]  # Assuming the second column contains the URL

        # ENCFF_ID = last URL path component, up to its first '.'
        encff_id = encff_url.split('/')[-1].split('.')[0]

        input_data[encsr_id] = {'encff_id': encff_id, 'line': line.strip()}

# Find ENCSR_IDs from the input file that do not have a corresponding sorted BAM file in the BAM directory
missing_sorted_bams = set()
missing_bam_rows = []

for encsr_id, data in input_data.items():
    encff_id = data['encff_id']
    # The expected path contains no wildcard, so a direct file check is enough
    sorted_bam_path = os.path.join(bam_dir, encsr_id, f"{encff_id}_sorted.bam")

    if not os.path.isfile(sorted_bam_path):
        missing_sorted_bams.add((encsr_id, encff_id))  # Store both ENCSR_ID and ENCFF_ID
        missing_bam_rows.append(data['line'])  # Keep the original row for the step-1 sub-file

# Write the expected unsorted BAM path for every missing sorted BAM.
# Iterating in sorted order keeps the file identical between runs (a set has no stable order).
with open(output_subfile, "w") as unsorted_paths_handle:
    for encsr_id, encff_id in sorted(missing_sorted_bams):
        unsorted_bam_path = os.path.join(bam_dir, encsr_id, f"{encff_id}_unsorted.bam")
        unsorted_paths_handle.write(f"{unsorted_bam_path}\n")

# Write the original input rows whose sorted BAM is missing
with open(output_bam_subfile, "w") as missing_rows_handle:
    for row in missing_bam_rows:
        missing_rows_handle.write(f"{row}\n")

# Folders on disk that the input file does not mention
folders_not_in_input = bam_folders - set(input_data)

# Print counts
print(f"Number of folders in the BAM directory: {len(bam_folders)}")
print(f"Number of IDs in the input file: {len(input_data)}")
print(f"Number of BAM folders not in the input file: {len(folders_not_in_input)}")
print(f"Number of ENCSR_IDs from the input file missing corresponding sorted BAM files: {len(missing_sorted_bams)}")

# Output the results
print("\nFolders available in the BAM directory but not in the input file:")
for folder in sorted(folders_not_in_input):
    print(folder)

print("\nENCSR_IDs and ENCFF_IDs from the input file that are missing corresponding sorted BAM files:")
for encsr_id, encff_id in sorted(missing_sorted_bams):
    print(f"{encsr_id} - {encff_id}")

print(f"\nSub-file with missing unsorted BAM file paths created at: {output_subfile}")
print(f"Sub-file with missing BAM file rows created at: {output_bam_subfile}")

# Print the first 2 rows from each output file
def print_first_two_rows(file_path):
    """Print a header line followed by at most the first two lines of a text file.

    Args:
        file_path: Path of the text file to preview.

    Returns:
        None. Output goes to stdout. A missing file or any other read error is
        reported with a printed message instead of raising.
    """
    from itertools import islice  # local import: the cell's import list does not include itertools

    try:
        with open(file_path, "r") as file:
            # Read only the two lines that are shown instead of loading the whole file
            first_rows = list(islice(file, 2))
    except FileNotFoundError:
        print(f"{file_path} not found.")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    else:
        print(f"\nFirst two rows from {file_path}:")
        print("".join(first_rows))

# Preview both output files, the step-1 rows file first and then the step-2 paths file
for preview_path in (output_bam_subfile, output_subfile):
    print_first_two_rows(preview_path)


Number of folders in the BAM directory: 1490
Number of IDs in the input file: 1490
Number of BAM folders not in the input file: 0
Number of ENCSR_IDs from the input file missing corresponding sorted BAM files: 0

Folders available in the BAM directory but not in the input file:

ENCSR_IDs and ENCFF_IDs from the input file that are missing corresponding sorted BAM files:

Sub-file with missing unsorted BAM file paths created at: ./steps_inputs/step2/missing_bam_unsorted_files.txt
Sub-file with missing BAM file rows created at: ./steps_inputs/step1/missing_bam_files.txt

First two rows from ./steps_inputs/step1/missing_bam_files.txt:


First two rows from ./steps_inputs/step2/missing_bam_unsorted_files.txt:



In [3]:
# Preparation for step 3:
import os
import csv
import glob

# Define file paths
input_file = "./steps_inputs/step3/atac_pseudobulk_new_peaks_files_mapping.txt"
output_file = "./steps_inputs/step3/missing_atac_pseudobulk_new_peaks_files_mapping.txt"
extra_ids_file = "./steps_inputs/step3/extra_peaks_ids_not_in_input_file.txt"
peaks_dir = "/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/peaks"

# Ensure the input file exists. Raising stops this cell with a clear error;
# calling exit() from a notebook cell asks the kernel to shut down instead.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file {input_file} does not exist.")

# Parse the input CSV once, keeping every row together with its ENCSR_ID
# (the last path component of the 'ID' column, surrounding '/' removed).
with open(input_file, "r", newline="") as infile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames
    input_rows = [(os.path.basename(row['ID'].strip('/')), row) for row in reader]

input_ids = {encsr_id for encsr_id, _ in input_rows}

# Immediate sub-directories of peaks_dir; the default makes a missing
# peaks_dir behave like an empty one instead of raising StopIteration.
peaks_ids = set(next(os.walk(peaks_dir), (peaks_dir, [], []))[1])

# Identify extra IDs in peaks_dir not present in input file
extra_ids = peaks_ids - input_ids

# Write extra IDs to a separate file
with open(extra_ids_file, "w") as extra_file:
    extra_file.write("ENCSR_ID\n")
    for encsr_id in sorted(extra_ids):
        extra_file.write(f"{encsr_id}\n")

# Copy to the output file every input row that has no peaks file on disk
with open(output_file, "w", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    # Write the header to the output file
    writer.writeheader()

    for encsr_id, row in input_rows:
        # A row counts as present when any <peaks_dir>/<ENCSR_ID>/*/*.bed.gz exists
        search_pattern = os.path.join(peaks_dir, encsr_id, "*", "*.bed.gz")
        if not glob.glob(search_pattern):
            writer.writerow(row)

print(f"Missing ENCSR_ID rows have been saved to {output_file}.")
print(f"Extra ENCSR_IDs in {peaks_dir} not found in the input file have been saved to {extra_ids_file}.")



Missing ENCSR_ID rows have been saved to ./steps_inputs/step3/missing_atac_pseudobulk_new_peaks_files_mapping.txt.


In [1]:
# !squeue --user=$USER


In [2]:
!squeue --user=$USER | wc -l
# 483

3


In [23]:
# ls -d $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_data/bams/*/ | wc -l

In [24]:
# ls $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_data/bams/*/ | tail

In [25]:
# ls -d $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_data/peaks/*/ | wc -l

In [77]:
# ls -d $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_data/peaks/*/ | head

In [27]:
# !ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_data/peaks/*/*/*_*.bed.gz | wc -l
#1490


In [28]:
# negative bed files
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_*/*nonpeaks_negatives.bed | wc -l
# 7224



7350


In [102]:
1490*5

7450

In [103]:
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_0/*nonpeaks_negatives.bed | wc -l
# 1490

1490


In [104]:
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_1/*nonpeaks_negatives.bed | wc -l
# 1490


1490


In [105]:
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_2/*nonpeaks_negatives.bed | wc -l
# 1490



1490


In [106]:
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_3/*nonpeaks_negatives.bed | wc -l
# 1432


1490


In [107]:
!ls -lt $GROUP_SCRATCH/eila/encode_pseudobulks/encode_pseudobulks_negative/*/*/*/fold_4/*nonpeaks_negatives.bed | wc -l
# 1490


1490


In [5]:
!squeue --user=$USER | wc -l

# 3

1


In [6]:
!squeue --user=$USER | sort -k1,1n


             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


In [14]:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/bams/ENCSR110XMU/ENCFF553COV.bed.gz_unsorted.bam



ls: cannot access /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/bams/ENCSR110XMU/ENCFF553COV.bed.gz_unsorted.bam: No such file or directory


In [1]:
# !ls ./local_logs/

In [None]:
# !scancel 60411410

In [2]:
# !ls ./steps_inputs/step5/peaks_filtered_by_blacklist_merged_with_organism_output.txt

In [3]:
# !head -2 ./steps_inputs/step5/peaks_filtered_by_blacklist_merged_with_organism_output.txt

In [4]:
# !tail -2 ./local_logs/step5.NegativesNoPeaksBackground.combined.err

In [21]:
 # execution
!squeue --user=$USER | wc -l

1


In [22]:
# !squeue --user=$USER | grep step5Neg | wc -l
!squeue --user=$USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


In [15]:
# !scancel --user=$USER


In [7]:
# !squeue --user=$USER
# !squeue --user=$USER | sort -k1,1n


In [8]:
# !rm ./local_logs/step5.*

In [5]:
# !scancel 63790365

In [9]:
# !scancel -u $USER

In [10]:
# %%bash

# while true; do
#     squeue --user=$USER
#     sleep 120
# done


In [11]:
# !scancel 57899498

In [12]:
# !rm ./local_logs/633*
# !./execute_sbatch_arrays_on_sherlock.sh ./steps_inputs/step6_3/debug_one_modisco_results_profile_scores_h5.txt step6-3-3-qc-bias-tn5.sh 1 1 


In [13]:
# !grep "qval" ./local_logs/slurm.step62.bpnetPipeline.combined.out

In [14]:
# !wc -l ./steps_inputs/step6/chrombpnet_pipeline_extracted_paths.txt
# !head -5 ./local_logs/slurm.step62.bpnetPipeline.combined.out

In [15]:
# !tail -5 ./local_logs/slurm.step62.bpnetPipeline.combined.out

In [16]:
# !grep modisco ./local_logs/slurm.step62.bpnetPipeline.combined.err

In [17]:
# !grep "qval" ./local_logs/slurm.step62.bpnetPipeline.combined.err

In [18]:
# !head -5 ./local_logs/slurm.step62.bpnetPipeline.combined.err

In [19]:
# !tail -5 ./local_logs/slurm.step62.bpnetPipeline.combined.err

In [20]:
# !scancel 58205153
# !scancel -u $USER
# !ls ./steps_inputs/step6/

In [120]:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_0/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 818


818


In [121]:

# fold 1:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_1/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 412



412


In [122]:

# fold 2:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_2/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 271



273


In [123]:

# fold 3:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_3/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 163


163


In [124]:

# fold 4:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_4/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 141



141


In [125]:

# all folds:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_*/step62.chrombpnet/auxiliary/interpret_subsample/modisco_results_profile_scores.h5  | wc -l
# 1805



1807


In [27]:
# 5416/2134

In [127]:
# !scancel -u $USER

In [42]:
# !squeue -u $USER -o "%i %j" | grep step5Neg


In [43]:
# scp -r eila@login.sherlock.stanford.edu:/scratch/users/eila/encode_pseudobulks_model_training/human/ENCSR037JDN/ENCFF933KCP/fold_1_30000_20240912_182023/evaluation . 

# !rm ./local_logs/slurm_samools_err.combined.err
# !rm ./local_logs/slurm_samools_out.combined.out

In [44]:
!ls execute*

execute_sbatch_arrays_on_sherlock-Copy1.sh
execute_sbatch_arrays_on_sherlock.sh
execute_sbatch_arrays_on_sherlock_try.sh


In [25]:
%%bash
# List every pipeline directory that has an overall_report.html under
# evaluation/ but no modisco_results_profile_scores.h5 under
# auxiliary/interpret_subsample.
find /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/*/*/fold_0/step62.bpnetPipeline/evaluation/ -name "overall_report.html" \
    | while IFS= read -r report; do
        # Strip the file name and its containing directory to get the pipeline directory
        parent_dir=$(dirname "$(dirname "$report")")

        # find exits 0 even when nothing matches, so its exit status cannot be
        # used as the test. grep -q . succeeds only if find printed a path.
        if ! find "$parent_dir/auxiliary/interpret_subsample" -name "modisco_results_profile_scores.h5" | grep -q .; then
            # Report the pipeline directory whose modisco results are missing
            echo "$parent_dir"
        fi
    done





In [32]:
import os
import glob

# Glob patterns for the two directories that hold the files being compared
evaluation_dir = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/*/*/fold_0/step62.bpnetPipeline/evaluation'
modisco_dir = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/*/*/fold_0/step62.bpnetPipeline/auxiliary/interpret_subsample'

# Directories that actually contain each file
overall_report_dirs = set(os.path.dirname(path) for path in glob.glob(f"{evaluation_dir}/overall_report.html"))
modisco_dirs = set(os.path.dirname(path) for path in glob.glob(f"{modisco_dir}/modisco_results_profile_scores.h5"))

# Pattern of the pipeline root (up to fold_0/step62.bpnetPipeline); kept for reference
base_path = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/*/*/fold_0/step62.bpnetPipeline'

# The two files live in different sub-directories, so the sets above can never
# share an element. Reduce both to their common pipeline root before comparing:
#   <root>/evaluation                    -> one level up
#   <root>/auxiliary/interpret_subsample -> two levels up
report_roots = {os.path.dirname(report_dir) for report_dir in overall_report_dirs}
modisco_roots = {os.path.dirname(os.path.dirname(scores_dir)) for scores_dir in modisco_dirs}

# Pipeline roots that have exactly one of the two files
missing_files_dirs = report_roots ^ modisco_roots

# Print out the result (sorted for a stable listing)
print(f"Directories missing at least one of the files ('overall_report.html' or 'modisco_results_profile_scores.h5'):")
for pipeline_dir in sorted(missing_files_dirs):
    print(pipeline_dir)


Directories missing at least one of the files ('overall_report.html' or 'modisco_results_profile_scores.h5'):


In [28]:
import os

def find_folders_without_file(base_path, target_file):
    """Return the folders under ``base_path`` that do not contain ``target_file``.

    Args:
        base_path: A directory path, or a glob pattern (``*``, ``?``, ``[...]``)
            matching several directories. os.walk() does not expand wildcards,
            so the pattern is expanded here first.
        target_file: Either a bare file name or a relative path with directory
            components.

            * Bare file name: every directory in the tree below each matched
              folder is checked, and each one lacking the file is returned.
            * Relative path: a file name list can never contain a path with
              separators, so the path is resolved against each matched folder
              and only those matched folders are checked (no recursion).

    Returns:
        list[str]: Folders where the file was not found. Matched start folders
        are processed in sorted order.
    """
    import glob  # local import: the defining cell only imports os

    # Fall back to the literal path when the pattern matches nothing, so a
    # plain directory whose name contains glob characters is still walked.
    start_dirs = sorted(glob.glob(base_path)) or [base_path]
    target_is_nested = os.path.dirname(target_file) != ""

    folders_without_file = []
    for start_dir in start_dirs:
        if not os.path.isdir(start_dir):
            continue

        if target_is_nested:
            if not os.path.isfile(os.path.join(start_dir, target_file)):
                folders_without_file.append(start_dir)
            continue

        for root, _dirs, files in os.walk(start_dir):
            if target_file not in files:
                folders_without_file.append(root)

    return folders_without_file

# Where to look, and which file every fold_0 folder is expected to hold
base_path = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_0/'
target_file = 'step62.bpnetPipeline/auxiliary/interpret_subsample/modisco_results_profile_scores.h5'

# Collect the folders that lack the target file
folders = find_folders_without_file(base_path=base_path, target_file=target_file)

# One folder per line
for folder in folders:
    print(folder)


In [29]:
# !ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/ENCSR000NVR/ENCFF585MYS/fold_0/step62.bpnetPipeline/auxiliary/interpret_subsample/modisco_results_profile_scores.h5



In [1]:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/peaks | wc -l



1492


In [59]:
!ls /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/ENCSR000NVR/ENCFF585MYS/fold_0/step62.bpnetPipeline/auxiliary/interpret_subsample/modisco_results_profile_scores.h5




/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/human/ENCSR000NVR/ENCFF585MYS/fold_0/step62.bpnetPipeline/auxiliary/interpret_subsample/modisco_results_profile_scores.h5


In [None]:
#  prep for step6-2-chrombpnet-pipelline.sh 
# step 1 : find BAM ID without model

In [5]:
# import glob
# import os
# import csv

# def find_bams_ids(base_bams_path):
#     """List all BAM IDs from the bams directory."""
#     return [d for d in os.listdir(base_bams_path) if os.path.isdir(os.path.join(base_bams_path, d))]

# def find_matching_files_with_glob(base_training_path, target_file_pattern, bam_id):
#     """Find modisco_results_profile_scores.h5 using glob pattern matching."""
#     # Correct the glob pattern
#     search_pattern = os.path.join(base_training_path, '*', bam_id, '*', 'fold_0', 'step62.bpnetPipeline', 'auxiliary', 'interpret_subsample', target_file_pattern)
    
#     # Print the search pattern for debugging
#     print(f"Search pattern for {bam_id}: {search_pattern}")
    
#     # Use glob to find the files matching the pattern
#     found_paths = glob.glob(search_pattern, recursive=True)
    
#     return found_paths, search_pattern

# def compare_files(base_bams_path, base_training_path, target_file, output_file):
#     """Compare BAM IDs and return those without a matching modisco_results_profile_scores.h5."""
#     bams_ids = find_bams_ids(base_bams_path)

#     # Open CSV file for writing the results
#     with open(output_file, mode='w', newline='') as file:
#         writer = csv.writer(file)
        
#         # Write header
#         writer.writerow(["BAM ID", "Found Path", "Destination Path (Expected)", "Match Found?"])
        
#         for bam_id in bams_ids:
#             # Define the BAM source path
#             bam_path = os.path.join(base_bams_path, bam_id)
            
#             # Find matching files for the target file using glob
#             matching_folders, expected_path = find_matching_files_with_glob(base_training_path, target_file, bam_id)
            
#             if matching_folders:
#                 # If matches found, write the actual folder path where the file was found
#                 for folder in matching_folders:
#                     writer.writerow([bam_id, folder, expected_path, "Yes"])
#             else:
#                 # If no match is found, write the expected search path template
#                 writer.writerow([bam_id, "", expected_path, "No"])
            
#     print(f"Results have been written to {output_file}")

# # Define the base paths and target file
# base_bams_path = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_data/bams'
# base_training_path = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training'
# target_file = 'modisco_results_profile_scores.h5'
# output_file = 'missing_models_bams_output.csv'  # Update this to your desired output file location

# # Run the comparison and save the output to the file
# compare_files(base_bams_path, base_training_path, target_file, output_file)


In [13]:
# step 2: prepare input for the model creation step: step6-2-chrombpnet-pipelline.sh 
import csv

def load_previous_results(previous_results_file):
    """Collect the BAM IDs that the previous results CSV marks as unmatched.

    The CSV is expected to have a header row, the BAM ID in the first column
    and the 'Match Found?' flag in the fourth column.

    Args:
        previous_results_file: Path of the CSV file to read.

    Returns:
        set[str]: BAM IDs of every row whose fourth column is exactly "No".
        Blank rows and rows with fewer than four columns are skipped; an
        empty file yields an empty set.
    """
    missing_bam_ids = set()  # Set to store BAM IDs with no match
    # newline='' is the mode the csv module expects for files it parses
    with open(previous_results_file, mode='r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) < 4:
                continue  # csv.reader yields [] for blank lines; short rows carry no flag
            if row[3] == "No":  # We care about rows where "Match Found?" is "No"
                missing_bam_ids.add(row[0])
    return missing_bam_ids

def find_missing_lines(input_file, missing_bam_ids, output_file):
    """Copy the lines of ``input_file`` whose BAM ID is in ``missing_bam_ids``.

    Each line is split on whitespace and the second column is taken as the
    BAM ID. Matching lines are written unchanged to ``output_file``, which is
    overwritten.

    Args:
        input_file: Path of the whitespace-separated text file to filter.
        missing_bam_ids: Collection of BAM IDs to keep (membership is tested with ``in``).
        output_file: Path of the file that receives the matching lines.

    Returns:
        None. A confirmation message is printed when done. Blank lines and
        lines with fewer than two columns are skipped instead of raising.
    """
    with open(input_file, mode='r') as infile, open(output_file, mode='w') as outfile:
        # Stream the file line by line; there is no need to hold it all in memory
        for line in infile:
            columns = line.split()
            if len(columns) < 2:
                continue  # no BAM ID column on this line
            if columns[1] in missing_bam_ids:
                outfile.write(line)  # Write the relevant line to the output file

    print(f"Missing entries have been written to {output_file}")

# Fold to process. Change this single value to run another fold; both the
# input and the output path below are derived from it so they cannot drift apart.
fold_index = 4

input_file = f'./steps_inputs/step6/chrombpnet_pipeline_extracted_paths_fold_{fold_index}.txt'  # Input text file
previous_results_file = 'missing_models_bams_output.csv'  # Previous results file (CSV)
output_file = f'./steps_inputs/step6/missing_chrombpnet_pipeline_extracted_paths_fold_{fold_index}.txt'  # Output file path

# NOTE(review): missing_models_bams_output.csv is written by the "step 1" cell,
# which is currently commented out, so the file must already exist on disk.
# That cell's search pattern is fixed to fold_0 - confirm the CSV is the right
# reference when fold_index is not 0.

# Load BAM IDs that didn't match from the previous script output
missing_bam_ids = load_previous_results(previous_results_file)

# Compare input file lines with missing BAM IDs and write missing entries to the output file
find_missing_lines(input_file, missing_bam_ids, output_file)


Missing entries have been written to ./steps_inputs/step6/missing_chrombpnet_pipeline_extracted_paths_fold_4.txt


In [6]:


# check step 6-3-3 output: how  many models were tested - count the folders:

 # / debug
# !ls -d /scratch/groups/akundaje/eila/encode_pseudobulks/old_encode_pseudobulks_model_training/*/*/*/fold_0/step62.bpnetPipeline/qc/out_step_6_3_3_motifs_qc_bias_tn5 | wc -l

# TF atlas:
!ls -d /scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/*/*/*/fold_0/step62.bpnetPipeline/qc/out_step_6_3_3_motifs_qc_bias_tn5 | wc -l

#973

973


In [5]:
import os
import glob

# Define the root directory to start the search from
root_dir = '/scratch/groups/akundaje/eila/encode_pseudobulks/encode_pseudobulks_model_training/'

# Sub-folders (inside each sample's QC output directory) that hold the logo images
valid_folders = ['neg_patterns_valid_logos', 'pos_patterns_valid_logos']
invalid_folders = ['neg_patterns_invalid_logos', 'pos_patterns_invalid_logos']

# One QC output directory per sample; the trailing '' makes glob return directories only
sample_dirs = glob.glob(os.path.join(
    root_dir, '*', '*', '*', 'fold_0', 'step62.bpnetPipeline', 'qc',
    'out_step_6_3_3_motifs_qc_bias_tn5', ''
))


def _count_logos(sample_dir, logo_folders):
    """Count the ``*.png`` files directly inside each of ``logo_folders`` under ``sample_dir``.

    A folder that does not exist contributes 0, because glob returns an empty
    list for it.
    """
    return sum(
        len(glob.glob(os.path.join(sample_dir, logo_folder, '*.png')))
        for logo_folder in logo_folders
    )


# Totals across all samples
valid_logos_count = sum(_count_logos(sample_dir, valid_folders) for sample_dir in sample_dirs)
invalid_logos_count = sum(_count_logos(sample_dir, invalid_folders) for sample_dir in sample_dirs)

# Number of QC output directories found (equivalent to `ls -d ... | wc -l`)
total_sample_count = len(sample_dirs)

# Print the results
print(f"Valid logos count: {valid_logos_count}")
print(f"Invalid logos count: {invalid_logos_count}")
print(f"Total sample count: {total_sample_count}")


Valid logos count: 62176
Invalid logos count: 0
Total sample count: 973


In [4]:
# Valid logos count: 62176
# Invalid logos count: 0
# Total sample count: 973