# Count CSV Rows in Mask+Jaw Folders

This notebook counts the data rows (excluding the header row) of every CSV file found in the `labels\jaw` subfolder of each folder under `C:\Users\wanglab\Desktop\Mask+Jaw`.

The CSV files are expected at: `C:\Users\wanglab\Desktop\Mask+Jaw\<folder>\labels\jaw\*.csv` (for example `jaw_bottom_10.csv`); a folder may contain more than one.

In [11]:
from pathlib import Path
import csv
import pandas as pd

# Lift the row cap so displayed DataFrames are never truncated
pd.options.display.max_rows = None

## Configuration

In [12]:
# Root directory to scan: each immediate subfolder is checked for CSV files
# under <subfolder>/labels/jaw.
# NOTE: this is an absolute, machine-specific Windows path; edit it before
# running the notebook on another machine.
ROOT_DIR = r'C:\Users\wanglab\Desktop\Mask+Jaw'

## Helper Function

Function to count CSV data rows (header excluded) with automatic delimiter detection. Comma, space, semicolon, tab and pipe are tried; blank rows are ignored.

In [13]:
def count_csv_rows(csv_path: Path):
    """Count the data rows of a delimited text file, excluding the header.

    The delimiter is detected with ``csv.Sniffer`` among comma, space,
    semicolon, tab and pipe. If sniffing fails, the candidate that occurs
    most often in the first line is used (comma when none of them occurs).
    Rows whose cells are all empty or whitespace-only are not counted.

    Args:
        csv_path: Location of the file, as a ``pathlib.Path`` or a plain
            string path.

    Returns:
        A ``(row_count, status)`` tuple:

        * ``(n, "Success")`` - ``n`` is the number of non-empty rows minus
          one for the header, never below 0 (an empty file yields 0).
        * ``(None, "File not found")`` - the path is not an existing
          regular file.
        * ``(None, "Error: ...")`` - opening, decoding or parsing failed;
          the exception text is included in the status.
    """
    # Coerce so that plain string paths work as well as Path objects.
    csv_path = Path(csv_path)

    # is_file() is already False for paths that do not exist.
    if not csv_path.is_file():
        return None, "File not found"

    try:
        # newline='' hands raw line endings to the csv module, which deals
        # with \n, \r\n and newlines inside quoted fields on its own.
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM (written
        # by e.g. Excel) so it does not end up glued to the header cell.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            sample = f.read(8192)  # up to 8K characters for delimiter detection
            f.seek(0)

            try:
                dialect = csv.Sniffer().sniff(sample, delimiters=', ;\t|')
                reader = csv.reader(f, dialect=dialect)
            except csv.Error:
                # Sniffer could not decide: fall back to the candidate that
                # is most frequent in the first line. max() keeps the first
                # maximum, so comma wins ties (including "none present").
                first_line = f.readline()
                f.seek(0)
                candidates = [',', ' ', ';', '\t', '|']
                best_delimiter = max(candidates, key=first_line.count)
                reader = csv.reader(f, delimiter=best_delimiter)

            # Count rows that contain at least one non-blank cell.
            row_count = sum(
                1 for row in reader if any(cell.strip() for cell in row)
            )

        return max(0, row_count - 1), "Success"  # subtract header
    except Exception as e:
        # Best-effort by design: report the failure instead of aborting
        # the whole folder scan.
        return None, f"Error: {e}"

## Scan Folders and Count Rows

In [14]:
# Validate root directory before scanning
root = Path(ROOT_DIR)
if not root.exists():
    raise FileNotFoundError(f'Root path does not exist: {root}')
if not root.is_dir():
    raise NotADirectoryError(f'Root is not a directory: {root}')

# Accumulators read by the cells below
results = []          # one dict per CSV file, or one placeholder per folder without any
total_rows = 0        # sum of data rows over all successfully read CSV files
folders_found = 0     # number of CSV files read successfully (a folder may hold several)
folders_missing = 0   # folders without labels/jaw or without CSVs, plus CSV files that failed to read

# Immediate subdirectories of root, in a stable (sorted) order
children = sorted(p for p in root.iterdir() if p.is_dir())

print(f"Scanning {len(children)} folders in {ROOT_DIR}\n")

for child in children:
    jaw_dir = child / 'labels' / 'jaw'

    if jaw_dir.is_dir():
        # Sorted so the table order does not depend on the filesystem
        csv_files = sorted(jaw_dir.glob('*.csv'))
        missing_reason = 'No CSV files found'
    else:
        csv_files = []
        missing_reason = 'labels/jaw directory not found'

    if not csv_files:
        # Nothing to count here: record a placeholder row for the folder
        results.append({
            'Folder': child.name,
            'CSV File': 'N/A',
            'Rows (excluding header)': 0,
            'Status': missing_reason
        })
        folders_missing += 1
        continue

    for csv_file in csv_files:
        row_count, status = count_csv_rows(csv_file)
        succeeded = row_count is not None

        results.append({
            'Folder': child.name,
            'CSV File': csv_file.name,
            'Rows (excluding header)': row_count if succeeded else 0,
            'Status': status
        })

        if succeeded:
            total_rows += row_count
            folders_found += 1
        else:
            # A file that could not be read counts towards "Missing or failed"
            folders_missing += 1

print(f"Found CSV files in {folders_found} cases")
print(f"Missing or failed: {folders_missing}")
print(f"Total rows (excluding headers): {total_rows}")


Scanning 69 folders in C:\Users\wanglab\Desktop\Mask+Jaw

Found CSV files in 66 cases
Missing or failed: 4
Total rows (excluding headers): 7003


## Display Results Table

In [15]:
# One table row per entry collected during the scan
df = pd.DataFrame(results)

# Single-row frame holding the summary, shaped like the per-file rows
totals = pd.DataFrame({
    'Folder': ['TOTAL'],
    'CSV File': [f'{folders_found} files'],
    'Rows (excluding header)': [total_rows],
    'Status': [f'{folders_found} found, {folders_missing} missing'],
})

# Summary goes last; the index is renumbered across both parts
df_with_totals = pd.concat((df, totals), ignore_index=True)

# Banner followed by the full table
print("\n" + "=" * 80, "CSV Row Counts by Folder:", "=" * 80, sep="\n")
display(df_with_totals)



CSV Row Counts by Folder:


Unnamed: 0,Folder,CSV File,Rows (excluding header),Status
0,10_bottom,jaw_bottom_10.csv,29,Success
1,10_side,jaw_side_10.csv,38,Success
2,11_bottom,jaw_bottom_11.csv,20,Success
3,11_side,jaw_side_11.csv,46,Success
4,12_bottom,jaw_bottom_12.csv,21,Success
5,12_side,jaw_side_12.csv,57,Success
6,13_bottom,jaw_bottom_13.csv,14,Success
7,13_side,jaw_side_13.csv,66,Success
8,13_side,jaw_side_13_reordered.csv,66,Success
9,14_bottom,jaw_bottom_14.csv,14,Success


## Show Only Valid Results (Folders with CSV Files)

In [None]:
# Keep only the CSV files that were read successfully.
# Each row of df is one CSV file, and a folder can contribute several files,
# so the file count and the folder count are reported separately.
df_valid = df[df['Status'] == 'Success'].copy()

if len(df_valid) > 0:
    n_valid_folders = df_valid['Folder'].nunique()
    print(f"\nValid CSV files ({len(df_valid)} files in {n_valid_folders} folders):")
    print("="*80)
    display(df_valid)

    # Summary statistics over the per-file row counts
    valid_row_counts = df_valid['Rows (excluding header)']
    print(f"\nStatistics:")
    print(f"  Min rows: {valid_row_counts.min()}")
    print(f"  Max rows: {valid_row_counts.max()}")
    print(f"  Mean rows: {valid_row_counts.mean():.2f}")
    print(f"  Median rows: {valid_row_counts.median():.2f}")
else:
    print("No valid CSV files found.")