# Load the dataset and take an overall look

In [6]:
# import lib for data processing
import pandas as pd

# Read the EEG recordings CSV into a DataFrame.
# The path is relative to the notebook's working directory.
csv_path = '../adhdata.csv'
df = pd.read_csv(csv_path)

In [7]:
# Preview the first five rows to get a feel for the columns and values
df.head(n=5)

Unnamed: 0,Fp1,Fp2,F3,F4,C3,C4,P3,P4,O1,O2,...,F8,T7,T8,P7,P8,Fz,Cz,Pz,Class,ID
0,261.0,402.0,16.0,261.0,126.0,384.0,126.0,236.0,52.0,236.0,...,16.0,200.0,494.0,126.0,236.0,121.0,367.0,121.0,ADHD,v10p
1,121.0,191.0,-94.0,85.0,16.0,200.0,126.0,52.0,347.0,273.0,...,-57.0,126.0,347.0,52.0,52.0,15.0,121.0,-19.0,ADHD,v10p
2,-55.0,85.0,-204.0,15.0,-57.0,200.0,52.0,126.0,236.0,200.0,...,-94.0,126.0,420.0,52.0,126.0,-55.0,261.0,85.0,ADHD,v10p
3,191.0,85.0,52.0,50.0,89.0,236.0,163.0,89.0,89.0,89.0,...,-57.0,236.0,420.0,126.0,126.0,15.0,85.0,-55.0,ADHD,v10p
4,-55.0,-125.0,-204.0,-160.0,-204.0,16.0,-241.0,-241.0,89.0,16.0,...,-131.0,89.0,310.0,-57.0,52.0,-55.0,15.0,-336.0,ADHD,v10p


# Now check NaN values and possible values for each feature

In [13]:
import pandas as pd
import os

# `df` is expected to have been loaded by an earlier cell (pd.read_csv).

# Create the output directory if it doesn't exist.
# The report and summary CSV produced by this cell are written under 'notes/',
# so that is the directory that must exist before the files are opened.
os.makedirs('notes', exist_ok=True)

# Build one summary row per participant: class label, first/last row index,
# number of records and the recording duration derived from the record count.

# Divisor used to turn a record count into seconds.
# NOTE(review): 128 is carried over from the original hard-coded value —
# presumably the sampling rate in Hz; confirm against the dataset description.
SAMPLING_RATE_HZ = 128

participant_ranges = []

# A single groupby pass replaces re-scanning the whole frame with a boolean
# mask for every ID. sort=False visits IDs in order of first appearance,
# matching the order df['ID'].unique() would give.
# Rows whose ID is NaN are not grouped; the "total records" validation that
# follows will flag them as a mismatch instead of this loop crashing.
for participant_id, participant_data in df.groupby('ID', sort=False):
    num_records = len(participant_data)
    duration_seconds = num_records / SAMPLING_RATE_HZ  # duration in seconds

    participant_ranges.append({
        'ID': participant_id,
        'Class': participant_data['Class'].iloc[0],
        'First_Record': participant_data.index[0],
        'Last_Record': participant_data.index[-1],
        'Num_Records': num_records,
        'Duration_Seconds': round(duration_seconds, 2),
        'Duration_Minutes': round(duration_seconds / 60, 2),
    })

# Create a summary dataframe
summary_df = pd.DataFrame(participant_ranges)

# Sort by first record so rows follow the order participants appear in df
summary_df = summary_df.sort_values('First_Record').reset_index(drop=True)

# VALIDATION CHECKS
# Each check keeps its own outcome so that a failure in one check can neither
# suppress nor falsify the success message of another. `validation_passed`
# is the conjunction of all four and is what the rest of the cell reads.
print("Running validation checks...")

# Check 1: per-participant record counts must add up to the frame length
calculated_total = summary_df['Num_Records'].sum()
actual_total = len(df)
totals_match = bool(calculated_total == actual_total)
if totals_match:
    print(f"‚úÖ Total records match: {actual_total:,}")
else:
    print(f"‚ùå VALIDATION FAILED: Calculated total ({calculated_total}) != Actual total ({actual_total})")

# Check 2: no gaps or overlaps. summary_df is ordered by First_Record, so each
# participant's Last_Record + 1 must equal the next participant's First_Record.
ordered_ids = summary_df['ID'].tolist()
range_starts = summary_df['First_Record'].tolist()
range_ends = summary_df['Last_Record'].tolist()
ranges_contiguous = True
for pos in range(len(ordered_ids) - 1):
    if range_ends[pos] + 1 != range_starts[pos + 1]:
        print(f"‚ùå VALIDATION FAILED: Gap/overlap between {ordered_ids[pos]} and {ordered_ids[pos + 1]}")
        ranges_contiguous = False

if ranges_contiguous:
    print("‚úÖ No gaps or overlaps detected")

# Check 3: sanity-check the stored (rounded) duration of the first participant
# against Num_Records / 128, allowing for the 2-decimal rounding.
first_participant = summary_df.iloc[0]
expected_duration = first_participant['Num_Records'] / 128
duration_ok = not (abs(first_participant['Duration_Seconds'] - expected_duration) > 0.01)
if duration_ok:
    print(f"‚úÖ Duration calculations are correct")
else:
    print(f"‚ùå VALIDATION FAILED: Duration calculation incorrect")

# Check 4: every participant must carry exactly one class label.
# One grouped nunique() replaces a full-frame scan per participant.
classes_per_participant = df.groupby('ID', sort=False)['Class'].nunique()
labels_consistent = True
for pid in classes_per_participant[classes_per_participant > 1].index:
    print(f"‚ùå VALIDATION FAILED: Participant {pid} has multiple class labels")
    labels_consistent = False

# Only claim consistency when no participant failed the check
if labels_consistent:
    print(f"‚úÖ All participants have consistent class labels\n")

validation_passed = totals_match and ranges_contiguous and duration_ok and labels_consistent

if validation_passed:
    print("=" * 80)
    print("ALL VALIDATION CHECKS PASSED ‚úÖ")
    print("=" * 80 + "\n")
else:
    print("=" * 80)
    print("‚ö†Ô∏è  VALIDATION FAILED - Please review the issues above")
    print("=" * 80 + "\n")

# Assemble the text report as a list of lines; they are joined with newlines
# when the report is written to disk.
rule = "=" * 80
class_column = summary_df['Class']
adhd_count = len(summary_df[class_column == 'ADHD'])
control_count = len(summary_df[class_column == 'Control'])

output_lines = [
    rule,
    "EEG DATASET PARTICIPANT ANALYSIS",
    rule,
    f"\nTotal Participants: {len(summary_df)}",
    f"Total Records: {len(df):,}",
    f"\nADHD Participants: {adhd_count}",
    f"Control Participants: {control_count}",
    "\n" + rule,
]

# Per-participant table of record ranges
output_lines += [
    "\nParticipant Record Ranges:",
    "-" * 80,
    summary_df.to_string(index=False),
]

# Overall recording-duration statistics
seconds = summary_df['Duration_Seconds']
minutes = summary_df['Duration_Minutes']
output_lines += ["\n" + rule, "RECORDING DURATION STATISTICS", rule]
for label, stat in (("\nAverage", "mean"), ("Minimum", "min"), ("Maximum", "max"), ("Median", "median")):
    output_lines.append(
        f"{label} recording duration: {seconds.agg(stat):.2f} seconds ({minutes.agg(stat):.2f} minutes)"
    )
output_lines.append(f"Std deviation: {seconds.std():.2f} seconds")

# The same statistics broken down by class label, in sorted label order
output_lines += ["\n" + rule, "DURATION BY CLASS", rule]
for class_name in sorted(class_column.unique()):
    in_class = class_column == class_name
    class_seconds = summary_df.loc[in_class, 'Duration_Seconds']
    class_minutes = summary_df.loc[in_class, 'Duration_Minutes']
    output_lines.extend([
        f"\n{class_name}:",
        f"  Count: {len(class_seconds)} participants",
        f"  Average: {class_seconds.mean():.2f} seconds ({class_minutes.mean():.2f} minutes)",
        f"  Min: {class_seconds.min():.2f} seconds",
        f"  Max: {class_seconds.max():.2f} seconds",
        f"  Std Dev: {class_seconds.std():.2f} seconds",
    ])

# Data integrity check: walk consecutive rows of summary_df (ordered by
# First_Record) and record every pair whose ranges do not abut exactly.
output_lines.append("\n" + "=" * 80)
output_lines.append("DATA INTEGRITY CHECK")
output_lines.append("=" * 80)
has_issues = False
report_ids = summary_df['ID'].tolist()
report_starts = summary_df['First_Record'].tolist()
report_ends = summary_df['Last_Record'].tolist()
for pos in range(len(report_ids) - 1):
    current_last = report_ends[pos]
    next_first = report_starts[pos + 1]

    if current_last + 1 != next_first:
        output_lines.append(f"‚ö†Ô∏è  Gap detected between {report_ids[pos]} and {report_ids[pos + 1]}")
        output_lines.append(f"   Records {current_last} to {next_first}")
        has_issues = True

if not has_issues:
    output_lines.append("‚úÖ All participant records are consecutive with no gaps or overlaps!")

output_lines.append("\n" + "=" * 80)

# Add validation summary.
# "Total Records Validation" reports the totals comparison itself; the overall
# outcome of every validation check is reported on its own line so that an
# unrelated failure is not shown as a totals mismatch (or vice versa).
reported_total = summary_df['Num_Records'].sum()
totals_ok = reported_total == len(df)
output_lines.append("\nVALIDATION SUMMARY")
output_lines.append("=" * 80)
output_lines.append(f"Total Records Validation: {'‚úÖ PASSED' if totals_ok else '‚ùå FAILED'}")
output_lines.append(f"Calculated Total: {reported_total:,}")
output_lines.append(f"Actual Total: {len(df):,}")
output_lines.append(f"Data Continuity: {'‚úÖ PASSED' if not has_issues else '‚ùå FAILED'}")
output_lines.append(f"Overall Validation: {'‚úÖ PASSED' if validation_passed else '‚ùå FAILED'}")
output_lines.append("=" * 80)

# Persist the assembled report as a UTF-8 text file
output_file = 'notes/participant_analysis_report.txt'
report_text = '\n'.join(output_lines)
with open(output_file, mode='w', encoding='utf-8') as report_handle:
    report_handle.write(report_text)

print(f"\n‚úÖ Analysis report saved to: {output_file}")
print(f"üìä Summary DataFrame shape: {summary_df.shape}")
print(f"üìÅ File location: {os.path.abspath(output_file)}")

# Store the per-participant summary next to the report as CSV (no index column)
csv_file = 'notes/participant_summary.csv'
summary_df.to_csv(csv_file, index=False)
print(f"üìä Summary CSV saved to: {csv_file}")

# Leave the summary as the cell's last expression so the notebook displays it
summary_df

Running validation checks...
‚úÖ Total records match: 2,166,383
‚úÖ No gaps or overlaps detected
‚úÖ Duration calculations are correct
‚úÖ All participants have consistent class labels

ALL VALIDATION CHECKS PASSED ‚úÖ


‚úÖ Analysis report saved to: notes/participant_analysis_report.txt
üìä Summary DataFrame shape: (121, 7)
üìÅ File location: c:\Users\ADMIN\Documents\My GitHub Project\adhd-detection\src\notes\participant_analysis_report.txt
üìä Summary CSV saved to: notes/participant_summary.csv


Unnamed: 0,ID,Class,First_Record,Last_Record,Num_Records,Duration_Seconds,Duration_Minutes
0,v10p,ADHD,0,14303,14304,111.75,1.86
1,v12p,ADHD,14304,31907,17604,137.53,2.29
2,v14p,ADHD,31908,49469,17562,137.20,2.29
3,v15p,ADHD,49470,92721,43252,337.91,5.63
4,v173,ADHD,92722,116962,24241,189.38,3.16
...,...,...,...,...,...,...,...
116,v306,Control,2065639,2083939,18301,142.98,2.38
117,v307,Control,2083940,2106746,22807,178.18,2.97
118,v308,Control,2106747,2123771,17025,133.01,2.22
119,v309,Control,2123772,2148589,24818,193.89,3.23
