In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Path to the core directory
core_dir = "../data/core"

In [2]:
# Function to list data files
def list_data_files(directory):
    """Recursively list all data files in the directory"""
    data_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(('.txt', '.csv')):
                data_files.append(os.path.join(root, file))
    return data_files

In [3]:
# List all data files
data_files = list_data_files(core_dir)
print(f"Found {len(data_files)} data files in {core_dir}")
print("\nFirst few files:")
for file in data_files[:5]:
    print(file)

Found 529 data files in ../data/core

First few files:
../data/core/linked-external-data/led_l_seda_demo_c.csv
../data/core/linked-external-data/led_l_lodes.csv
../data/core/linked-external-data/led_l_denspop.csv
../data/core/linked-external-data/led_l_urbsat.csv
../data/core/linked-external-data/led_l_no2.csv


In [4]:
# Function to load ABCD file
def load_abcd_file(filepath):
    """Load an ABCD data file and return basic information"""
    try:
        # Try to load the file
        df = pd.read_csv(filepath, delimiter='\t', low_memory=False)
        
        # Get basic information
        info = {
            'shape': df.shape,
            'columns': list(df.columns),
            'missing_values': df.isnull().sum().sum(),
            'file_name': os.path.basename(filepath)
        }
        
        return df, info
    except Exception as e:
        print(f"Error loading {filepath}: {str(e)}")
        return None, None

In [5]:
# Load the first file as an example
if data_files:
    sample_file = data_files[0]
    print(f"\nLoading sample file: {sample_file}")
    
    df, info = load_abcd_file(sample_file)
    
    if df is not None:
        print("\nFile Information:")
        print(f"Shape: {info['shape']}")
        print(f"Number of columns: {len(info['columns'])}")
        print(f"Total missing values: {info['missing_values']}")
        
        print("\nFirst few rows:")
        print(df.head())
        
        print("\nColumn names:")
        for col in info['columns']:
            print(col) 


Loading sample file: ../data/core/linked-external-data/led_l_seda_demo_c.csv

File Information:
Shape: (10519, 1)
Number of columns: 1
Total missing values: 0

First few rows:
  src_subject_id,eventname,ledsch_seda_c_baplusavgall,ledsch_seda_c_hsecdnec,ledsch_seda_c_hsflnfl,ledsch_seda_c_lninc50avgall,ledsch_seda_c_perasn,ledsch_seda_c_perblk,ledsch_seda_c_perecd,ledsch_seda_c_perell,ledsch_seda_c_perfl,ledsch_seda_c_perfrl,ledsch_seda_c_perhsp,ledsch_seda_c_perind,ledsch_seda_c_perrl,ledsch_seda_c_perspeced,ledsch_seda_c_perwht,ledsch_seda_c_povertyavgall,ledsch_seda_c_rsecdnec,ledsch_seda_c_rsflnfl,ledsch_seda_c_rural,ledsch_seda_c_sesavgall,ledsch_seda_c_single_momavgall,ledsch_seda_c_snapavgall,ledsch_seda_c_suburb,ledsch_seda_c_totenrl,ledsch_seda_c_town,ledsch_seda_c_unempavgall,ledsch_seda_c_urban
0  NDAR_INV005V6D2C,baseline_year_1_arm_1,0.36073...                                                                                                                                   