In [1]:
import os
import sys
import glob
import csv
import shutil
import pandas as pd

In [2]:
def recursive_csv_to_dataframe(path, pattern, df=None):
    """
    Walk a directory tree and stack every matching CSV file into one DataFrame.

    A file matches when its name starts with `pattern` and ends with '.csv'.
    Directories and files are visited in sorted order so that the row order of
    the result is the same on every run and on every filesystem.

    Parameters:
    path: Root directory to search; all of its subdirectories are searched too.
    pattern: Prefix that the CSV file names must start with.
    df: Optional DataFrame placed in front of the CSV data. When omitted, the
        result contains only the CSV data.

    Returns:
    DataFrame: `df` (if given) followed by the rows of every matching CSV file,
    with a fresh 0..n-1 index. When no file matches, `df` itself is returned,
    or a new empty DataFrame if `df` was omitted.
    """
    frames = []
    for root, dirs, files in os.walk(path):
        # Sorting in place steers os.walk's top-down traversal into a stable order.
        dirs.sort()
        for file_name in sorted(files):
            if not (file_name.startswith(pattern) and file_name.endswith('.csv')):
                continue
            try:
                frames.append(pd.read_csv(os.path.join(root, file_name)))
            except pd.errors.EmptyDataError:
                # A file with nothing to parse contributes no rows; skip it on purpose.
                continue

    if df is not None:
        if not frames:
            return df
        frames.insert(0, df)
    elif not frames:
        # Hand out a new object each time instead of one shared default instance.
        return pd.DataFrame()

    # A single concat over the collected frames avoids re-copying the
    # accumulated data once per file.
    return pd.concat(frames, ignore_index=True, sort=False)


In [3]:
# Gather every "data_cleaning_*.csv" found under the ratings tree into one table,
# then write the combined table to the current working directory.
df = recursive_csv_to_dataframe(
    path='/Users/nathalia.esper/Documents/Ongoing_Projects/Kaggle/ratings/',
    pattern='data_cleaning_',
)
df.to_csv(path_or_buf='all_data_cleaning.csv')


In [None]:
# Gather every "sleeplog_*.csv" found under the ratings tree into one table.
df_sleeplog = recursive_csv_to_dataframe('/Users/nathalia.esper/Documents/Ongoing_Projects/Kaggle/ratings/', 'sleeplog_')
# index=False: read_all_sleeplogs takes the FIRST column of the combined file as
# the participant ID. Writing the row index would put a row counter in that
# position instead of the first column of the sleeplogs.
# NOTE(review): assumes the ID is the first column of each sleeplog file - confirm.
df_sleeplog.to_csv('all_sleeplogs.csv', index=False)


* Next, read the combined sleeplog file for all subjects, extract the participant IDs, and copy each participant's CSV file from `output_<ID>/meta/csv` into the destination directory.

In [34]:
def read_all_sleeplogs(path):
    """
    Load the combined sleeplog CSV and keep only its first column.

    Parameters:
    path: Location of the CSV file to read (handed straight to pandas.read_csv).

    Returns:
    Series: The first column of the file, selected by position; the notebook
    uses it as the list of participant IDs that have a sleeplog.
    """
    return pd.read_csv(path).iloc[:, 0]

In [43]:
def copy_files(df_IDs, source_dir, destination_dir, dir_pattern):
    """
    Copy each participant's '<ID>.gt3x.RData.csv' file into destination_dir.

    For every ID the source file is looked up at
    <source_dir>/output_<ID> + dir_pattern + <ID>.gt3x.RData.csv.
    IDs whose file cannot be found are skipped so that one missing participant
    does not abort the whole batch.

    Parameters:
    df_IDs: Iterable of participant IDs (e.g. the Series from read_all_sleeplogs).
    source_dir: Directory holding one 'output_<ID>' folder per participant.
    destination_dir: Directory that receives the copies; created if missing.
    dir_pattern: Text inserted between the participant folder and the file name.
        It is concatenated as-is, so it must carry its own separators
        (e.g. '/meta/csv/').

    Returns:
    None
    """
    # shutil.copy treats a non-existent destination as a FILE name, so without
    # this every source would be written to the same path, one over the other.
    os.makedirs(destination_dir, exist_ok=True)

    for value in df_IDs:
        # A missing ID (NaN/None) cannot name a file; string concatenation on it
        # would raise TypeError.
        if pd.isna(value):
            continue
        subject_id = str(value)

        subject_dir = os.path.join(source_dir, 'output_' + subject_id)
        source_file = subject_dir + dir_pattern + subject_id + '.gt3x.RData.csv'

        # Check the file itself, not just the participant folder: a folder
        # without the expected CSV would otherwise raise FileNotFoundError.
        if os.path.isfile(source_file):
            shutil.copy(source_file, destination_dir)

In [39]:
# Load the first column of the combined sleeplog file, then preview three entries.
df_IDs = read_all_sleeplogs(
    path='/Users/nathalia.esper/Documents/Ongoing_Projects/Kaggle/all_sleeplogs.csv'
)
df_IDs.head(n=3)

0    NDARJA830BYV
1    NDARBG188RA5
2    NDARHU936MNP
Name: ID, dtype: object

In [42]:
# Text placed between each 'output_<ID>' folder and the CSV file name.
dir_pattern = '/meta/csv/'
# Folder that receives the copied CSV files.
destination_dir = '/Users/nathalia.esper/Documents/Ongoing_Projects/Kaggle/csv_files'
# Folder searched for the per-participant 'output_<ID>' directories.
source_dir = '/Users/nathalia.esper/Documents/Actigraph/data/Batch_2'
copy_files(
    df_IDs=df_IDs,
    source_dir=source_dir,
    destination_dir=destination_dir,
    dir_pattern=dir_pattern,
)

/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARJA830BYV
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARBG188RA5
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARHU936MNP
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARJC399UW7
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARBN365EV3
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARBJ159HXB
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARBF851NH6
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARJA788CH7
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARZT199MF6
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARHU910KZC
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARJF565ZRA
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARHT774ZK1
/Users/nathalia.esper/Documents/Actigraph/data/Batch_2/output_NDARBB118UDB
/Users/nathalia.esper/Doc