In [4]:
import pandas as pd
from pathlib import Path
import datetime

def get_files_info(directory_path):
    """
    Recursively collect metadata for every regular file under a directory.

    Args:
        directory_path (str or Path): Root directory to search.

    Returns:
        pd.DataFrame: One row per file, with the columns
            ``file_basename`` (file name including its extension),
            ``size`` (``st_size`` as reported by ``stat()``, in bytes),
            ``file_type`` (extension without the leading dot, ``""`` if none),
            ``file_full_path`` (absolute path as a string) and
            ``date_modified`` (``datetime`` built from ``st_mtime`` in local time).
            These columns are present even when no file is found, so callers
            can reference them (e.g. ``df['size']``) unconditionally.

    Notes:
        Entries that raise ``OSError`` while being inspected (for example a
        file removed mid-scan or one that is not accessible) are skipped
        instead of aborting the whole scan.
    """
    # Fixed schema: guarantees the column set (and order) even for an empty result.
    columns = ['file_basename', 'size', 'file_type', 'file_full_path', 'date_modified']

    # Accept both str and Path inputs.
    directory = Path(directory_path)

    records = []
    for file_path in directory.glob('**/*'):
        try:
            # The type check lives inside the try block because is_file() can
            # itself raise OSError (e.g. PermissionError) on some entries.
            if not file_path.is_file():
                continue

            # Single stat() call so size and mtime come from the same snapshot.
            file_stat = file_path.stat()

            records.append({
                'file_basename': file_path.name,
                'size': file_stat.st_size,
                # suffix includes the leading dot; strip it, or use "" when absent.
                'file_type': file_path.suffix[1:] if file_path.suffix else "",
                'file_full_path': str(file_path.absolute()),
                # st_mtime is a Unix timestamp; convert it to a datetime.
                'date_modified': datetime.datetime.fromtimestamp(file_stat.st_mtime),
            })
        except OSError:
            # Best effort: skip anything that cannot be inspected.
            # (FileNotFoundError and PermissionError are OSError subclasses.)
            continue

    return pd.DataFrame(records, columns=columns)

# Example usage (left commented out so that running this cell only defines the function):
# if __name__ == "__main__":
#     directory_path = "your_directory_path"  # Replace with your target directory
#     file_df = get_files_info(directory_path)
#     print(file_df)


In [8]:
# Directory whose contents are inventoried.
directory_path = Path(r"C:\Users\scott\OneDrive\share\ref")

# Scan the tree and add the size expressed in megabytes (1 MB = 1e6 bytes).
file_df = get_files_info(directory_path).assign(
    size_MB=lambda frame: frame['size'] / 1e6
)

# Show the files ordered from largest to smallest.
file_df.sort_values(by='size', ascending=False)

Unnamed: 0,file_basename,size,file_type,file_full_path,date_modified,size_MB
55069,pack-3890ac11c9c25611161a0dc371e27ea35de924e6....,382308750,pack,C:\Users\scott\OneDrive\share\ref\.git\objects...,2025-03-16 19:23:49.231817,382.308750
54667,7b80a20c54049b49450c27aa69146f2f96f33e,229423229,,C:\Users\scott\OneDrive\share\ref\.git\objects...,2025-03-16 16:54:48.034900,229.423229
53909,1c2389aa8440f7f87058e522f0cb52a1d66b26,229219496,,C:\Users\scott\OneDrive\share\ref\.git\objects...,2025-03-16 16:54:29.064270,229.219496
35020,Li19ZeroShotLearningIntrusion.pdf,83682813,pdf,C:\Users\scott\OneDrive\share\ref\obsidian\Obs...,2023-07-26 11:49:44.000000,83.682813
34729,He19TransferLearningFinancial.pdf,66529087,pdf,C:\Users\scott\OneDrive\share\ref\obsidian\Obs...,2023-08-07 12:31:24.000000,66.529087
...,...,...,...,...,...,...
20743,.gitkeep,0,,C:\Users\scott\OneDrive\share\ref\refwrangle\b...,2025-01-14 15:31:08.039157,0.000000
34600,Fraser24genAIhammerNail2.pdf,0,pdf,C:\Users\scott\OneDrive\share\ref\obsidian\Obs...,2024-03-03 13:34:19.000000,0.000000
37687,styles.css,0,css,C:\Users\scott\OneDrive\share\ref\obsidian\Obs...,2024-05-30 20:40:06.000000,0.000000
13121,recorder.B_SY1GJM.css,0,css,C:\Users\scott\OneDrive\share\ref\refwrangle\o...,2024-11-24 19:02:04.924734,0.000000
