In [30]:
import os
import mimetypes
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

In [9]:
def analyze_file(file_path):
    """Collect basic metadata for a single file.

    Parameters
    ----------
    file_path : str
        Path of the file to inspect.

    Returns
    -------
    dict
        On success, a record with ``file_name``, ``size`` (``st_size`` from
        ``os.stat``, in bytes) and ``type`` (MIME type guessed from the file
        name by ``mimetypes.guess_type``, or ``'unknown'`` when no guess is
        available). Files whose guessed type starts with ``'text'`` also get
        ``word_count``: the number of whitespace-separated tokens, read with
        undecodable bytes ignored.

        On failure, a record with ``file`` (the path as given), ``file_name``
        (``None`` if it could not be derived), ``size`` set to ``None``,
        ``type`` set to ``'error'`` and ``error`` holding the exception text.

    Notes
    -----
    Any ``Exception`` is printed and converted into the failure record rather
    than raised, so one unreadable file does not abort a bulk scan.
    """
    # Bound before anything can fail: the error record below reads this name,
    # and os.stat may raise before the basename has been computed.
    file_name = None
    try:
        file_name = os.path.basename(file_path)
        file_size = os.stat(file_path).st_size
        file_type, _ = mimetypes.guess_type(file_path)
        if file_type is None:
            file_type = 'unknown'

        if not file_type.startswith('text'):
            return {"file_name": file_name, "size": file_size, "type": file_type}

        with open(file_path, 'r', errors='ignore') as file:
            # Count per line instead of loading the whole file: a word never
            # spans a line break, so the total equals len(file.read().split()).
            word_count = sum(len(line.split()) for line in file)
        return {"file_name": file_name, "size": file_size, "type": file_type, "word_count": word_count}

    except Exception as e:
        print(f"Failed to analyze {file_path}: {e}")
        return {"file": file_path, "file_name": file_name, "size": None, "type": "error", "error": str(e)}

def analyze_directory(directory_path, max_workers=5):
    """Run ``analyze_file`` on every file found under ``directory_path``.

    Parameters
    ----------
    directory_path : str
        Root of the tree to walk with ``os.walk`` (recursive).
    max_workers : int, optional
        Size of the thread pool used to analyze files. Defaults to 5.

    Returns
    -------
    list
        One ``analyze_file`` result per file, in ``os.walk`` order
        (directory by directory, files in the order ``os.walk`` lists them).
        Empty when the walk yields no files; ``os.walk`` is called with its
        default error handling, so an unreadable or missing root simply
        produces no entries.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for root, _, files in os.walk(directory_path):
            file_paths = [os.path.join(root, name) for name in files]
            # executor.map yields results in input order; extend() drains it,
            # so each directory's files are finished before the walk moves on.
            results.extend(executor.map(analyze_file, file_paths))
    return results

# if __name__ == "__main__":
#     directory_path = "/path/to/your/directory"  # Replace with the path to your directory
#     results = analyze_directory(directory_path)
#     with open("results.json", "w") as f:
#         json.dump(results, f)

In [10]:
# Build a path that points three levels above the current working directory.
# The parent-directory components are appended as-is; the path is not normalized.
three_levels_back_directory = os.path.join(os.getcwd(), *([os.pardir] * 3))

print("Three levels back directory:", three_levels_back_directory)

Three levels back directory: c:\Users\Max\OneDrive\Documents\GitHub\experiments\directory_traveler\..\..\..


In [36]:
# Candidate roots for a scan (absolute, machine-specific Windows paths).
f_drive = 'f:/'
users_directory = 'c:/Users'
onedrive_directory = 'c:/Users/OneDrive/'

In [37]:
# Scan everything under the F: drive root; each file found by the recursive
# walk contributes one record to `results`.
results = analyze_directory(f_drive)

In [None]:
# Number of per-file records collected by the scan.
len(results)

585450

In [None]:
# One row per record. The records do not all share the same keys (only text
# files carry "word_count", only failures carry "error"), so expect missing
# values in those columns.
df = pd.DataFrame(results)

In [None]:
# Total size across all records in decimal gigabytes: "size" holds st_size
# in bytes, and 1 GB is taken as 10**9 bytes.
df['size'].sum() / 1_000_000_000

140.195081382