In [1]:
import os
import mimetypes
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time

### Helper Functions

In [2]:
def get_file_paths(root_path):
    file_counts = []
    for root, _, files in os.walk(root_path):
        path_information = {}
        path_information['path'] = root
        path_information['file_count'] = len(files)
        file_counts.append(path_information)
    return pd.DataFrame(file_counts)

In [3]:
def split_paths_into_groups(df, num_groups):
    total_files = df['file_count'].sum()
    target_files_per_group = total_files // num_groups
    df_sorted = df.sort_values(by='file_count', ascending=False)
    current_group = 0
    current_file_count = 0
    groups = [[] for _ in range(num_groups)]
    # Iterate through paths
    for _, row in df_sorted.iterrows():
        if current_file_count + row['file_count'] <= target_files_per_group:
            # Add path to current group
            groups[current_group].append(row['path'])
            current_file_count += row['file_count']
        else:
            # Move to the next group
            current_group += 1
            if current_group >= num_groups:
                break
            groups[current_group].append(row['path'])
            current_file_count = row['file_count']
    
    return groups

In [4]:
def get_file_extension(filename):
    _, extension = os.path.splitext(filename)
    return extension

### Main Functions

In [5]:
def analyze_file(file_path):
    file_metadata = {}
    file_type, _ = mimetypes.guess_type(file_path)
    try:
        file_metadata['file_name'] = os.path.basename(file_path)
        file_metadata['file_extension'] = get_file_extension(file_path)
        file_metadata['file_creation_time'] = time.ctime(os.path.getctime(file_path))
        file_metadata['file_modification_time'] = time.ctime(os.path.getmtime(file_path))
        file_metadata['file_access_time'] = time.ctime(os.path.getatime(file_path))
        file_metadata['file_size_mb'] = os.stat(file_path).st_size / 1_000_000
        file_metadata['file_path'] = file_path 
        file_metadata['file_type'] = file_type
        if file_type is None:
            file_metadata['file_type'] = 'unknown'
        return file_metadata
    
    except Exception as e:
        print(f"Failed to analyze {file_path}: {e}")




def analyze_directory(directory_path):
    results = []
    for root, _, files in os.walk(directory_path):
        file_paths = [os.path.join(root, file) for file in files]
        for file_path in file_paths:
            print(file_path)
            file_metadata = analyze_file(file_path)
            results.extend(file_metadata)

    return results

def analyze_directory_paths(directory_paths):
    results = []
    for directory in directory_paths:
        file_metadata = analyze_directory(directory)
        print(file_metadata)
        results.extend(file_metadata)
    return results

def multithreaded_analyze_directory(directory_path):
    workers = 2 * os.cpu_count()
    file_paths_df = get_file_paths(directory_path)
    file_paths_groups = split_paths_into_groups(file_paths_df, workers)
    print(file_paths_groups)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        results = list(executor.map(analyze_directory_paths, file_paths_groups))

    return results

# if __name__ == "__main__":
#     directory_path = "/path/to/your/directory"  # Replace with the path to your directory
#     results = analyze_directory(directory_path)
#     with open("results.json", "w") as f:
#         json.dump(results, f)

### Execute

In [6]:
users_directory = 'c:/Users/'
f_drive = 'f:/'
directory_to_travel = f_drive

In [7]:
results = multithreaded_analyze_directory(directory_to_travel)

In [18]:
results

[['file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',
  'file_modification_time',
  'file_access_time',
  'file_size_mb',
  'file_path',
  'file_type',
  'file_name',
  'file_extension',
  'file_creation_time',

In [7]:
results = benchmark_analyze_directory(directory_to_travel)

In [9]:
df = pd.DataFrame(results)
# get sum of size in GB
storage_sum_gigabytes = round(df['file_size_mb'].sum(), 4)
print(f"{storage_sum_gigabytes}MB in {directory_to_travel}")

KeyError: 'file_size_mb'

In [None]:
df.sort_values(by='file_size_mb', ascending=False).head()

Unnamed: 0,file_name,file_extension,file_creation_time,file_modification_time,file_access_time,file_size_mb,file_path,file_type
54047,pakchunk0-WindowsNoEditor.pak,.pak,Wed Mar 20 22:40:28 2024,Sat Mar 16 20:13:45 2024,Wed Mar 20 22:51:25 2024,27821.624948,f:/Steam\steamapps\common\Ready Or Not\ReadyOr...,unknown
54026,pakchunk0-WindowsNoEditor.pak,.pak,Wed Mar 20 22:40:28 2024,Sat Mar 16 20:13:45 2024,Wed Mar 20 22:51:25 2024,27821.624948,f:/Steam\steamapps\common\Ready Or Not\ReadyOr...,unknown
43160,DataPC_Resources.forge,.forge,Wed Mar 20 20:50:37 2024,Thu Dec 28 15:03:17 2023,Wed Mar 20 21:01:44 2024,23586.865152,f:/Steam\steamapps\common\Ghost Recon Breakpoi...,unknown
43223,DataPC_Resources.forge,.forge,Wed Mar 20 20:50:37 2024,Thu Dec 28 15:03:17 2023,Wed Mar 20 21:01:44 2024,23586.865152,f:/Steam\steamapps\common\Ghost Recon Breakpoi...,unknown
40045,GeoChunk2.minizip,.minizip,Wed Mar 20 20:50:05 2024,Wed Mar 20 20:50:05 2024,Wed Mar 20 20:50:05 2024,19360.718848,f:/Steam\steamapps\common\ForzaHorizon5\media\...,unknown
