In [70]:
import json
from collections import Counter
import random

In [71]:
# Load the rmlint JSON report. JSON interchange text is UTF-8 by specification,
# so the encoding is passed explicitly instead of relying on the platform's
# default locale encoding (which open() would otherwise use).
with open('rmlint.json', encoding='utf-8') as f:
    data = json.load(f)

In [72]:
# Survey the structure of the loaded report: its container type, its length,
# and how often each key occurs across the dict records.
print(f"Type of loaded data: {type(data)}")

unique_keys_counter = Counter()

if isinstance(data, list):
    print(f"Number of elements in the data: {len(data)}")
    # The first and last elements are excluded from the survey; only dict
    # entries in between contribute their keys.
    inner_records = (entry for entry in data[1:-1] if isinstance(entry, dict))
    for record in inner_records:
        # Passing the keys view (an iterable, not a mapping) adds one per key.
        unique_keys_counter.update(record.keys())

for key_name, occurrences in unique_keys_counter.items():
    print(f"Key: {key_name}, Count: {occurrences}")

Type of loaded data: <class 'list'>
Number of elements in the data: 37530
Key: id, Count: 37528
Key: type, Count: 37528
Key: progress, Count: 37528
Key: path, Count: 37528
Key: size, Count: 37528
Key: depth, Count: 37528
Key: inode, Count: 37528
Key: disk_id, Count: 37528
Key: is_original, Count: 37528
Key: mtime, Count: 37528
Key: checksum, Count: 34350
Key: parent_path, Count: 132
Key: n_children, Count: 30
Key: twins, Count: 34218


In [73]:
# First element of the dump: the run header (description, cwd, args, version, ...).
data[0]

{'description': 'rmlint json-dump of lint files',
 'cwd': '/volume1/homes/Nod0n/Drive/rmlint/',
 'args': 'rmlint -gkmD /volume1/Gondor/ // /volume1/photo /volume1/homes/ /volume1/Bruchtal/',
 'version': '2.10.2',
 'rev': 'unknown',
 'progress': 0,
 'checksum_type': 'blake2b',
 'merge_directories': True}

In [74]:
# Last element of the dump: the run summary (file totals, duplicate counts, lint size).
data[-1]

{'aborted': False,
 'progress': 100,
 'total_files': 2086093,
 'ignored_files': 0,
 'ignored_folders': 0,
 'duplicates': 14382,
 'duplicate_sets': 13938,
 'total_lint_size': 23827012244}

In [75]:
# One regular lint record, shown as a reference for the per-item fields.
data[2]

{'id': 1542826447,
 'type': 'emptydir',
 'progress': 100,
 'path': '/volume1/Gondor/MSI Aaron/Archiv/Windows Backup MSI old1/Data/C/Windows/System32/ras',
 'size': 0,
 'depth': 8,
 'inode': 620801,
 'disk_id': 41,
 'is_original': False,
 'mtime': 1634443996.1048765}

In [76]:
# Collect every dict record whose 'type' is 'emptydir', then show how many there are.
empty_dirs = [
    record
    for record in data
    if isinstance(record, dict) and record.get('type') == 'emptydir'
]
len(empty_dirs)

1077

In [77]:
def random_sample(list_: list, size: int = 10, seed: int = 0) -> list:
    """Return a reproducible random sample of ``list_``.

    A fresh ``random.Random(seed)`` is created on every call, so the same
    input always yields the same sample and the global random state is left
    untouched.

    Args:
        list_: Population to sample from.
        size: Desired number of elements. If the population is smaller than
            this, the whole population is returned (in sampled order) instead
            of raising ``ValueError``.
        seed: Seed for the private random generator (defaults to 0).

    Returns:
        A new list with ``min(size, len(list_))`` elements drawn without
        replacement.

    Raises:
        ValueError: If ``size`` is negative.
    """
    # Clamp so short lists (fewer than `size` entries) can still be previewed.
    sample_size = min(size, len(list_))
    return random.Random(seed).sample(list_, sample_size)

# Preview three of the empty-directory records (fixed seed, so the pick is repeatable).
random_sample(empty_dirs, 3)

[{'id': 3867650435,
  'type': 'emptydir',
  'progress': 100,
  'path': '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/Users/miz/AppData/LocalLow/Adobe/Linguistics',
  'size': 0,
  'depth': 12,
  'inode': 1951553,
  'disk_id': 41,
  'is_original': False,
  'mtime': 1540097046.7086701},
 {'id': 2741730464,
  'type': 'emptydir',
  'progress': 100,
  'path': '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/Users/miz/AppData/Local/Packages/Microsoft.MicrosoftEdge_8wekyb3d8bbwe/AC/#!121/MicrosoftEdge/IECompatCache',
  'size': 0,
  'depth': 16,
  'inode': 2085888,
  'disk_id': 41,
  'is_original': False,
  'mtime': 1626988323.771443},
 {'id': 4289465458,
  'type': 'emptydir',
  'progress': 100,
  'path': '/volume1/Gondor/MSI Aaron/Archiv/Windows Backup MSI old1/Data/C/Users/Aaron Bruelisauer/AppData/Local/MicrosoftEdge/SharedCacheContainers/MicrosoftEdge_EmieSiteList',
  'size': 0,
  'depth': 12,
  'inode': 30921,
  'disk_id': 41,
  'is_original': False,
  'mti

In [78]:
def path_is_trash(path: str) -> bool:
    """Tell whether ``path`` looks like it points into a trash/recycle location.

    The check is a case-insensitive substring match on the whole path: it is
    true when the lowercased path contains ``'trash'`` or ``'#recycle'``
    anywhere, so names that merely contain "trash" match as well.
    """
    lowered = path.lower()
    return any(marker in lowered for marker in ('trash', '#recycle'))
def is_in_trash(item_: dict) -> bool:
    """Tell whether a record has a string 'path' that ``path_is_trash`` accepts.

    Anything that is not a dict, or whose 'path' is missing or not a string,
    is reported as not being in the trash.
    """
    if not isinstance(item_, dict):
        return False
    location = item_.get('path')
    if not isinstance(location, str):
        return False
    return path_is_trash(location)

# Records whose path is classified as trash by is_in_trash, and their count.
trash_items = list(filter(is_in_trash, data))
len(trash_items)

14866

In [79]:
# Reduce the trash records to their paths and preview a repeatable sample.
trash_paths = [entry.get('path') for entry in trash_items]
random_sample(trash_paths)

['/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.68/f1790557016.xml',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.84/f1795995712.txt',
 '/volume1/homes/#recycle/aaron/recover_benj/2/recup_dir.81/f1794027400.evtx',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.64/f1790422424.txt',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.93/f1797584192.png',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.56/f1790291128.png',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.105/f1798014256.png',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.132/f1806641304.txt',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.85/f1796016192.txt',
 '/volume1/homes/#recycle/aaron/recover_benj/1/recup_dir.37/f1788571816.java']

In [80]:
# Complement of the trash selection. Entries without a string 'path' are not
# classified as trash by is_in_trash, so they remain in this list too.
non_trash_items = [entry for entry in data if not is_in_trash(entry)]
len(non_trash_items)

22664

In [81]:
# Paths of the non-trash entries (None where an entry has no 'path'), sampled for a quick look.
non_trash_paths = [entry.get('path') for entry in non_trash_items]
random_sample(non_trash_paths)

['/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/Program Files/WindowsApps/king.com.ParadiseBay_3.7.0.0_x86__kgqvnymyfvs32/bundle/data/holidayShop_I45 (2018_10_21 00_52_38 UTC).png',
 '/volume1/Bruchtal/old_bruchtal/Photos/2017/2017-10-16/20170726_161833_021.jpg',
 '/volume1/Gondor/Archiv/Miz HTC/.data/CacheManager/temp/-1970032149_1135208744_1405499188071',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/$OF/180781/3235 (2018_11_04 13_55_39 UTC).png',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/Program Files/Windows Defender/AmMonitoringInstall (2018_10_21 00_52_38 UTC).mof',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/Program Files/WindowsApps/king.com.ParadiseBay_3.7.0.0_x86__kgqvnymyfvs32/bundle/data/prometheusWorldPosition_Trail (2018_10_21 00_52_38 UTC).hs',
 '/volume1/homes/mirjam/laptop_andy/bilder (1)/2014-10-20/IMAG1076.jpg',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/$OF/145782/14596

In [82]:
def is_in_eadir(item_: dict) -> bool:
    """Tell whether a record's path is an ``@eaDir`` directory or lies inside one.

    A path qualifies when it contains the component ``/@eaDir/`` or ends with
    ``/@eaDir`` (case-sensitive).

    Args:
        item_: A record from the report; expected to be a dict with a string
            'path', but other shapes are tolerated.

    Returns:
        True for paths at or below an ``@eaDir`` directory. False otherwise,
        including when ``item_`` is not a dict or its 'path' is missing or
        not a string (mirroring the guards used by the trash check instead of
        raising on such entries).
    """
    if not isinstance(item_, dict):
        return False
    path = item_.get('path')
    if not isinstance(path, str):
        return False
    return '/@eaDir/' in path or path.endswith('/@eaDir')

# Drop everything located at or below an @eaDir directory from the non-trash entries.
not_eadir_items = [entry for entry in non_trash_items if not is_in_eadir(entry)]

In [83]:
def summarise_types(items: list[dict]) -> dict:
    """Count how many of ``items`` carry each 'type' value.

    Entries without a 'type' key are counted under ``None``. The result is a
    plain dict whose keys appear in first-seen order.
    """
    tally: dict = {}
    for entry in items:
        kind = entry.get('type')
        tally[kind] = tally.get(kind, 0) + 1
    return tally

# Breakdown by 'type' of what is left after removing trash and @eaDir entries.
summarise_types(not_eadir_items)

{None: 2,
 'emptydir': 1034,
 'emptyfile': 1137,
 'part_of_directory': 105,
 'duplicate_dir': 3,
 'duplicate_file': 18863}

In [84]:
# Entries typed 'duplicate_dir' among the remaining items; list their paths in full.
duplicate_dirs = [
    entry
    for entry in not_eadir_items
    if entry.get('type') == 'duplicate_dir'
]
[entry.get('path') for entry in duplicate_dirs]

['/volume1/Gondor/Archiv/Festplatte 1TB Toschiba Miz/USB Copy_2019-04-01_203757/x201-20160723/restore/20160723/.mozilla/firefox/bc8e9dia.default/webapps',
 '/volume1/Gondor/Archiv/Festplatte 1TB Toschiba Miz/USB Copy_2019-04-01_203757/x201-20160723/restore/20160722/.mozilla/firefox/bc8e9dia.default/webapps',
 '/volume1/Gondor/Archiv/Festplatte 1TB Toschiba Miz/USB Copy_2019-04-01_203757/x201-20160723/.mozilla/firefox/bc8e9dia.default/webapps']

In [89]:
# Entries typed 'part_of_directory' among the remaining items, reported as a labelled count.
part_of_directory = [
    entry
    for entry in not_eadir_items
    if entry.get('type') == 'part_of_directory'
]
{'number of files in duplicate_dir folders': len(part_of_directory)}

{'number of files in duplicate_dir folders': 105}

In [90]:
# NOTE(review): `duplicate_files` is not defined in any cell visible here, and the
# execution counts jump from 84 to 89 — confirm its defining cell still exists,
# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).
duplicate_files_random_sample = random_sample(duplicate_files)
[item.get('path') for item in duplicate_files_random_sample]

['/volume1/Gondor/Archiv/Miz HTC/.data/CacheManager/temp/-143952354_1126798560_1410642206965',
 '/volume1/Gondor/Archiv/Miz HTC/.data/CacheManager/temp/1039756326_1114274808_1410637828167',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/$RECYCLE.BIN/S-1-5-21-2031780992-3694289565-779588206-1001/@eaDir/$I13HXLY (2019_01_22 16_14_32 UTC).jpg/SYNOPHOTO_THUMB_XL.fail',
 '/volume1/Gondor/Archiv/Note8/backups/USB/Android/data/com.viber.voip',
 '/volume1/Gondor/Archiv/Miz HTC/.data/CacheManager/temp/-819538873_1126488120_1406554760538',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/$RECYCLE.BIN/S-1-5-21-2031780992-3694289565-779588206-1001/@eaDir/$IGPE9X2 (2019_01_22 16_14_32 UTC).jpg/SYNOPHOTO_THUMB_B.fail',
 '/volume1/Gondor/ASUS Miz/Archiv/Windows Backup/MIZ-ASUS/Data/C/$RECYCLE.BIN/S-1-5-21-2031780992-3694289565-779588206-1001/@eaDir/$IGE5W5E (2019_01_22 16_14_32 UTC).jpg/SYNOPHOTO_THUMB_B.fail',
 '/volume1/Gondor/Archiv/Miz HTC/.data/CacheManager/temp