# 🧹 Clean Old Training Data

**Purpose:** Delete all old training outputs to start fresh

**What will be deleted:**
1. Patches: data/patches/train, val, test (plus data/patches/dataset_summary.txt)
2. Checkpoints: checkpoints/*.pth
3. Outputs: outputs/*
4. Logs: logs/*
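5. Figures: training figures only — the training_curves, confusion_matrices, and sample_predictions subdirectories of figures/, plus a fixed list of model/map *.png files (exploration figures are kept)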

**⚠️ WARNING:** This operation cannot be undone!

**When to use:**
- Before retraining with different channel configurations
- After changing data preprocessing
- To free up disk space

## 1. Import Libraries

In [1]:
import shutil
from pathlib import Path

print('Libraries imported successfully!')
# Every later cell uses '../'-relative paths, so show the directory they
# will be resolved against.
working_dir = Path.cwd()
print(f'Working directory: {working_dir}')

Libraries imported successfully!
Working directory: d:\HaiDang\25-26_HKI_DATN_21021411_DangNH\notebooks


## 2. Check Current Status

In [2]:
def get_dir_size(path):
    """Return the combined size of every regular file below *path*, in MiB.

    The tree is walked recursively; directories themselves contribute
    nothing. A path that does not exist yields 0.0.
    """
    bytes_per_mib = 1024**2
    if not path.exists():
        return 0 / bytes_per_mib
    size_bytes = sum(
        entry.stat().st_size
        for entry in path.rglob('*')
        if entry.is_file()
    )
    return size_bytes / bytes_per_mib

def count_files(path, pattern='*'):
    """Count the regular files below *path* whose names match *pattern*.

    The search is recursive (``Path.rglob``). Directories that match the
    pattern are skipped: with the default ``'*'`` pattern ``rglob`` also
    yields every sub-directory, which would otherwise inflate a number
    that callers report as a file count.

    Args:
        path: Directory to search, as a ``pathlib.Path``.
        pattern: Glob pattern applied at every depth.

    Returns:
        The number of matching files, or 0 when *path* does not exist.
    """
    if not path.exists():
        return 0
    return sum(1 for entry in path.rglob(pattern) if entry.is_file())

# Report, per artifact location, how many files it holds and how much disk
# space it uses. Nothing is deleted in this cell.
banner = '=' * 80
print(banner)
print('CURRENT STATUS')
print(banner)
print()

patches_dir = Path('../data/patches')
print('1. Patches:')
for split in ('train', 'val', 'test'):
    split_dir = patches_dir / split
    if not split_dir.exists():
        print(f'   {split}: Not found')
        continue
    # Only top-level .npy files are counted; the size covers the whole split.
    n_files = len(list(split_dir.glob('*.npy')))
    size_mb = get_dir_size(split_dir)
    print(f'   {split}: {n_files} files, {size_mb:.1f} MB')
print()

# The count is limited to *.pth files, while the size covers everything
# under the checkpoints directory.
checkpoints_dir = Path('../checkpoints')
n_checkpoints = count_files(checkpoints_dir, '*.pth')
checkpoint_size = get_dir_size(checkpoints_dir)
print(f'2. Checkpoints: {n_checkpoints} files, {checkpoint_size:.1f} MB', end='\n\n')

outputs_dir = Path('../outputs')
n_outputs = count_files(outputs_dir)
output_size = get_dir_size(outputs_dir)
print(f'3. Outputs: {n_outputs} files, {output_size:.1f} MB', end='\n\n')

logs_dir = Path('../logs')
n_logs = count_files(logs_dir)
log_size = get_dir_size(logs_dir)
print(f'4. Logs: {n_logs} files, {log_size:.1f} MB', end='\n\n')

# As with checkpoints: the count is *.png only, the size is the whole tree.
figures_dir = Path('../figures')
n_figures = count_files(figures_dir, '*.png')
figure_size = get_dir_size(figures_dir)
print(f'5. Figures: {n_figures} files, {figure_size:.1f} MB', end='\n\n')

# The patches total is measured on the parent directory, so it includes
# anything stored next to the train/val/test splits.
patches_size = get_dir_size(patches_dir)
total_size = patches_size + checkpoint_size + output_size + log_size + figure_size
print(f'Total size: {total_size:.1f} MB')
print(banner)

CURRENT STATUS

1. Patches:
   train: Not found
   val: Not found
   test: Not found

2. Checkpoints: 0 files, 0.3 MB

3. Outputs: 0 files, 0.0 MB

4. Logs: 0 files, 0.0 MB

5. Figures: 8 files, 5.8 MB

Total size: 6.1 MB


## 3. Delete Patches

In [3]:
# Remove the train/val/test patch directories and the dataset summary file.
print('Deleting patches...')
patches_dir = Path('../data/patches')

deleted_count = 0
for split in ('train', 'val', 'test'):
    split_dir = patches_dir / split
    if not split_dir.exists():
        print(f'  {split}: Not found')
        continue
    # Count the .npy patches first: after rmtree there is nothing to count.
    n_files = sum(1 for _ in split_dir.glob('*.npy'))
    shutil.rmtree(split_dir)
    print(f'  Deleted {split}: {n_files} files')
    deleted_count += n_files

# The summary file sits beside the split directories, so it is removed
# separately.
summary_file = patches_dir / 'dataset_summary.txt'
if summary_file.exists():
    summary_file.unlink()
    print('  Deleted summary file')

print(f'\nTotal patches deleted: {deleted_count}')

Deleting patches...
  train: Not found
  val: Not found
  test: Not found

Total patches deleted: 0


## 4. Delete Checkpoints

In [4]:
# Delete every *.pth file directly inside the checkpoints directory
# (non-recursive); other files in that directory are left alone.
print('Deleting checkpoints...')
checkpoints_dir = Path('../checkpoints')

bytes_per_mb = 1024**2
deleted_count = 0
for checkpoint in checkpoints_dir.glob('*.pth'):
    # Read the size before unlinking so it can still be reported.
    size_mb = checkpoint.stat().st_size / bytes_per_mb
    checkpoint.unlink()
    print(f'  Deleted {checkpoint.name} ({size_mb:.1f} MB)')
    deleted_count += 1

if deleted_count:
    print(f'\nTotal checkpoints deleted: {deleted_count}')
else:
    print('  No checkpoints found')

Deleting checkpoints...
  No checkpoints found


## 5. Delete Outputs

In [5]:
# Empty the outputs directory by removing the whole tree and recreating it.
print('Deleting outputs...')
outputs_dir = Path('../outputs')

if not outputs_dir.exists():
    print('  Outputs directory not found')
else:
    # Take the count before removal; it is only used for the report line.
    n_files = count_files(outputs_dir)
    shutil.rmtree(outputs_dir)
    # Leave an empty directory in place of the removed tree.
    outputs_dir.mkdir()
    print(f'  Deleted outputs: {n_files} files')

Deleting outputs...
  Deleted outputs: 0 files


## 6. Delete Logs

In [6]:
# Empty the logs directory by removing the whole tree and recreating it.
print('Deleting logs...')
logs_dir = Path('../logs')

if not logs_dir.exists():
    print('  Logs directory not found')
else:
    # Take the count before removal; it is only used for the report line.
    n_files = count_files(logs_dir)
    shutil.rmtree(logs_dir)
    # Leave an empty directory in place of the removed tree.
    logs_dir.mkdir()
    print(f'  Deleted logs: {n_files} files')

Deleting logs...
  Deleted logs: 0 files


## 7. Delete Training Figures

In [7]:
# Remove figures produced by training/evaluation: three known
# sub-directories plus a fixed list of PNG files. Any other file in
# figures/ is left in place.
print('Deleting training figures...')
figures_dir = Path('../figures')

deleted_count = 0

subdirs = ['training_curves', 'confusion_matrices', 'sample_predictions']
for subdir in subdirs:
    subdir_path = figures_dir / subdir
    if not subdir_path.exists():
        continue
    # Only top-level PNGs are counted, but the entire sub-directory goes.
    n_files = sum(1 for _ in subdir_path.glob('*.png'))
    shutil.rmtree(subdir_path)
    print(f'  Deleted {subdir}: {n_files} files')
    deleted_count += n_files

files_to_delete = [
    'roc_curves_all_models.png',
    'model_agreement_analysis.png',
    'spatial_distribution_grid.png',
    'probability_distribution.png',
    'full_probability_map.png',
    'full_binary_map.png',
    'full_map_visualization.png',
    'comparison_prob_vs_binary.png',
    'regional_analysis_zoomed.png',
]

for filename in files_to_delete:
    filepath = figures_dir / filename
    if not filepath.exists():
        continue
    filepath.unlink()
    print(f'  Deleted {filename}')
    deleted_count += 1

if deleted_count:
    print(f'\nTotal figures deleted: {deleted_count}')
else:
    print('  No training figures found')

print('\nNote: Exploration figures (from notebooks 01, 02) are kept.')

Deleting training figures...
  No training figures found

Note: Exploration figures (from notebooks 01, 02) are kept.


## 8. Final Status

In [8]:
# Summarize what is still on disk once the cleanup cells have run, then
# list the notebooks to run next.
banner = '=' * 80

print()
print(banner)
print('CLEANUP COMPLETED')
print(banner)
print()
print('Remaining data:')

patches_dir = Path('../data/patches')
checkpoints_dir = Path('../checkpoints')
outputs_dir = Path('../outputs')
logs_dir = Path('../logs')
figures_dir = Path('../figures')

for split in ('train', 'val', 'test'):
    split_dir = patches_dir / split
    if split_dir.exists():
        status = 'Exists'
    else:
        status = 'Deleted'
    print(f'  Patches/{split}: {status}')

n_checkpoints = count_files(checkpoints_dir, '*.pth')
print(f'  Checkpoints: {n_checkpoints} files')

n_outputs = count_files(outputs_dir)
print(f'  Outputs: {n_outputs} files')

n_logs = count_files(logs_dir)
print(f'  Logs: {n_logs} files')

# Counts every *.png still under figures/, at any depth.
n_figures = count_files(figures_dir, '*.png')
print(f'  Figures: {n_figures} files (exploration only)')

print()
print(banner)
print('Ready to retrain!')
print(banner)
print()
print('Next steps:')
for step in (
    '  1. Run notebook 02: Create patches',
    '  2. Run notebook 03: Train models',
    '  3. Run notebook 04: Evaluate results',
):
    print(step)


CLEANUP COMPLETED

Remaining data:
  Patches/train: Deleted
  Patches/val: Deleted
  Patches/test: Deleted
  Checkpoints: 0 files
  Outputs: 0 files
  Logs: 0 files
  Figures: 8 files (exploration only)

Ready to retrain!

Next steps:
  1. Run notebook 02: Create patches
  2. Run notebook 03: Train models
  3. Run notebook 04: Evaluate results
