# Codebase Verification Test Notebook

This notebook verifies the directory structure of the PPI-Inhibitors codebase and checks its notebook, file, and cell counts against the expected values.

In [None]:
import json
import os
from pathlib import Path
import pandas as pd

# Repository root. The default assumes the notebook is executed from the
# top of the checkout; point this elsewhere when running from another
# working directory.
base_path = Path('.')
code_path = base_path.joinpath('code')

# Report banner: a rule, the title, and a closing rule, one per line.
print("=" * 80, "PPI-INHIBITORS CODEBASE VERIFICATION", "=" * 80, sep="\n")

# 1. Count all notebooks in repository
# Anything that lives under a hidden (dot-prefixed) directory, or is itself a
# dot-file, is skipped. Only the path components *below* base_path are
# inspected: checking the full path would treat every notebook as hidden
# whenever base_path itself contains a dot-prefixed component (for example
# '..' or a checkout located under a dot-directory).
all_notebooks = sorted(
    nb for nb in base_path.rglob('*.ipynb')
    if not any(part.startswith('.') for part in nb.relative_to(base_path).parts)
)
print("\n1. NOTEBOOK COUNT")
print(f"   Total notebooks in repository: {len(all_notebooks)}")
for nb in all_notebooks:
    print(f"      - {nb}")

# 2. Count files in /code directory
# Only regular files are counted and listed. Path.glob('*') also yields
# sub-directories (including dot-directories such as .ipynb_checkpoints),
# which would inflate the "Total files" figure and be reported with a
# directory-entry size rather than a file size.
code_files = sorted(entry for entry in code_path.glob('*') if entry.is_file())
code_notebooks = list(code_path.glob('*.ipynb'))
print("\n2. /CODE DIRECTORY ANALYSIS")
print(f"   Total files in /code: {len(code_files)}")
print(f"   Notebooks in /code: {len(code_notebooks)}")
for entry in code_files:
    size = entry.stat().st_size / 1024  # bytes -> KB
    print(f"      - {entry.name:<80} ({size:>8.2f} KB)")

# 3. Analyze main GNN notebook
# Loads the main training notebook as JSON and reports how many cells it has,
# split by cell type, followed by a one-line summary of every code cell.
main_notebook_path = code_path / 'GNN_based_pipeline_Training_for_Predicting_small_molecule_inhibition_of_protein_complexes_ipynb.ipynb'
print("\n3. MAIN GNN NOTEBOOK ANALYSIS")
print(f"   File: {main_notebook_path.name}")

if main_notebook_path.exists():
    # Notebook files are UTF-8 encoded JSON. Pass the encoding explicitly so
    # the read does not depend on the platform's default text encoding.
    with open(main_notebook_path, 'r', encoding='utf-8') as f:
        notebook_data = json.load(f)

    # total_cells is read again by the summary section at the end of the cell.
    total_cells = len(notebook_data['cells'])
    code_cells = [cell for cell in notebook_data['cells'] if cell['cell_type'] == 'code']
    markdown_cells = [cell for cell in notebook_data['cells'] if cell['cell_type'] == 'markdown']

    print(f"   Total cells: {total_cells}")
    print(f"   Code cells: {len(code_cells)}")
    print(f"   Markdown cells: {len(markdown_cells)}")

    # Analyze code cells
    print("\n   Code Cell Details:")
    for i, cell in enumerate(code_cells):
        # A cell's source may be stored as a list of line strings or as a
        # single string; normalise to one string before measuring it.
        source = ''.join(cell['source']) if isinstance(cell['source'], list) else cell['source']
        lines = len(source.split('\n'))
        # Single-line preview: newlines flattened to spaces, at most 60 chars.
        preview = source[:100].replace('\n', ' ')[:60]
        print(f"      Cell {i}: {lines} lines - {preview}...")
else:
    print("   ERROR: Notebook not found!")

# 4. Directory structure
# Collect the names of the non-hidden directories directly under base_path,
# then report each one with the number of entries it contains.
print("\n4. TOP-LEVEL DIRECTORY STRUCTURE")
dirs = []
for entry_name in os.listdir(base_path):
    if entry_name.startswith('.'):
        continue  # hidden entries are not reported
    if os.path.isdir(base_path / entry_name):
        dirs.append(entry_name)
dirs.sort()
print(f"   Total directories: {len(dirs)}")
for d in dirs:
    # Direct children only (files and sub-directories alike).
    file_count = sum(1 for _ in (base_path / d).glob('*'))
    print(f"      - {d:<40} ({file_count} items)")

# 5. Other notebooks analysis
# Prints the cell count of every notebook in /code except the main GNN
# notebook, which has its own section in this report.
print("\n5. OTHER NOTEBOOKS IN /CODE")
for nb_path in sorted(code_notebooks):
    if nb_path == main_notebook_path:
        continue
    # Notebook files are UTF-8 encoded JSON. Pass the encoding explicitly so
    # the read does not depend on the platform's default text encoding.
    with open(nb_path, 'r', encoding='utf-8') as f:
        nb_data = json.load(f)
    cells = len(nb_data['cells'])
    print(f"   - {nb_path.name}")
    print(f"     Cells: {cells}")
# 6. Summary statistics
# Recap of the headline counts gathered by the sections above.
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

# total_cells is only assigned when the main notebook was found, so fall
# back to a placeholder when the file is absent.
if main_notebook_path.exists():
    main_cells_display = total_cells
else:
    main_cells_display = 'N/A'

print(f"Total Notebooks: {len(all_notebooks)}")
print(f"Notebooks in /code: {len(code_notebooks)}")
print(f"Main GNN notebook cells: {main_cells_display}")
print(f"Top-level directories: {len(dirs)}")
print("=" * 80)

## Expected Results

Based on the codebase analysis, the expected counts are:

- **Total notebooks in repository:** 7
- **Notebooks in /code directory:** 4
- **Main GNN notebook total cells:** 10
  - Code cells: 7
  - Markdown cells: 3
- **Top-level directories:** 8
  - Data
  - Features
  - Final Results
  - GNN-pipeline
  - GNNbasedTrainedModels
  - GearNet Only
  - GearNet and GNN Trained models
  - code

In [None]:
# Additional verification: confirm that a fixed list of key files is present.
print("FILE EXISTENCE CHECK")
print("=" * 80)

# Paths are relative to base_path.
important_files = [
    'code/GNN_based_pipeline_Training_for_Predicting_small_molecule_inhibition_of_protein_complexes_ipynb.ipynb',
    'code/GearNet Embedding.ipynb',
    'code/seqfeaturesand_gnn_generate_prediction_gnn_with_binders_and_random_both_as_negative_ipynb.ipynb',
    'code/svmreadfromfile_generate_prediction_binders_and_random_both_as_negative.ipynb',
    'README.md',
    'research_paper.pdf',
    'Data/WriteAllexamplesRandomBindersIdsAll_24JAN_Binary.txt',
]

# One status line per file.
for rel_path in important_files:
    if (base_path / rel_path).exists():
        label = "✓ EXISTS"
    else:
        label = "✗ MISSING"
    print(f"{label}: {rel_path}")