# Morris et al. Memorization Reproduction - Colab GPU

This notebook transfers the modular memorization pipeline to Google Colab so that it can run with GPU acceleration.

Original paper: "How much do language models memorize?" (Morris et al., 2025)

## 1. Environment Setup

In [None]:
# Check GPU availability
import torch
import os

# Report CUDA visibility first; device details are only queried when a GPU exists.
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("Running on CPU")
else:
    # Device index 0 is the one described below.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # total_memory is divided by 1e9 to print the figure in GB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Clone the reproduction repository, then change into its pipeline directory
# so that later cells can use paths relative to memorization_reproduction/.
# NOTE(review): both paths are relative to the current working directory, so
# re-running this cell after the %cd has taken effect would presumably attempt
# a second, nested clone — confirm before re-executing.
!git clone https://github.com/mllrjo/MorrisRepro.git
%cd MorrisRepro/memorization_reproduction

In [None]:
# Auto-detect dependencies from Python files
import re
import os
import sys

# Capture the dotted module path of column-0 ``import x.y`` and
# ``from x.y import z`` statements (``^`` matches per line via re.MULTILINE).
_IMPORT_PATTERNS = (
    re.compile(r'^import\s+([\w\.]+)', re.MULTILINE),
    re.compile(r'^from\s+([\w\.]+)\s+import', re.MULTILINE),
)

# Root names that must never be handed to pip: the project's own packages plus
# standard-library modules. ``sys.stdlib_module_names`` only exists on
# Python 3.10+, so a hand-written fallback list is kept for older interpreters.
_NON_INSTALLABLE = frozenset([
    'src', 'tests',
    'os', 'sys', 're', 'json', 'math', 'random', 'time', 'datetime',
    'collections', 'itertools', 'functools', 'pathlib',
    '__future__', 'abc', 'argparse', 'copy', 'csv', 'dataclasses', 'enum',
    'gc', 'glob', 'io', 'logging', 'pickle', 'shutil', 'subprocess',
    'tempfile', 'typing', 'unittest', 'warnings', 'zipfile',
]) | frozenset(getattr(sys, 'stdlib_module_names', ()))


def extract_imports(directory):
    """Return the sorted third-party root packages imported under *directory*.

    Every ``.py`` file below *directory* is scanned (``__pycache__`` folders
    are pruned) for top-level ``import``/``from ... import`` statements. Only
    the first component of each dotted module path is kept. Relative imports,
    the project's own ``src``/``tests`` packages and standard-library modules
    are left out because they are not pip-installable.

    Args:
        directory: Root directory to walk.

    Returns:
        Alphabetically sorted list of unique root package names. Empty when
        the directory does not exist or contains no matching imports.
    """
    imports = set()

    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into bytecode caches.
        dirs[:] = [d for d in dirs if d != '__pycache__']

        for file in files:
            if not file.endswith('.py'):
                continue

            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"Error reading {filepath}: {e}")
                continue

            for pattern in _IMPORT_PATTERNS:
                for match in pattern.findall(content):
                    root_package = match.split('.')[0]
                    # A relative import ("from . import x") yields an empty
                    # root; skip it rather than report a nameless package.
                    if root_package and root_package not in _NON_INSTALLABLE:
                        imports.add(root_package)

    return sorted(imports)

# Scan the working tree and list every import root that was detected
detected_imports = extract_imports('.')
print("Detected imports:")
for imp in detected_imports:
    print(f"  {imp}")

# Import name -> pip distribution name for common ML packages
package_mapping = dict(
    torch='torch',
    torchvision='torchvision',
    transformers='transformers',
    numpy='numpy',
    np='numpy',
    pandas='pandas',
    pd='pandas',
    matplotlib='matplotlib',
    plt='matplotlib',
    seaborn='seaborn',
    sklearn='scikit-learn',
    tqdm='tqdm',
    wandb='wandb',
    tensorboard='tensorboard',
)

# Translate each import to its pip name, falling back to the import name
# itself when no mapping exists; building a set drops duplicates.
install_packages = list({package_mapping.get(imp, imp) for imp in detected_imports})
print(f"\nPackages to install: {install_packages}")

In [None]:
# Install detected dependencies
import subprocess
import sys

def install_package(package):
    """Install *package* with pip under the running interpreter.

    Args:
        package: Requirement string passed to ``pip install``.

    Returns:
        True when pip exits successfully, False when it exits with a
        non-zero status (the failure is printed, not raised).
    """
    # Use this interpreter's own pip so the package lands in its environment.
    pip_command = [sys.executable, '-m', 'pip', 'install', package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")
        return False
    else:
        print(f"✓ Successfully installed {package}")
        return True

# Install each package, remembering the ones pip rejected so the outcome is
# reported instead of being silently discarded.
failed_packages = []
for package in install_packages:
    if not install_package(package):
        failed_packages.append(package)

if failed_packages:
    print(f"\n✗ {len(failed_packages)} package(s) could not be installed: {failed_packages}")

print("\nDependency installation complete.")

## 2. Pipeline Execution

In [None]:
# Verify repository structure: list up to 20 Python files found below the
# current directory (shell command, output is printed in the cell).
print("Repository structure:")
!find . -name "*.py" | head -20

# Confirm that the pipeline entry point and the two core modules are present
# relative to the current working directory.
print("\nKey files:")
key_files = ['test_scaled_pipeline.py', 'src/model_trainer.py', 'src/experiment_runner.py']
for file in key_files:
    print(f"✓ {file}" if os.path.exists(file) else f"✗ {file} not found")

In [None]:
# Execute the main pipeline, framed by banner lines so its output is easy to
# locate in the cell.
print("Starting Morris et al. memorization reproduction pipeline...")
print("=" * 60)

# Run the scaled pipeline test script through the shell.
# NOTE(review): the script's exit status is not inspected here, so the
# completion message below is printed whether or not the run succeeded.
!python test_scaled_pipeline.py

print("\n" + "=" * 60)
print("Pipeline execution complete.")

## 3. Results Analysis

In [None]:
# Display results structure: first the contents of results/ (if present),
# then generated files grouped by extension.
print("Results directory structure:")
if os.path.exists('results'):
    # Show at most 20 regular files below results/
    !find results -type f | head -20
else:
    print("No results directory found")

print("\nGenerated files:")
for ext in ['*.png', '*.txt', '*.json', '*.csv']:
    # Capture the shell output into `files`: up to 10 paths matching the
    # current pattern, excluding anything under __pycache__. {ext} is filled
    # in from the Python loop variable.
    files = !find . -name "{ext}" -not -path "*/__pycache__/*" | head -10
    if files:
        print(f"\n{ext} files:")
        for f in files:
            print(f"  {f}")

In [None]:
# Display generated plots
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import Image, display
import glob

# Collect every PNG below the working directory, ignoring bytecode cache folders
png_files = [
    path
    for path in glob.glob('**/*.png', recursive=True)
    if '__pycache__' not in path
]

print(f"Found {len(png_files)} plot files:")

# Render at most the first six plots inline, numbered from 1
for number, png_file in enumerate(png_files[:6], start=1):
    print(f"\n{number}. {png_file}")
    try:
        display(Image(png_file))
    except Exception as e:
        # Keep going: one unreadable image should not hide the others.
        print(f"Error displaying {png_file}: {e}")

In [None]:
# Display text reports
import glob

# Find report files: *report*.txt in the working directory plus every .txt
# file below results/. The order is whatever glob returns; no sorting by
# modification time is applied.
report_files = glob.glob('*report*.txt') + glob.glob('results/**/*.txt', recursive=True)

print(f"Found {len(report_files)} report files:")

for report_file in report_files[:3]:  # Show first 3 reports
    print(f"\n{'='*60}")
    print(f"Report: {report_file}")
    print('='*60)

    try:
        # Decode explicitly as UTF-8 rather than the locale default, and
        # substitute undecodable bytes so a single bad byte cannot hide the
        # whole report.
        with open(report_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError as e:
        print(f"Error reading {report_file}: {e}")
        continue

    # Show first 2000 characters
    if len(content) > 2000:
        print(content[:2000])
        print(f"\n... (truncated, full file has {len(content)} characters)")
    else:
        print(content)

## 4. Download Results

In [None]:
# Package results for download
import zipfile
import datetime

# Stamp the archive name with the current local date and time
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"morris_repro_results_{timestamp}.zip"

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Archive every plot, report and data file below the working directory,
    # skipping anything that lives in a bytecode cache folder.
    for ext in ['*.png', '*.txt', '*.json', '*.csv']:
        for file in glob.glob(f'**/{ext}', recursive=True):
            if '__pycache__' in file:
                continue
            zipf.write(file)
            print(f"Added to zip: {file}")

print(f"\nResults packaged in: {zip_filename}")
# Size is reported in MB (bytes / 1024^2)
print(f"File size: {os.path.getsize(zip_filename) / (1024*1024):.2f} MB")

# Hand the archive to the Colab download helper
from google.colab import files
files.download(zip_filename)

## 5. Execution Summary

In [None]:
# Execution summary
# NOTE(review): apart from the GPU status and the file counts, the check-marked
# lines below are static text; they do not verify that the step succeeded.
print("Morris et al. Memorization Reproduction - Execution Summary")
print("=" * 60)

print("✓ Repository cloned from: https://github.com/mllrjo/MorrisRepro.git")
print("✓ Dependencies auto-detected and installed")
print(f"✓ GPU status: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  - Device: {torch.cuda.get_device_name(0)}")
    print(f"  - Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print("✓ Pipeline executed: test_scaled_pipeline.py")

# Tally generated files per extension, ignoring bytecode cache folders
result_counts = {}
for ext in ['png', 'txt', 'json', 'csv']:
    files = [f for f in glob.glob(f'**/*.{ext}', recursive=True) if '__pycache__' not in f]
    result_counts[ext] = len(files)

print("✓ Results generated:")
for ext, count in result_counts.items():
    # Extensions with no files are left out of the summary
    if count:
        print(f"  - {count} {ext.upper()} files")

print("✓ Results packaged for download")

print("\nPipeline transfer to Colab GPU complete.")