In [5]:
import sys, os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import subprocess as sp
import tables as tb
import h5py as h5
from Bio import PDB
from Bio.PDB import PDBIO
from collections import Counter

ModuleNotFoundError: No module named 'tensorflow'

In [16]:
# upside_path = os.environ['UPSIDE_HOME']
upside_path = "/home/okleinmann/projects/upside2-md-prod/" # WHY DOES MY IPY ENV NOT UPDATE?
print( "Using upside path:", upside_path)
upside_utils_dir = os.path.expanduser(upside_path+"/py")
sys.path.insert(0, upside_utils_dir)
import tensorflow_upside as tu
data_dir = '/home/okleinmann/data/upside_runs/condiv_demo' # CHANGE THIS TO YOUR DATA DIRECTORY
training_pdbs_dir = os.path.join(data_dir, 'pdbs')
force_fields_dir = os.path.join(data_dir, 'init_param')
input_dir = os.path.join(data_dir, 'input')

Using upside path: /home/okleinmann/projects/upside2-md-prod/


In [17]:
# determine n_restype
h5_file_path_string = f"{force_fields_dir}/sidechain.h5"
with h5.File(h5_file_path_string, 'r') as f:
    df=pd.DataFrame(f["bead_order"])
    print(df.head())
    print(df.shape)

          0
0  b'ALA_0'
1  b'ARG_0'
2  b'ASN_0'
3  b'ASP_0'
4  b'CYS_0'
(20, 1)


In [23]:
#AI SLOP FUNCTION DEFINTIONS (NOTHING GETS RUN)

def count_amino_acids(pdb_file_path):
    """
    Count the number of different amino acids in a PDB file.

    Args:
        pdb_file_path (str): Path to the PDB file

    Returns:
        dict: Dictionary with amino acid counts and summary statistics
    """
    # Initialize PDB parser
    parser = PDB.PDBParser(QUIET=True)

    try:
        # Parse the structure
        structure = parser.get_structure('protein', pdb_file_path)

        # List to store all amino acids
        amino_acids = []

        # Iterate through all residues in all chains
        for model in structure:
            for chain in model:
                for residue in chain:
                    # Check if it's a standard amino acid (not water, ions, etc.)
                    if PDB.is_aa(residue):
                        amino_acids.append(residue.get_resname())

        # Count amino acids
        aa_counts = Counter(amino_acids)

        # Create summary
        total_residues = len(amino_acids)
        unique_aa_types = len(aa_counts)

        result = {
            'amino_acid_counts': dict(aa_counts),
            'total_residues': total_residues,
            'unique_amino_acid_types': unique_aa_types,
            'amino_acid_list': amino_acids
        }

        return result

    except Exception as e:
        print(f"Error reading PDB file: {e}")
        return None

# Cell 4: Function to substitute amino acids and write new PDB
def substitute_amino_acid(input_pdb_path, output_pdb_path, target_aa, replacement_aa, chain_id=None):
    """
    Substitute one amino acid for another in a PDB file and write the result.

    Args:
        input_pdb_path (str): Path to input PDB file
        output_pdb_path (str): Path for output PDB file
        target_aa (str): Three-letter code of amino acid to replace (e.g., 'ALA')
        replacement_aa (str): Three-letter code of replacement amino acid (e.g., 'VAL')
        chain_id (str, optional): Specific chain to modify. If None, modifies all chains.

    Returns:
        int: Number of substitutions made
    """
    # Initialize parser and writer
    parser = PDB.PDBParser(QUIET=True)
    io = PDBIO()

    try:
        # Parse the structure
        structure = parser.get_structure('protein', input_pdb_path)

        substitutions_made = 0

        # Iterate through structure and make substitutions
        for model in structure:
            for chain in model:
                # Skip if specific chain requested and this isn't it
                if chain_id and chain.id != chain_id:
                    continue

                for residue in chain:
                    if PDB.is_aa(residue) and residue.get_resname() == target_aa:
                        # Change residue name
                        residue.resname = replacement_aa
                        substitutions_made += 1

        # Write the modified structure
        io.set_structure(structure)
        io.save(output_pdb_path)

        print(f"Successfully made {substitutions_made} substitutions of {target_aa} -> {replacement_aa}")
        print(f"Modified structure saved to: {output_pdb_path}")

        return substitutions_made

    except Exception as e:
        print(f"Error processing PDB file: {e}")
        return 0

# Cell 5: Helper function to display results nicely
def display_amino_acid_analysis(pdb_file_path):
    """
    Display a nice summary of amino acid analysis.

    Args:
        pdb_file_path (str): Path to the PDB file
    """
    result = count_amino_acids(pdb_file_path)

    if result:
        print(f"=== Amino Acid Analysis for {os.path.basename(pdb_file_path)} ===")
        print(f"Total residues: {result['total_residues']}")
        print(f"Unique amino acid types: {result['unique_amino_acid_types']}")
        print("\nAmino acid counts:")

        # Sort by count (descending)
        sorted_counts = sorted(result['amino_acid_counts'].items(),
                             key=lambda x: x[1], reverse=True)

        for aa, count in sorted_counts:
            percentage = (count / result['total_residues']) * 100
            print(f"  {aa}: {count:3d} ({percentage:5.1f}%)")

        return result
    else:
        print("Failed to analyze the PDB file.")
        return None

def validate_and_process_pdb_directory(directory_path, allowed_amino_acids=None, substitutions=None):
    """
    Validate all PDB files in a directory against allowed amino acids and optionally apply substitutions.

    Args:
        directory_path (str): Path to directory containing PDB files
        allowed_amino_acids (list, optional): List of allowed amino acid three-letter codes
                                            e.g., ['ALA', 'ARG', 'ASN', ...]
        substitutions (dict, optional): Dictionary of amino acid substitutions to apply
                                      Format: {'target_aa': 'replacement_aa', ...}

    Returns:
        dict: Comprehensive report of analysis and modifications
    """
    import glob
    from datetime import datetime

    # Find all PDB files in directory
    pdb_files = glob.glob(os.path.join(directory_path, "*.pdb"))

    if not pdb_files:
        print(f"No PDB files found in directory: {directory_path}")
        return None

    # Initialize report data
    report = {
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'directory': directory_path,
        'total_pdbs_found': len(pdb_files),
        'analyzed_pdbs': {},
        'problematic_pdbs': [],
        'substitutions_made': {},
        'validation_results': {},
        'disallowed_amino_acids_found': {},
        'summary': {}
    }

    print(f"Found {len(pdb_files)} PDB files in {directory_path}")
    print("Analyzing PDB files...\n")

    for pdb_file in pdb_files:
        pdb_name = os.path.basename(pdb_file).replace('.pdb', '')
        print(f"Processing: {pdb_name}")

        # Analyze amino acid composition
        analysis = count_amino_acids(pdb_file)

        if analysis:
            report['analyzed_pdbs'][pdb_name] = analysis

            # Validate against allowed amino acids if provided
            if allowed_amino_acids:
                validation_result = _validate_pdb_amino_acids(
                    pdb_name, analysis, allowed_amino_acids
                )
                report['validation_results'][pdb_name] = validation_result

                if not validation_result['is_valid']:
                    report['problematic_pdbs'].append({
                        'name': pdb_name,
                        'disallowed_amino_acids': validation_result['disallowed_amino_acids']
                    })

                    # Track disallowed amino acids across all files
                    for aa_info in validation_result['disallowed_amino_acids']:
                        aa = aa_info['amino_acid']
                        if aa not in report['disallowed_amino_acids_found']:
                            report['disallowed_amino_acids_found'][aa] = []
                        report['disallowed_amino_acids_found'][aa].append({
                            'pdb': pdb_name,
                            'count': aa_info['count'],
                            'residue_numbers': aa_info['residue_numbers']
                        })

            # Apply substitutions if provided
            if substitutions:
                substitution_results = _apply_substitutions_to_pdb(
                    pdb_file, substitutions, directory_path
                )
                if substitution_results['total_substitutions'] > 0:
                    report['substitutions_made'][pdb_name] = substitution_results
        else:
            print(f"  ❌ Failed to analyze {pdb_name}")
            report['problematic_pdbs'].append({
                'name': pdb_name,
                'disallowed_amino_acids': [{'error': 'Failed to parse PDB file'}]
            })

    # Generate summary
    _generate_validation_report(report)

    return report

def _validate_pdb_amino_acids(pdb_name, analysis, allowed_amino_acids):
    """
    Validate PDB amino acids against allowed list and track residue positions.
    """
    # Re-parse the PDB to get residue positions for disallowed amino acids
    parser = PDB.PDBParser(QUIET=True)

    validation_result = {
        'is_valid': True,
        'disallowed_amino_acids': []
    }

    actual_amino_acids = analysis['amino_acid_counts']

    # Find disallowed amino acids
    disallowed = {}
    for aa, count in actual_amino_acids.items():
        if aa not in allowed_amino_acids:
            disallowed[aa] = count
            validation_result['is_valid'] = False

    # If we found disallowed amino acids, get their positions
    if disallowed:
        # We need to re-parse to get residue positions
        # This is a bit inefficient but necessary for detailed reporting
        try:
            # Get the PDB file path from the analysis (we'll need to pass it)
            # For now, we'll just report the counts
            for aa, count in disallowed.items():
                validation_result['disallowed_amino_acids'].append({
                    'amino_acid': aa,
                    'count': count,
                    'residue_numbers': f"Found {count} instances"  # Simplified for now
                })
        except:
            for aa, count in disallowed.items():
                validation_result['disallowed_amino_acids'].append({
                    'amino_acid': aa,
                    'count': count,
                    'residue_numbers': f"Found {count} instances"
                })

    return validation_result

def _get_residue_positions(pdb_file_path, target_amino_acids):
    """
    Get specific residue positions for target amino acids.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file_path)

    positions = {aa: [] for aa in target_amino_acids}

    for model in structure:
        for chain in model:
            for residue in chain:
                if PDB.is_aa(residue):
                    resname = residue.get_resname()
                    if resname in target_amino_acids:
                        positions[resname].append({
                            'chain': chain.id,
                            'residue_number': residue.id[1],
                            'insertion_code': residue.id[2]
                        })

    return positions

def _generate_validation_report(report):
    """
    Generate and display a comprehensive validation report.
    """
    print("\n" + "="*70)
    print("🧬 PDB VALIDATION AND PROCESSING REPORT")
    print("="*70)
    print(f"📅 Generated: {report['timestamp']}")
    print(f"📁 Directory: {report['directory']}")
    print(f"📊 Total PDB files found: {report['total_pdbs_found']}")
    print(f"✅ Successfully analyzed: {len(report['analyzed_pdbs'])}")

    # Validation results
    if report['validation_results']:
        valid_count = sum(1 for result in report['validation_results'].values() if result['is_valid'])
        invalid_count = len(report['validation_results']) - valid_count

        print(f"✅ Valid PDBs (only allowed amino acids): {valid_count}")
        print(f"❌ Invalid PDBs (contain disallowed amino acids): {invalid_count}")

        if report['problematic_pdbs']:
            print(f"\n🚨 PDBs WITH DISALLOWED AMINO ACIDS:")
            for problem in report['problematic_pdbs']:
                print(f"  • {problem['name']}:")
                for aa_info in problem['disallowed_amino_acids']:
                    if 'error' in aa_info:
                        print(f"    ❌ {aa_info['error']}")
                    else:
                        print(f"    - {aa_info['amino_acid']}: {aa_info['count']} occurrences")

    # Summary of all disallowed amino acids found
    if report['disallowed_amino_acids_found']:
        print(f"\n🔍 SUMMARY OF DISALLOWED AMINO ACIDS ACROSS ALL PDBs:")
        for aa, occurrences in report['disallowed_amino_acids_found'].items():
            total_count = sum(occ['count'] for occ in occurrences)
            pdb_list = [occ['pdb'] for occ in occurrences]
            print(f"  • {aa}: {total_count} total occurrences in {len(pdb_list)} PDB(s)")
            print(f"    Found in: {', '.join(pdb_list)}")

    # Substitution results
    if report['substitutions_made']:
        print(f"\n🔄 SUBSTITUTIONS PERFORMED:")
        for pdb_name, sub_data in report['substitutions_made'].items():
            print(f"  • {pdb_name}: {sub_data['total_substitutions']} total substitutions")
            for sub_type, count in sub_data['substitutions'].items():
                print(f"    - {sub_type}: {count} substitutions")
            print(f"    📁 Output: {os.path.basename(sub_data['output_file'])}")

    # Amino acid composition summary
    if report['analyzed_pdbs']:
        print(f"\n📈 AMINO ACID COMPOSITION SUMMARY:")
        all_aa_counts = {}
        for pdb_data in report['analyzed_pdbs'].values():
            for aa, count in pdb_data['amino_acid_counts'].items():
                all_aa_counts[aa] = all_aa_counts.get(aa, 0) + count

        total_residues = sum(all_aa_counts.values())
        sorted_aa = sorted(all_aa_counts.items(), key=lambda x: x[1], reverse=True)

        print(f"  Total residues across all PDBs: {total_residues}")
        print(f"  All amino acids found:")
        for aa, count in sorted_aa:
            percentage = (count / total_residues) * 100
            print(f"    {aa}: {count} ({percentage:.1f}%)")

    print("="*70)

def _apply_substitutions_to_pdb(pdb_file, substitutions, output_dir):
    """
    Apply amino acid substitutions to a PDB file.
    """
    pdb_name = os.path.basename(pdb_file).replace('.pdb', '')
    output_file = os.path.join(output_dir, f"{pdb_name}_modified.pdb")

    substitution_results = {
        'output_file': output_file,
        'substitutions': {},
        'total_substitutions': 0
    }

    # Apply each substitution
    current_file = pdb_file
    temp_files = []

    for target_aa, replacement_aa in substitutions.items():
        temp_output = os.path.join(output_dir, f"{pdb_name}_temp_{target_aa}_{replacement_aa}.pdb")

        num_subs = substitute_amino_acid(
            input_pdb_path=current_file,
            output_pdb_path=temp_output,
            target_aa=target_aa,
            replacement_aa=replacement_aa
        )

        if num_subs > 0:
            substitution_results['substitutions'][f"{target_aa}->{replacement_aa}"] = num_subs
            substitution_results['total_substitutions'] += num_subs
            current_file = temp_output
            temp_files.append(temp_output)

    # Move final result to output file and clean up temp files
    if substitution_results['total_substitutions'] > 0:
        if current_file != pdb_file:
            os.rename(current_file, output_file)
            temp_files.remove(current_file)

        # Clean up temporary files
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)

    return substitution_results



In [25]:
def example_batch_processing_with_list():
    """
    Example of how to use the batch PDB validation function with amino acid list
    """
    # Standard 20 amino acids
    resnames = ['ALA', 'ARG', 'ASN', 'ASP',
                'CYS', 'GLN', 'GLU', 'GLY',
                'HIS', 'ILE', 'LEU', 'LYS',
                'MET', 'PHE', 'PRO', 'SER',
                'THR', 'TRP', 'TYR', 'VAL']

    # Example directory path
    pdb_directory = training_pdbs_dir  # Replace with your directory

    # Example substitutions for disallowed amino acids
    amino_acid_substitutions = {
        "MSE": "MET",  # Selenomethionine to Methionine
        "SEC": "CYS",  # Selenocysteine to Cysteine
        "PYL": "LYS"   # Pyrrolysine to Lysine
    }

    # Run validation with amino acid list
    print("=== Validating PDBs against standard 20 amino acids ===")
    report1 = validate_and_process_pdb_directory(
        directory_path=pdb_directory,
        allowed_amino_acids=resnames,
        substitutions=amino_acid_substitutions
    )

    print("\n" + "="*50 + "\n")

    # Run analysis only (no validation)
    print("=== Analysis only (all amino acids allowed) ===")
    report2 = validate_and_process_pdb_directory(
        directory_path=pdb_directory
    )

    return report1, report2


example_batch_processing_with_list()


=== Validating PDBs against standard 20 amino acids ===
Found 60 PDB files in /home/okleinmann/data/upside_runs/condiv_demo/pdbs
Analyzing PDB files...

Processing: 2bn6_temp_SEC_CYS_temp_MSE_MET
Successfully made 0 substitutions of MSE -> MET
Modified structure saved to: /home/okleinmann/data/upside_runs/condiv_demo/pdbs/2bn6_temp_SEC_CYS_temp_MSE_MET_temp_MSE_MET.pdb
Successfully made 0 substitutions of SEC -> CYS
Modified structure saved to: /home/okleinmann/data/upside_runs/condiv_demo/pdbs/2bn6_temp_SEC_CYS_temp_MSE_MET_temp_SEC_CYS.pdb
Successfully made 0 substitutions of PYL -> LYS
Modified structure saved to: /home/okleinmann/data/upside_runs/condiv_demo/pdbs/2bn6_temp_SEC_CYS_temp_MSE_MET_temp_PYL_LYS.pdb
Processing: chig_temp_SEC_CYS_temp_MSE_MET
Successfully made 0 substitutions of MSE -> MET
Modified structure saved to: /home/okleinmann/data/upside_runs/condiv_demo/pdbs/chig_temp_SEC_CYS_temp_MSE_MET_temp_MSE_MET.pdb
Successfully made 0 substitutions of SEC -> CYS
Modified 

({'timestamp': '2025-06-25 07:10:03',
  'directory': '/home/okleinmann/data/upside_runs/condiv_demo/pdbs',
  'total_pdbs_found': 60,
  'analyzed_pdbs': {'2bn6_temp_SEC_CYS_temp_MSE_MET': {'amino_acid_counts': {'GLY': 2,
     'ALA': 5,
     'ASP': 1,
     'TYR': 3,
     'SER': 2,
     'GLN': 2,
     'TRP': 1,
     'GLU': 5,
     'ARG': 1,
     'VAL': 1,
     'LYS': 4,
     'ILE': 2,
     'THR': 1,
     'LEU': 1,
     'ASN': 2},
    'total_residues': 33,
    'unique_amino_acid_types': 15,
    'amino_acid_list': ['GLY',
     'ALA',
     'ASP',
     'TYR',
     'SER',
     'ALA',
     'GLN',
     'TRP',
     'ALA',
     'GLU',
     'TYR',
     'TYR',
     'ARG',
     'SER',
     'VAL',
     'GLY',
     'LYS',
     'ILE',
     'GLU',
     'GLU',
     'ALA',
     'GLU',
     'ALA',
     'ILE',
     'GLU',
     'LYS',
     'THR',
     'LEU',
     'LYS',
     'ASN',
     'LYS',
     'GLN',
     'ASN']},
   'chig_temp_SEC_CYS_temp_MSE_MET': {'amino_acid_counts': {'GLY': 3,
     'TYR': 1,
     '

In [19]:
def prepare_training_data():
    # convert pdbs to initial structure format & save list of protein names
    global protein_names
    protein_names = ['prot']
    output_txt_path = "pdb_list"  # Change path if you want elsewhere

    with open(output_txt_path, 'w') as out_file:
        for f in os.listdir(training_pdbs_dir):
            if os.path.isfile(os.path.join(training_pdbs_dir, f)):
                protein_name = f.split('.')[0]
                protein_names.append(protein_name)
                out_file.write(protein_name + '\n')  # Write each name to the file
                cmd = (
                    "python3 {0}/PDB_to_initial_structure.py "
                    "{1}/{2}.pdb "
                    "{3}/{2} "
                    "--record-chain-breaks "
                ).format(upside_utils_dir, training_pdbs_dir, protein_name, input_dir)
                print(cmd)
                sp.check_output(cmd.split())
    return protein_names

def save_list_to_txt(filename, items):
    """
    Save a list of items to a text file, each item on a new line.

    Args:
        filename (str): The name of the file to save.
        items (list): The list of items to save.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for item in items:
            f.write(str(item) + '\n')

save_list_to_txt("pdb_list",prepare_training_data())

python3 /home/okleinmann/projects/upside2-md-prod//py/PDB_to_initial_structure.py /home/okleinmann/data/upside_runs/condiv_demo/pdbs/2bn6_temp_MSE_MET.pdb /home/okleinmann/data/upside_runs/condiv_demo/input/2bn6_temp_MSE_MET --record-chain-breaks 


Traceback (most recent call last):
  File "/home/okleinmann/projects/upside2-md-prod//py/PDB_to_initial_structure.py", line 3, in <module>
    import prody
ModuleNotFoundError: No module named 'prody'


CalledProcessError: Command '['python3', '/home/okleinmann/projects/upside2-md-prod//py/PDB_to_initial_structure.py', '/home/okleinmann/data/upside_runs/condiv_demo/pdbs/2bn6_temp_MSE_MET.pdb', '/home/okleinmann/data/upside_runs/condiv_demo/input/2bn6_temp_MSE_MET', '--record-chain-breaks']' returned non-zero exit status 1.

In [None]:
cmd = mpi -n 4 