This file is used to create a dataset of structures of known interactions.
Steps:
1. Generate dataset of ARMs and TFs
2. For all ARMs and TFs: run BLASTp against the PDB database
3. Create intersection of results

In [None]:
import pandas as pd
import os
from pathlib import Path
import subprocess
from typing import List

# --- BLAST invocation -------------------------------------------------------
# Executable that run_blastp launches.
BLAST_BIN: str = "blastp"
# Database handed to blastp through its -db argument.
BLAST_DB: str = "/home/markus/MPI_local/data/BLAST_PDB/pdbaa/pdbaa"
# Currently not passed to blastp: the searches run without an e-value limit
# and the results are filtered afterwards.
E_VAL_CUTOFF: str = "0.00001"

# --- Query FASTA directories ------------------------------------------------
TF_QUERY_DIR: str = "../../production1/tf_unrev_fasta"
ARM_QUERY_DIR: str = "../../production1/arm_all_uniprot_rev_fasta"

# --- BLAST result directories -----------------------------------------------
# Don't change these two; adjust the e-value limit and the other filters in
# the later filtering step instead.
TF_OUTPUT_DIR: str = "../../production1/blastp_results/tf_blastp_no_e_lim_new_fmt"
ARM_OUTPUT_DIR: str = "../../production1/blastp_results/arm_blastp_no_e_lim_new_fmt"

In [None]:
def run_blastp(fasta_path: str, output_path: str) -> None:
    """
    Run BLASTp for a single FASTA file against the PDB database.

    The search is started without an e-value limit; results are written in
    tabular format with comment lines (outfmt 7) using a fixed column set.

    A failing BLAST run is reported on stdout and does NOT raise, so that a
    batch over many queries can continue. Any output file the failed run left
    behind is removed, because blastp_for_dir skips queries whose output file
    already exists and would otherwise treat the failed query as finished.

    Args:
        fasta_path: Path to the input FASTA file
        output_path: Path where the BLAST output will be saved

    Raises:
        FileNotFoundError: If the FASTA file does not exist
    """
    if not os.path.exists(fasta_path):
        raise FileNotFoundError(f"No FASTA file found: {fasta_path}.")

    command: List[str] = [
        BLAST_BIN,
        "-query", fasta_path,
        "-db", BLAST_DB,
        # No "-evalue" argument on purpose (E_VAL_CUTOFF is unused here):
        # hits are filtered after the search instead.
        "-out", output_path,
        "-outfmt", "7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs qcovhsp qcovus",
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"BLAST error for {fasta_path}: {e}")
        # Drop whatever the failed run wrote so the query is retried on the
        # next batch run instead of being skipped as "already done".
        if os.path.isfile(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                print(f"Could not remove incomplete output {output_path}: {cleanup_error}")

In [None]:
from pathlib import Path
from typing import List
import os

def blastp_for_dir(input_dir: str, output_dir: str) -> None:
    """
    Run BLASTp for every ``*.fasta`` file found directly in a directory.

    The output for ``<name>.fasta`` is written to ``<name>_blastp.out`` inside
    the output directory, which is created if necessary. Queries whose output
    file already exists are skipped. Progress and a final summary are printed.

    Args:
        input_dir: Directory containing input FASTA files
        output_dir: Directory where BLAST output files will be saved
    """
    queries: List[Path] = list(Path(input_dir).glob('*.fasta'))
    total: int = len(queries)

    os.makedirs(output_dir, exist_ok=True)

    # Print progress about every tenth of the work; max() keeps the step at
    # least 1 when there are fewer than ten files.
    step: int = max(1, total // 10)
    already_done: int = 0

    for position, query in enumerate(queries):
        if position % step == 0:
            print(f"{position} of {total}")

        result_path: str = os.path.join(output_dir, f"{query.stem}_blastp.out")
        if os.path.exists(result_path):
            # Output from an earlier run is present: count it and move on.
            already_done += 1
        else:
            run_blastp(str(query), result_path)

    print(f"Found {total} proteins in {input_dir}.\nRan blastp on {total-already_done} proteins.\nFound existing blast file for {already_done} proteins.")

In [None]:
# BLAST all ARM queries against the PDB database. The TF queries are
# processed by their own call in the following cell.
blastp_for_dir(input_dir=ARM_QUERY_DIR, output_dir=ARM_OUTPUT_DIR)

In [None]:
# BLAST all TF queries against the PDB database.
blastp_for_dir(input_dir=TF_QUERY_DIR, output_dir=TF_OUTPUT_DIR)

## Filter BLAST Results

In [None]:
def clean_blastp_out(df: pd.DataFrame, identity_cutoff :int|bool=False, score_cutoff :int|bool=False, evalue_cutoff :float|bool = False, coverage_cutoff:float|bool=False) -> pd.DataFrame:
    """
    Clean and filter BLAST output DataFrame.
    
    Args:
        df: Raw BLAST output DataFrame
        
    Returns:
        Filtered and cleaned DataFrame with additional columns for PDB ID and chain
    """
    df['pdb_id'] = df['subject'].apply(lambda x: x.split('_')[0] if '_' in x else x)
    df['chain'] = df['subject'].apply(lambda x: x.split('_')[1] if len(x.split('_')) > 1 else '')
    df = df.drop_duplicates(subset='subject')
    df['%identity'] = pd.to_numeric(df['%identity'], errors='raise')
    df['bit score'] = pd.to_numeric(df['bit score'], errors='raise')
    df['evalue'] = pd.to_numeric(df['evalue'], errors='raise')
    df['% query coverage per subject'] = pd.to_numeric(df['% query coverage per subject'], errors='raise')
    if identity_cutoff:
        df = df[df['%identity'] >= identity_cutoff]
    if score_cutoff:
        df = df[df['bit score'] >= score_cutoff]
    if evalue_cutoff:
        df = df[df['evalue'] <= evalue_cutoff]
    if coverage_cutoff:
        df = df[df['% query coverage per subject'] >= coverage_cutoff]
    return df

def read_blast_to_df(output_dir: str, column_names: List[str]) -> pd.DataFrame:
    """
    Read BLAST output files from a directory and combine them into a single DataFrame.

    Every ``*_blastp.out`` file directly inside the directory is read. Lines
    starting with '#' and blank lines are skipped; each remaining line is
    split at tab characters into one row. Files are processed in sorted path
    order so that the row order of the result is reproducible. All values are
    returned as strings; no filtering or type conversion happens here.

    Args:
        output_dir: Directory containing BLAST output files
        column_names: List of column names for the DataFrame

    Returns:
        DataFrame with one row per hit line of all files. When no hit lines
        are found, a warning is printed and an empty DataFrame with the given
        columns is returned.
    """
    rows: List[List[str]] = []

    for output_file in sorted(Path(output_dir).glob('*_blastp.out')):
        try:
            with open(output_file, 'r') as f:
                file_rows: List[List[str]] = [
                    line.strip().split('\t')
                    for line in f
                    if line.strip() and not line.startswith('#')
                ]
        except IOError as e:
            # An unreadable file contributes no rows; the others are still used.
            print(f"Error reading file {output_file}: {e}")
            continue
        rows.extend(file_rows)

    if not rows:
        print(f"Warning: No data found in {output_dir}")
        return pd.DataFrame(columns=column_names)

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Cutoffs handed to clean_blastp_out. False leaves the respective filter off.
BLAST_IDENTITY_CUTOFF: int|bool = False
BLAST_SCORE_CUTOFF: int|bool = False
BLAST_EVALUE_CUTOFF: float|bool = 0.00001
# NOTE(review): this is compared against '% query coverage per subject'
# (blastp's qcovs field), which appears to be reported as a percentage.
# If so, 0.5 means 0.5 %, not 50 % -- confirm which was intended.
BLAST_COVERAGE_CUTOFF: float|bool = 0.5


# Column names for the parsed BLAST tables, one per field and in the same
# order as the fields requested from blastp.
columns: List[str] = [
    "query",
    "subject",
    "%identity",
    "alignment length",
    "mismatches",
    "gap opens",
    "q. start",
    "q. end",
    "s. start",
    "s. end",
    "evalue",
    "bit score",
    "% query coverage per subject",
    "% query coverage per hsp",
    "% query coverage per uniq subject",
]

# Load and filter the TF hits, then report how many rows remain.
tf_blast_df: pd.DataFrame = clean_blastp_out(
    df=read_blast_to_df(output_dir=TF_OUTPUT_DIR, column_names=columns),
    identity_cutoff=BLAST_IDENTITY_CUTOFF,
    score_cutoff=BLAST_SCORE_CUTOFF,
    evalue_cutoff=BLAST_EVALUE_CUTOFF,
    coverage_cutoff=BLAST_COVERAGE_CUTOFF,
)
print(len(tf_blast_df))

# Same for the ARM hits.
arm_blast_df: pd.DataFrame = clean_blastp_out(
    df=read_blast_to_df(output_dir=ARM_OUTPUT_DIR, column_names=columns),
    identity_cutoff=BLAST_IDENTITY_CUTOFF,
    score_cutoff=BLAST_SCORE_CUTOFF,
    evalue_cutoff=BLAST_EVALUE_CUTOFF,
    coverage_cutoff=BLAST_COVERAGE_CUTOFF,
)
print(len(arm_blast_df))

## Create dataset with candidate PDBs (new)

TODO: create filter for previously reviewed structures (PDB ID + chains)

In [None]:
import glob

# Columns that together identify one reviewed candidate.
review_key_columns = ['pdb_id', 'query_x', 'query_y', 'chain_x', 'chain_y']

# Sorted so that "first occurrence wins" in drop_duplicates is reproducible.
review_files = sorted(glob.glob('/home/markus/MPI_local/production1/structure_reviews/*.csv'))

if review_files:
    # Read the key columns as text so that IDs/chains that look like numbers
    # are not converted and still compare equal to the BLAST-derived strings.
    reviews_df = pd.concat(
        [pd.read_csv(f, dtype={column: str for column in review_key_columns}) for f in review_files],
        ignore_index=True,
    )
else:
    # pd.concat raises on an empty list; with no reviews yet, fall back to an
    # empty frame that still has the key columns.
    reviews_df = pd.DataFrame(columns=review_key_columns)

reviews_df = reviews_df.drop_duplicates(subset=review_key_columns)

In [None]:
# Pair every TF hit with every ARM hit on the same PDB entry. Both frames
# share their column names, so every column except 'pdb_id' gets the
# '_tf' / '_arm' suffix (e.g. 'chain_tf', 'chain_arm').
intersect_df: pd.DataFrame = tf_blast_df.merge(
    arm_blast_df,
    how='inner',
    on='pdb_id',
    suffixes=('_tf', '_arm')
)

# filter out candidates with same chains
# (the merged columns are 'chain_tf'/'chain_arm'; 'chain_x'/'chain_y' do not
# exist with the suffixes above)
intersect_df = intersect_df[intersect_df['chain_tf'] != intersect_df['chain_arm']]

print(f"Intersection contains {len(intersect_df)} structures")

# filter out candidates that were reviewed before
# The review tables use '_x'/'_y' column names. NOTE(review): this assumes
# '_x' refers to the TF side and '_y' to the ARM side (TF is the left frame
# of the merge above) -- confirm against the review files.
reviewed_keys = (
    reviews_df[['pdb_id', 'query_x', 'query_y', 'chain_x', 'chain_y']]
    .rename(columns={
        'query_x': 'query_tf',
        'query_y': 'query_arm',
        'chain_x': 'chain_tf',
        'chain_y': 'chain_arm',
    })
    .astype(str)         # compare as text, like the BLAST-derived columns
    .drop_duplicates()   # one row per key so the left merge cannot multiply rows
)
intersect_df = intersect_df.merge(
    reviewed_keys,
    on=['pdb_id', 'query_tf', 'query_arm', 'chain_tf', 'chain_arm'],
    how='left',
    indicator=True
)
# 'left_only' marks candidates without a matching review entry.
intersect_df = intersect_df[intersect_df['_merge'] == 'left_only'].drop(columns=['_merge'])

print(f"After removing reviewed structures: {len(intersect_df)} structures remain")