This file is used to create a dataset of structures of known interactions.
Steps:
1. generate dataset of ARMs and TFs
2. For every ARM and TF query sequence, run BLASTp against the PDB sequence database (pdbaa)
3. Intersect the TF hits and the ARM hits on PDB ID to find structures that contain both

In [None]:
import pandas as pd
import os
from pathlib import Path
import subprocess
from typing import List

# Name of the BLAST protein-search executable launched via subprocess.
BLAST_BIN: str = "blastp"
# Database prefix handed to blastp's `-db` option (absolute, machine-specific path).
BLAST_DB: str = "/home/markus/MPI_local/data/BLAST_PDB/pdbaa/pdbaa"
# E-value threshold kept for reference; it is not passed to blastp at present.
E_VAL_CUTOFF: str = "0.00001"
# Input directories that are searched for `*.fasta` query files.
TF_QUERY_DIR: str = '../../production1/tf_fasta'
ARM_QUERY_DIR: str = '../../production1/arm_fasta'
# Output directories that receive one `<query stem>_blastp.out` file per query.
TF_OUTPUT_DIR: str = '../../production1/tf_blastp_no_e_lim'
ARM_OUTPUT_DIR: str = '../../production1/arm_blastp_no_e_lim'

In [None]:
def run_blastp(fasta_path: str, output_path: str) -> None:
    """Run BLASTp for a single FASTA query and write the hits to a file.

    The search is executed with ``BLAST_BIN`` against ``BLAST_DB`` and the
    result is written in tabular-with-comments format (``-outfmt 7``).

    A failing BLAST run is reported on stdout instead of being raised, so a
    batch over many queries keeps going. Any output file left behind by the
    failed run is deleted; otherwise a truncated file would be mistaken for
    a finished result by callers that skip queries whose output exists.

    Args:
        fasta_path: Path of the query FASTA file.
        output_path: Path of the result file to create.

    Raises:
        FileNotFoundError: If ``fasta_path`` does not exist.
    """
    if not os.path.exists(fasta_path):
        raise FileNotFoundError(f"No FASTA file found: {fasta_path}.")

    command = [
        BLAST_BIN,
        "-query", fasta_path,
        "-db", BLAST_DB,
        # No "-evalue" limit is passed (E_VAL_CUTOFF is not applied here).
        "-out", output_path,
        "-outfmt", "7",
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"BLAST error for {fasta_path}: {e}")
        # Drop the partial result so the query is retried on the next run.
        if os.path.exists(output_path):
            os.remove(output_path)

In [None]:
from pathlib import Path
from typing import List
import os

def blastp_for_dir(input_dir: str, output_dir: str) -> None:
    """Run BLASTp for every ``*.fasta`` file found directly in ``input_dir``.

    Each query ``<stem>.fasta`` produces ``<output_dir>/<stem>_blastp.out``.
    Queries whose output file already exists are skipped, which makes the
    batch resumable. A progress line is printed for every 25th query and a
    summary is printed at the end.

    Args:
        input_dir: Directory containing the query FASTA files.
        output_dir: Directory for the result files; created if missing.
    """
    queries: List[Path] = list(Path(input_dir).glob('*.fasta'))
    total: int = len(queries)

    os.makedirs(output_dir, exist_ok=True)

    already_done: int = 0
    for position, query in enumerate(queries):
        if position % 25 == 0:
            print(f"{position} of {total}")

        result_path: str = os.path.join(output_dir, f"{query.stem}_blastp.out")

        if os.path.exists(result_path):
            # A result from an earlier run is present; do not recompute it.
            already_done += 1
        else:
            run_blastp(str(query), result_path)

    print(f"Found {total} proteins in {input_dir}.\nRan blastp on {total-already_done} proteins.\nFound existing blast file for {already_done} proteins.")

In [None]:
# Run BLASTp for all TF queries, then for all ARM queries. Queries whose
# output file already exists in the target directory are skipped.
blastp_for_dir(TF_QUERY_DIR, TF_OUTPUT_DIR)
blastp_for_dir(ARM_QUERY_DIR, ARM_OUTPUT_DIR)

In [None]:
# Hits below these thresholds are discarded by clean_blastp_out.
BLAST_IDENTITY_CUTOFF: int = 0
BLAST_SCORE_CUTOFF: int = 50


def clean_blastp_out(df: pd.DataFrame) -> pd.DataFrame:
    """Derive PDB id / chain columns, de-duplicate and filter BLAST hits.

    Steps, in this order:
      1. Split ``subject`` on ``'_'``: the first piece becomes ``pdb_id``,
         the second piece becomes ``chain`` (empty string if there is none).
      2. Keep only the first row for every distinct ``subject``.
      3. Convert ``%identity`` to numeric and keep rows at or above
         ``BLAST_IDENTITY_CUTOFF``.
      4. Convert ``bit score`` to numeric and keep rows at or above
         ``BLAST_SCORE_CUTOFF``.

    The input frame is left untouched; all work happens on copies, which
    also avoids assigning columns on filtered slices.

    Args:
        df: Hit table with at least the columns ``subject``, ``%identity``
            and ``bit score``.

    Returns:
        A new DataFrame with the extra columns ``pdb_id`` and ``chain``.

    Raises:
        ValueError: If ``%identity`` or ``bit score`` holds a value that
            cannot be parsed as a number.
    """
    cleaned = df.copy()

    subject_parts = cleaned['subject'].str.split('_')
    cleaned['pdb_id'] = subject_parts.str[0]
    # Subjects without an underscore have no second piece -> empty chain.
    cleaned['chain'] = subject_parts.str[1].fillna('')

    # NOTE(review): duplicates are detected on `subject` alone, so only the
    # first query that hit a given subject is retained — confirm intended.
    cleaned = cleaned.drop_duplicates(subset='subject').copy()

    cleaned['%identity'] = pd.to_numeric(cleaned['%identity'], errors='raise')
    cleaned = cleaned[cleaned['%identity'] >= BLAST_IDENTITY_CUTOFF].copy()

    cleaned['bit score'] = pd.to_numeric(cleaned['bit score'], errors='raise')
    return cleaned[cleaned['bit score'] >= BLAST_SCORE_CUTOFF].copy()

In [None]:
from typing import List

# Column names assigned, in order, to the tab-separated fields of each hit
# row read from the BLAST output files.
# NOTE(review): presumably mirrors the 12 default fields of BLAST tabular
# output (`-outfmt 7`) — confirm against the BLAST+ version in use.
columns: List[str] = [
    "query", "subject", "%identity", "alignment length", "mismatches", "gap opens",
    "q. start", "q. end", "s. start", "s. end", "evalue", "bit score"
]

In [None]:
# Per-file row limit. Truncation is currently disabled: every hit row of
# every file is kept.
KEEP_TOP_N: int = 100


def read_blast_to_df(output_dir: str, column_names: List[str]) -> pd.DataFrame:
    """Load all ``*_blastp.out`` files of a directory into one cleaned table.

    Comment lines (starting with ``#``) and blank lines are ignored; the
    remaining lines are split on tabs. The combined rows are passed through
    ``clean_blastp_out`` before being returned.

    Args:
        output_dir: Directory containing the BLAST result files.
        column_names: Names for the tab-separated fields of a hit row.

    Returns:
        The cleaned hit table. If no hit rows were found, an empty frame
        that still carries ``pdb_id`` and ``chain`` columns, so that later
        joins on those columns keep working.
    """
    # Sorted so that the row order — and therefore which row survives the
    # first-occurrence de-duplication during cleaning — is reproducible.
    output_files: List[Path] = sorted(Path(output_dir).glob('*_blastp.out'))
    expected_fields: int = len(column_names)
    rows: List[List[str]] = []

    for output_file in output_files:
        try:
            with open(output_file, 'r') as f:
                hit_lines: List[str] = [line.strip() for line in f if not line.startswith('#')]
        except OSError as e:
            print(f"Error reading file {output_file}: {e}")
            continue

        for line in hit_lines:
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) != expected_fields:
                # E.g. the cut-off last line of an interrupted BLAST run.
                print(f"Warning: skipping malformed line in {output_file}: {line!r}")
                continue
            rows.append(fields)

    if not rows:
        print(f"Warning: No data found in {output_dir}")
        # Mirror the columns that cleaning would have derived.
        derived = [name for name in ('pdb_id', 'chain') if name not in column_names]
        return pd.DataFrame(columns=[*column_names, *derived])

    out_df: pd.DataFrame = pd.DataFrame(rows, columns=column_names)
    return clean_blastp_out(out_df)

In [None]:
# Load the TF and ARM BLAST results; each printed number is the row count
# of the table returned by read_blast_to_df.
tf_blast_df: pd.DataFrame = read_blast_to_df(TF_OUTPUT_DIR, columns)
print(len(tf_blast_df))
arm_blast_df: pd.DataFrame = read_blast_to_df(ARM_OUTPUT_DIR, columns)
print(len(arm_blast_df))

In [None]:
# Pair every TF hit with every ARM hit that lies in the same PDB entry.
# TF columns receive the `_x` suffix, ARM columns the `_y` suffix.
intersect_df: pd.DataFrame = tf_blast_df.merge(arm_blast_df, how='inner', on='pdb_id')

# filter out candidates with same chains
intersect_df = intersect_df[intersect_df['chain_x'] != intersect_df['chain_y']]

# Report after filtering, and count distinct PDB entries rather than rows:
# one structure can contribute several TF/ARM chain pairs.
print(
    f"Intersection contains {intersect_df['pdb_id'].nunique()} structures "
    f"({len(intersect_df)} TF/ARM chain pairs)"
)

if not intersect_df.empty:
    display(intersect_df[['pdb_id', 'query_x', 'query_y', 'chain_x', 'chain_y', 'subject_x', 'subject_y', '%identity_x', '%identity_y']])
else:
    print("Warning: No intersecting structures found between TF and ARM BLAST results")