In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
import torch

# Function to convert model outputs to PDB files
def convert_outputs_to_pdb(outputs):
    """Turn a batch of folding-model outputs into PDB-format strings.

    Args:
        outputs: Mapping-like model output. The keys read here are
            "positions", "atom37_atom_exists", "aatype", "residue_index",
            "plddt" and, when present, "chain_index".

    Returns:
        A list holding one ``to_pdb`` result per entry along the first
        axis of ``outputs["aatype"]``.
    """
    # Only the last entry of outputs["positions"] is converted to the
    # atom37 layout.
    atom37_coords = atom14_to_atom37(outputs["positions"][-1], outputs).cpu().numpy()
    atom37_exists = outputs["atom37_atom_exists"].cpu().numpy()
    has_chain_index = "chain_index" in outputs

    def as_numpy(key, item):
        # Pull one batch entry of a tensor-valued field out as a NumPy array.
        return outputs[key][item].cpu().numpy()

    pdb_strings = []
    for item in range(outputs["aatype"].shape[0]):
        chain_index = as_numpy("chain_index", item) if has_chain_index else None
        protein = OFProtein(
            aatype=as_numpy("aatype", item),
            atom_positions=atom37_coords[item],
            atom_mask=atom37_exists[item],
            # The stored residue indices are shifted up by one before export.
            residue_index=as_numpy("residue_index", item) + 1,
            # The per-atom "plddt" values are written into the B-factor field.
            b_factors=as_numpy("plddt", item),
            chain_index=chain_index,
        )
        pdb_strings.append(to_pdb(protein))
    return pdb_strings

# Load the tokenizer and the folding model, then move the model to the GPU
# (the .cuda() call means a CUDA device is required)
model_name = "facebook/esmfold_v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmForProteinFolding.from_pretrained(model_name, low_cpu_mem_usage=True).cuda()

# Convert the `model.esm` submodule to float16.
# NOTE(review): presumably a memory optimization; comment this line out to keep
# the submodule in its original precision.
model.esm = model.esm.half()

# Allow TensorFloat32 in CUDA matrix multiplications
torch.backends.cuda.matmul.allow_tf32 = True

# Set the trunk chunk size.
# NOTE(review): 128 is said to suit an 11GB GPU; confirm for other hardware.
model.trunk.set_chunk_size(128)

def generate_structures_from_csvs_in_directory(directory_path):
    """Predict a structure for every sequence in each CSV file of a directory.

    Every ``*.csv`` file directly inside ``directory_path`` is read with
    pandas. Files lacking a "sequence" or "gene" column are skipped. For each
    remaining file a sibling folder ``<csv name>_predicted_structures`` is
    created, and every row is tokenized, run through the module-level
    ``model`` and written out as ``<gene>_<n>.pdb`` via
    ``convert_outputs_to_pdb``.

    Rows whose sequence is missing or not text are skipped, and a failure on
    one row is reported without abandoning the remaining rows of that file.

    Args:
        directory_path: Directory that contains the CSV files.

    Returns:
        None. Progress and errors are reported with ``print``.

    Raises:
        OSError: If ``directory_path`` cannot be listed (raised by
            ``os.listdir``).
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith(".csv"))

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    for csv_file in csv_files:
        csv_path = os.path.join(directory_path, csv_file)
        print(f"Processing file: {csv_file}")

        try:
            df = pd.read_csv(csv_path)
            if "sequence" not in df.columns or "gene" not in df.columns:
                print(f"Skipping {csv_file}: Missing 'sequence' or 'gene' column.")
                continue

            # Create an output folder in the same directory as the CSV
            output_dir = os.path.join(
                directory_path, os.path.splitext(csv_file)[0] + "_predicted_structures"
            )
            os.makedirs(output_dir, exist_ok=True)

            problem_rows = 0
            for idx, row in df.iterrows():
                sequence = row["sequence"]
                protein_name = row["gene"]

                # pandas reads empty cells as NaN (a float); such rows cannot
                # be tokenized, so skip them instead of failing the whole file.
                if not isinstance(sequence, str) or not sequence.strip():
                    print(f"Skipping row {idx} in {csv_file}: empty or non-text sequence.")
                    problem_rows += 1
                    continue

                # Isolate each row so that one failed prediction does not
                # discard the rest of the file.
                try:
                    input_ids = tokenizer(
                        [sequence], return_tensors="pt", add_special_tokens=False
                    )["input_ids"].cuda()

                    with torch.no_grad():
                        outputs = model(input_ids)

                    pdbs = convert_outputs_to_pdb(outputs)

                    for pdb_idx, pdb in enumerate(pdbs):
                        pdb_path = os.path.join(output_dir, f"{protein_name}_{pdb_idx + 1}.pdb")
                        with open(pdb_path, "w") as pdb_file:
                            pdb_file.write(pdb)
                except Exception as e:
                    print(f"Error processing {protein_name} (row {idx}) in {csv_file}: {e}")
                    problem_rows += 1

            if problem_rows:
                print(
                    f"Processed file {csv_file} with {problem_rows} row(s) skipped or failed."
                )
            else:
                print(f"Successfully processed and saved structures for file: {csv_file}")

        except Exception as e:
            print(f"Error processing file {csv_file}: {e}")
        finally:
            print(f"Finished processing file: {csv_file}")

# Example usage: runs as soon as this cell/script is executed. The placeholder
# below must be replaced first, because the function lists this directory with
# os.listdir.
directory_path = "path_to_your_directory"  # Replace with your directory path
generate_structures_from_csvs_in_directory(directory_path)
