In [None]:
## MAKING STRUCTURAL PREDICTIONS WITH CHAI, mismatch
from pathlib import Path
import numpy as np
import torch
from chai_lab.chai1 import run_inference
import os
# Restrict this process to a single GPU and cap the OpenMP thread pool.
# Both variables are plain environment settings read by the CUDA / OpenMP runtimes.
gpu_index = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_index
os.environ["OMP_NUM_THREADS"] = "6"


# Three-record FASTA handed to chai: one protein chain followed by two RNA strands.
# The surrounding blank lines are removed by .strip(), so the file starts at the
# first ">" header and ends on the last nucleotide.
example_fasta = """
>protein|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
MYSGAGPALAPPAPPPPIQGYAFKPPPRPDFGTSGRTIKLQANFFEMDIPKIDIYHYELDIKPEKCPRRVNREIVEHMVQHFKTQIFGDRKPVFDGRKNLYTAMPLPIGRDKVELEVTLPGEGKDRIFKVSIKWVSCVSLQALHDALSGRLPSVPFETIQALDVVMRHLPSMRYTPVGRSFFTASEGCSNPLGGGREVWFGFHQSVRPSLWKMMLNIDVSATAFYKAQPVIEFVCEVLDFKSIEEQQKPLTDSQRVKFTKEIKGLKVEITHCGQMKRKYRVCNVTRRPASHQTFPLQQESGQTVECTVAQYFKDRHKLVLRYPHLPCLQVGQEQKHTYLPLEVCNIVAGQRCIKKLTDNQTSTMIRATARSAPDRQEEISKLMRSADFNTDPYVREFGIMVKDEMTDVTGRVLQPPSILYGGRNKAIATPVQGVWDMRNKQFHTGIEIKVWAIACFAPQRQCTEVHLKSFTEQLRKISRDAGMPIQGQPCFCKYAQGADSVEPMFRHLKNTYAGLQLVVVILPGKTPVYAEVKRVGDTVLGMATQCVQMKNVQRTTPQTLSNLCLKINVKLGGVNNILLPQGRPPVFQQPVIFLGADVTHPPAGDGKKPSIAAVVGSMDAHPNRYCATVRVQQHRQEIIQDLAAMVRELLIQFYKSTRFKPTRIIFYRAGVSEGQFQQVLHHELLAIREACIKLEKDYQPGITFIVVQKRHHTRLFCTDKNERVGKSGNIPAGTTVDTKITHPTEFDFYLCSHAGIQGTSRPSHYHVLWDDNRFSSDELQILTYQLCHTYVRCTRSVSIPAPAYYAHLVAFRARYHLVDKEHDAAEGDHTDGQANGRDHQALAKAVQVHQDTLRTMYFA
>rna|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
AUAGGACUCAUAUUAGGAGAU
>rna|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
AUCUCCUAAUAUGAAUCCUAU
""".strip()

# Write the example FASTA to a file ### REQUIRED
# (run_inference takes a path to a FASTA file, not the text itself)
fasta_path = Path("example.fasta")
fasta_path.write_text(example_fasta)


output_dir = Path("outputs")
output_cif_paths = run_inference(
    fasta_file=fasta_path,
    output_dir=output_dir,
    num_trunk_recycles=3,
    num_diffn_timesteps=200,
    seed=42,
    device=torch.device("cuda:0"),  # Use "cpu" if no GPU is available
    use_esm_embeddings=True,
)


# The predictions land in output_dir as mmCIF files (the batch cells of this
# notebook read "pred.model_idx_0.cif" from their output directory), so list
# "*.cif"; the previous "*.pdb" pattern could never match anything.
cif_files = sorted(output_dir.glob("*.cif"))
print("Generated CIF files:")
for cif_file in cif_files:
    print(cif_file)


In [None]:
from pathlib import Path
import numpy as np
import torch
from chai_lab.chai1 import run_inference
import os

# Physical GPU to run on. It is exposed to this process through
# CUDA_VISIBLE_DEVICES, and it also names the per-GPU output directory below.
gpu_index = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_index
os.environ["OMP_NUM_THREADS"] = "6"

# The protein chain is kept in a side file instead of being pasted inline.
sequence_file = Path("pik3cb_sequence.txt")
protein_sequence = sequence_file.read_text().strip()

# Three-record FASTA: the protein followed by two RNA strands.
example_fasta = f"""
>protein|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
{protein_sequence}
>rna|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
AUAGGACUCAUAUUAGGAGAU
>rna|PIK3CB_AUAGGACUCAUAUUAGGAGAU_AUCUCCUAAUAUGAAUCCUAU_mismatch_guide_pos7
AUCUCCUAAUAUGAAUCCUAU
""".strip()

fasta_path = Path("example.fasta")
fasta_path.write_text(example_fasta)

output_dir = Path(f"output{gpu_index}")
output_cif_paths = run_inference(
    fasta_file=fasta_path,
    output_dir=output_dir,
    num_trunk_recycles=3,
    num_diffn_timesteps=200,
    seed=42,
    # With CUDA_VISIBLE_DEVICES limited to one GPU, that GPU is the only visible
    # device and is addressed as ordinal 0 whatever its physical index is.
    # "cuda:{gpu_index}" would only be valid for gpu_index == "0".
    # NOTE(review): this relies on the variable being set before CUDA is first
    # initialised in this kernel - run from a fresh kernel.
    device=torch.device("cuda:0"),
    use_esm_embeddings=True,
)

# Predictions are written as mmCIF (the batch cells read "pred.model_idx_0.cif"),
# so "*.cif" is the pattern to list; "*.pdb" never matched any file.
cif_files = sorted(output_dir.glob("*.cif"))
print("Generated CIF files:")
for cif_file in cif_files:
    print(cif_file)


In [None]:
## MAKING STRUCTURAL PREDICTIONS WITH CHAI, mismatch
import pandas as pd
from pathlib import Path
import torch
from chai_lab.chai1 import run_inference
import shutil
import os

# Worker configuration: this run handles slice `gpu_index` out of `num_gpus`
# equal slices of the CSV, on the physical GPU with the same index.
gpu_index = 0  # Default to GPU index 0 if not specified
num_gpus = 4   # Number of slices the CSV is divided into
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
os.environ["OMP_NUM_THREADS"] = "6"
print(f"Using GPU index: {gpu_index}")

# With CUDA_VISIBLE_DEVICES limited to one GPU, that GPU is the only visible
# device and is addressed as ordinal 0; "cuda:{gpu_index}" is only valid for
# gpu_index == 0.
# NOTE(review): requires the variable to be set before CUDA is first initialised
# in this process - run from a fresh kernel.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running on device: {device}")

sequence_file = Path("pik3cb_sequence.txt")
protein_sequence = sequence_file.read_text().strip()
print(f"Loaded protein sequence from {sequence_file}")

df = pd.read_csv('gene_alignments2.csv')
print(f"Loaded CSV file with {len(df)} rows")

# Contiguous partition of the rows; the last slice also absorbs the remainder
# left over by the integer division.
total_rows = len(df)
portion_size = total_rows // num_gpus
start_idx = gpu_index * portion_size
end_idx = start_idx + portion_size if gpu_index < num_gpus - 1 else total_rows
df_gpu = df.iloc[start_idx:end_idx]  # Get portion of the file for this GPU

print(f"Processing rows {start_idx} to {end_idx} (total {len(df_gpu)} rows)")

# Paths that do not depend on the row are set up once. Defining output_dir here
# also keeps it defined when the slice is empty.
fasta_path = Path(f"{gpu_index}.fasta")
output_dir = Path(f"output{gpu_index}")
output_cifs_dir = Path("output_cifs")
output_cifs_dir.mkdir(exist_ok=True)

collected_cifs = []  # top-ranked structures gathered during this run

for _row_label, row in df_gpu.iterrows():
    ensembl_id = row['ensembl_id']
    target_rna = row['target_rna']

    print(f"Processing ensembl_id: {ensembl_id}, target_rna: {target_rna}")

    # Protein + fixed guide strand + the row's target RNA.
    example_fasta = f""">protein|PIK3CB
{protein_sequence}
>rna|AUAGGACUCAUAUUAGGAGAU
AUAGGACUCAUAUUAGGAGAU
>rna|PIK3CB_AUAGGACUCAUAUUAGGAGAU_{ensembl_id}_{target_rna}
{target_rna}"""

    # The same FASTA file is overwritten for every row.
    fasta_path.write_text(example_fasta)
    print(f"FASTA file written to {fasta_path}")

    # NOTE(review): output_dir is reused for every row and only model 0 is moved
    # out afterwards; confirm the installed chai-lab release accepts a non-empty
    # output directory.
    output_cif_paths = run_inference(
        fasta_file=fasta_path,
        output_dir=output_dir,
        num_trunk_recycles=3,
        num_diffn_timesteps=200,
        seed=42,
        device=device,
        use_esm_embeddings=True,
    )

    # Keep only model 0, renamed after the gene.
    # NOTE(review): rows sharing an ensembl_id overwrite each other's file -
    # confirm the ids in the CSV are unique.
    cif_file = output_dir / "pred.model_idx_0.cif"
    if cif_file.exists():
        new_cif_name = output_cifs_dir / f"{ensembl_id}.cif"
        shutil.move(str(cif_file), str(new_cif_name))
        collected_cifs.append(new_cif_name)
        print(f"Moved {cif_file} to {new_cif_name}")
    else:
        print(f"File {cif_file} does not exist")

# Summary of what this run produced. The previous output_dir.glob("*.pdb") never
# matched (the outputs are .cif) and raised NameError for an empty slice.
print("Generated CIF files:")
for cif_path in collected_cifs:
    print(cif_path)


In [None]:
##OVERLAPPING

import pandas as pd
from pathlib import Path
import torch
from chai_lab.chai1 import run_inference
import shutil
import sys

def main(num_gpus=1, gpu_index=0, only_ensembl_id='ENSG00000005073', skip_existing=False):
    """Run chai structure prediction for one slice of ``gene_alignments3.csv``.

    For every selected row a three-record FASTA (protein, fixed guide RNA, the
    row's ``target_rna``) is written to ``fasta/<gpu_index>.fasta``, passed to
    ``run_inference``, and the resulting ``pred.model_idx_0.cif`` is moved to
    ``output_cifs_overlap/<ensembl_id>.cif``.

    Args:
        num_gpus: Number of contiguous slices the selected rows are divided into.
        gpu_index: Zero-based slice handled by this call; the last slice also
            takes the remainder of the integer division.
        only_ensembl_id: Restrict the run to rows with this ``ensembl_id``.
            Pass ``None`` to process every row. The default reproduces the
            single-gene run this function was previously hard-wired to.
        skip_existing: When true, rows whose ``<ensembl_id>.cif`` is already
            present in the collection directory are skipped.

    Returns:
        None. Results are files on disk; progress is printed.
    """
    df = pd.read_csv('gene_alignments3.csv')
    output_cifs_dir = Path("output_cifs_overlap")
    # shutil.move does not create the destination directory, so make sure it exists.
    output_cifs_dir.mkdir(parents=True, exist_ok=True)

    if only_ensembl_id is not None:
        df = df[df['ensembl_id'] == only_ensembl_id]
    total_rows = len(df)
    portion_size = total_rows // num_gpus

    start_idx = gpu_index * portion_size
    end_idx = (gpu_index + 1) * portion_size if gpu_index < (num_gpus - 1) else total_rows
    df_gpu = df.iloc[start_idx:end_idx]

    # Row-independent setup, done once instead of on every iteration.
    fasta_path = Path(f"fasta/{gpu_index}.fasta")
    # Path.write_text fails if the parent directory is missing.
    fasta_path.parent.mkdir(parents=True, exist_ok=True)
    output_dir = Path(f"outputdirs/output{gpu_index}")
    output_dir.parent.mkdir(parents=True, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for _row_label, row in df_gpu.iterrows():
        ensembl_id = row['ensembl_id']
        target_rna = row['target_rna']

        new_cif_name = output_cifs_dir / f"{ensembl_id}.cif"
        if skip_existing and new_cif_name.exists():
            print(f"Skipping {ensembl_id}, output file already exists: {new_cif_name}")
            continue

        print(f"Processing ensembl_id: {ensembl_id}, target_rna: {target_rna}")

        example_fasta = f""">protein|PIK3CB
MYSGAGPALAPPAPPPPIQGYAFKPPPRPDFGTSGRTIKLQANFFEMDIPKIDIYHYELDIKPEKCPRRVNREIVEHMVQHFKTQIFGDRKPVFDGRKNLYTAMPLPIGRDKVELEVTLPGEGKDRIFKVSIKWVSCVSLQALHDALSGRLPSVPFETIQALDVVMRHLPSMRYTPVGRSFFTASEGCSNPLGGGREVWFGFHQSVRPSLWKMMLNIDVSATAFYKAQPVIEFVCEVLDFKSIEEQQKPLTDSQRVKFTKEIKGLKVEITHCGQMKRKYRVCNVTRRPASHQTFPLQQESGQTVECTVAQYFKDRHKLVLRYPHLPCLQVGQEQKHTYLPLEVCNIVAGQRCIKKLTDNQTSTMIRATARSAPDRQEEISKLMRSADFNTDPYVREFGIMVKDEMTDVTGRVLQPPSILYGGRNKAIATPVQGVWDMRNKQFHTGIEIKVWAIACFAPQRQCTEVHLKSFTEQLRKISRDAGMPIQGQPCFCKYAQGADSVEPMFRHLKNTYAGLQLVVVILPGKTPVYAEVKRVGDTVLGMATQCVQMKNVQRTTPQTLSNLCLKINVKLGGVNNILLPQGRPPVFQQPVIFLGADVTHPPAGDGKKPSIAAVVGSMDAHPNRYCATVRVQQHRQEIIQDLAAMVRELLIQFYKSTRFKPTRIIFYRAGVSEGQFQQVLHHELLAIREACIKLEKDYQPGITFIVVQKRHHTRLFCTDKNERVGKSGNIPAGTTVDTKITHPTEFDFYLCSHAGIQGTSRPSHYHVLWDDNRFSSDELQILTYQLCHTYVRCTRSVSIPAPAYYAHLVAFRARYHLVDKEHDAAEGDHTDGQANGRDHQALAKAVQVHQDTLRTMYFA
>rna|AUAGGAUUCAUAUUAGGAGAU
AUAGGAUUCAUAUUAGGAGAU
>rna|PIK3CB_AUAGGAUUCAUAUUAGGAGAU_{ensembl_id}_{target_rna}
{target_rna}"""

        # The same FASTA file is overwritten for every row.
        fasta_path.write_text(example_fasta)
        print(f"FASTA file written to {fasta_path}")

        output_cif_paths = run_inference(
            fasta_file=fasta_path,
            output_dir=output_dir,
            num_trunk_recycles=3,
            num_diffn_timesteps=200,
            seed=42,
            device=device,
            use_esm_embeddings=True,
        )

        # Keep only model 0, renamed after the gene.
        cif_file = output_dir / "pred.model_idx_0.cif"
        if cif_file.exists():
            shutil.move(str(cif_file), str(new_cif_name))
            print(f"Moved {cif_file} to {new_cif_name}")
        else:
            print(f"File {cif_file} does not exist")

main()

In [11]:
from pathlib import Path
import pandas as pd
# Preview of the slice that worker `gpu_index` (out of `num_gpus`) would process
# when re-running the genes listed in the two error files.
num_gpus, gpu_index = 8, 0
df = pd.read_csv('gene_alignments3.csv')
output_cifs_dir = Path("output_cifs_overlap")

# One entry per line in each error file; surrounding whitespace is dropped.
with open('gna/errors.txt') as f1, open('amide/errors.txt') as f2:
    errors1 = {entry.strip() for entry in f1}
    errors2 = {entry.strip() for entry in f2}

# Entries are file names; the stem (name without its extension) is matched
# against the ensembl_id column.
combined_errors = errors1 | errors2
ensembl_ids = {Path(entry).stem for entry in combined_errors}

# Keep only the rows named in either error file.
in_error_lists = df['ensembl_id'].isin(ensembl_ids)
df = df[in_error_lists]
print(len(df))

# Drop the rows whose structure file already exists in the collection directory.
already_done = df['ensembl_id'].apply(lambda ensembl_id: (output_cifs_dir / f"{ensembl_id}.cif").exists())
df = df[~already_done]
total_rows = len(df)
print(total_rows)

# Contiguous slices of equal size; the last worker also takes the remainder.
portion_size = total_rows // num_gpus
start_idx = gpu_index * portion_size
if gpu_index < (num_gpus - 1):
    end_idx = (gpu_index + 1) * portion_size
else:
    end_idx = total_rows
df_gpu = df.iloc[start_idx:end_idx]
print(len(df_gpu))

376
375
46


In [9]:
## COMPARE ERROR FILES
# Report the entries of amide/errors.txt that do not appear in gna/errors.txt.

# gna list: one entry per row, exact duplicate rows removed.
df = pd.read_csv('gna/errors.txt', header=None).drop_duplicates()
remaining_entries = len(df)
print(remaining_entries)

# amide list: one entry per line, surrounding whitespace dropped.
with open('amide/errors.txt') as f:
    errors = {line.strip() for line in f}

# Compare as strings; column 0 holds the entry text of the gna file.
existing_entries = set(df[0].astype(str).tolist())
missing_entries = errors.difference(existing_entries)

if len(missing_entries) > 0:
    print("Missing entries in gna/errors.txt:", missing_entries)

311
Missing entries in gna/errors.txt: {'ENSG00000005469.pdb', 'ENSG00000082898.pdb', 'ENSG00000081386.pdb', 'ENSG00000144036.pdb', 'ENSG00000064012.pdb', 'ENSG00000109321.pdb', 'ENSG00000069345.pdb', 'ENSG00000200795.pdb', 'ENSG00000228794.pdb', 'ENSG00000104880.pdb', 'ENSG00000144028.pdb', 'ENSG00000205730.pdb', 'ENSG00000236081.pdb', 'ENSG00000198626.pdb', 'ENSG00000138617.pdb', 'ENSG00000160959.pdb', 'ENSG00000198171.pdb', 'ENSG00000153933.pdb', 'ENSG00000166260.pdb', 'ENSG00000170917.pdb', 'ENSG00000140395.pdb', 'ENSG00000277972.pdb', 'ENSG00000023909.pdb', 'ENSG00000253352.pdb', 'ENSG00000115966.pdb', 'ENSG00000033627.pdb', 'ENSG00000138794.pdb', 'ENSG00000102362.pdb', 'ENSG00000152669.pdb', 'ENSG00000143373.pdb', 'ENSG00000139132.pdb', 'ENSG00000152404.pdb', 'ENSG00000163629.pdb', 'ENSG00000077157.pdb', 'ENSG00000156976.pdb', 'ENSG00000135775.pdb', 'ENSG00000169071.pdb', 'ENSG00000105649.pdb', 'ENSG00000172428.pdb', 'ENSG00000148346.pdb', 'ENSG00000136205.pdb', 'ENSG00000164022.

In [14]:
import subprocess
import time

# Submit eight SLURM jobs. Each job script is the run_chai.sh template plus one
# extra line that starts run_chai3.py with the job's index as its argument.
slurm_script_name = "run_chai_mod.sh"

# Load the template once; it is reused unchanged for every job.
with open("run_chai.sh", "r") as original_file:
    lines = list(original_file)

for job_index in range(8):
    # The same script file is rewritten before every submission.
    with open(slurm_script_name, "w") as modified_file:
        modified_file.write("".join(lines))
        modified_file.write(f"python run_chai3.py {job_index}\n")

    try:
        print(f"Submitting {slurm_script_name}...")
        # check=True turns a non-zero sbatch exit status into CalledProcessError.
        subprocess.run(["sbatch", slurm_script_name], check=True)
        print(f"Submitted {slurm_script_name} successfully.")
        # Pause between successful submissions.
        time.sleep(10)
    except subprocess.CalledProcessError as submit_error:
        print(f"Failed to submit {slurm_script_name}: {submit_error}")
    except Exception as other_error:
        # Anything else is reported and the loop moves on to the next job.
        print(f"An error occurred: {other_error}")

print("Script execution completed.")

Submitting run_chai_mod.sh...
Submitted batch job 89330
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89331
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89332
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89333
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89334
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89335
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89336
Submitted run_chai_mod.sh successfully.
Submitting run_chai_mod.sh...
Submitted batch job 89337
Submitted run_chai_mod.sh successfully.
Script execution completed.
