# SIF workflow for characterising synthetic proteins

# This is the development version 0.1

### First define the data sources: one synthetic set of sequences in a single FASTA file and one natural set of sequences in a single FASTA file.

In [None]:
import os

# Define the dataset names. Each name is the stem of a "<name>.fasta" file.
# Dataset names and directories are also exported through os.environ so that
# processes started from this notebook can read them.
synName = os.environ['SYNPROTS_NAME'] = 'ISP_1111_A1_short'
natName = os.environ['NATPROTS_NAME'] = 'sdr_pdg_filtered_short'

# Root of the data tree. Set the SIF_DATA_ROOT environment variable before
# starting the kernel to run the workflow against a different location; the
# default is the original development location.
SIF_DATA_ROOT = os.environ.get('SIF_DATA_ROOT', '/home/neil/data/sif')


def _data_dir(*parts):
    """Return the directory SIF_DATA_ROOT/<parts...> with a trailing separator.

    The trailing separator is required: other cells build file paths by plain
    string concatenation, e.g. f"{synProts}{synName}.fasta".
    """
    # Joining with a final empty component appends the trailing separator.
    return os.path.join(SIF_DATA_ROOT, *parts, '')


# Locations of the data folders

# Synthetic protein input data lives here
synProts = os.environ['SYNPROTS'] = _data_dir('synProts', 'ISP_1111_A1')
# Natural protein input data lives here
natProts = os.environ['NATPROTS'] = _data_dir('natProts', 'zhen_sdrs')

# Synthetic protein structure output data lives here
synStrucs = os.environ['SYNSTRUCS'] = _data_dir('synStrucs')
# Natural protein structure output data lives here
natStrucs = os.environ['NATSTRUCS'] = _data_dir('natStrucs')

# Synthetic protein comparison data lives here
synProtsComp = os.environ['SYNPROTSCOMP'] = _data_dir('synProtsComp')
# Natural protein comparison data lives here
natProtsComp = os.environ['NATPROTSCOMP'] = _data_dir('natProtsComp')



In [None]:
#Natural language library
!pip install nltk

In [None]:
# Import necessary libraries
import nltk
import random
import uuid
from collections import Counter

# Download the Brown corpus
nltk.download('brown')

# Token stream of the Brown corpus
word_list = nltk.corpus.brown.words()

# Count the occurrences of each word, case-insensitively.
# The token stream also contains punctuation and contractions (for example
# "," or "didn't"). Those must not end up in a workflow ID, because the ID is
# used inside directory paths and shell commands, so only purely alphabetic
# tokens are counted.
word_counter = Counter(word.lower() for word in word_list if word.isalpha())

# Get the 5000 most common words
common_words = [word for word, _ in word_counter.most_common(5000)]

def generate_human_memorable_id():
    """Return an identifier of the form "<word>-<word>-<6 digits>".

    Both words are drawn independently (so they may be equal) from the
    module-level ``common_words`` list; the digits are the last six decimal
    digits of a freshly generated random UUID.
    """
    # Decimal digits of a random UUID; only the tail is kept for brevity
    uuid_digits = str(uuid.uuid4().int)

    # Two independent random picks from the word list
    first_word, second_word = (random.choice(common_words) for _ in range(2))

    return '-'.join((first_word, second_word, uuid_digits[-6:]))

# Create the identifier for this workflow run and report it
wf_id = generate_human_memorable_id()
workflow_banner = " ".join(("Workflow ID is:", wf_id))
print(workflow_banner)


### Structural Predictions using esmFold

**Fold the synthetic proteins**

In [11]:
!pip install py3Dmol
!pip install biopython
!pip install transformers
!pip install torch
!pip install pandas
!pip install accelerate

# Build the complete paths first
fasta_path = f"{synProts}{synName}.fasta"
output_path = f"{synStrucs}{wf_id}/{synName}"

# Use the paths in the command
!python /home/neil/projects-dep/esmFold/esmFold.py {fasta_path} {output_path}
print("/home/neil/projects-dep/esmFold/esmFold.py", fasta_path, output_path)

Collecting accelerate
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)


Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing protein: 1.1.1.1_1_0_1.4083549294407938
Average pLDDT score: 0.318901926279068
Processing protein: 1.1.1.1_4_0_1.1745054754441522
Average pLDDT score: 0.3258002996444702
Processing protein: 1.1.1.1_11_0_1.2964538976041016
Average pLDDT score: 0.4318179488182068
Processing protein: 1.1.1.1_11_1_1.6089056986638242
Average pLDDT score: 0.36328986287117004
/home/neil/projects-dep/esmFold/esmFold.py /home/neil/data/sif/synProts/ISP_1111_A1/ISP_1111_A1_short.fasta /home/neil/data/sif/synStrucs/follows-study-374520/ISP_1111_A1_short


**Fold the natural proteins**

In [None]:
# Build the complete paths first
fasta_path = f"{natProts}{natName}.fasta"
output_path = f"{natStrucs}{wf_id}/{natName}"

# Use the paths in the command
!python /home/neil/projects-dep/esmFold/esmFold.py {fasta_path} {output_path}
print("/home/neil/projects-dep/esmFold/esmFold.py", fasta_path, output_path)

# Building the SSNs



We run this step manually in a terminal session on TBone, because it can take a while.

1) SSH into TBone e.g. ssh neil@92.40.34.250
2) conda activate nf-needleall-ava
3) export NXF_VER=22.10.0
4) Run the command printed by the cell below, so that the results are written to the right output directory

In [None]:
## Synthetic Proteins

In [18]:
# Input FASTA and per-workflow output directory for the synthetic proteins
fasta_path = f"{synProts}{synName}.fasta"
comp_output_path_ssn = f"{synProtsComp}ssn/{synName}/{wf_id}"

# Assemble the nextflow command line to copy into the terminal
nextflow_parts = (
    "nextflow run ravenlocke/nf-needleall-ava --infile",
    fasta_path,
    "--outdir",
    comp_output_path_ssn,
    "--threshold 0.4",
)
print(" ".join(nextflow_parts))

nextflow run ravenlocke/nf-needleall-ava --infile /home/neil/data/sif/synProts/ISP_1111_A1/ISP_1111_A1_short.fasta --outdir /home/neil/data/sif/synProtsComp/ssn/ISP_1111_A1_short/follows-study-374520 --threshold 0.4


In [None]:
## Natural Proteins

In [19]:
# Input FASTA and per-workflow output directory for the natural proteins
fasta_path = f"{natProts}{natName}.fasta"
comp_output_path_ssn = f"{natProtsComp}ssn/{natName}/{wf_id}"

# Assemble the nextflow command line to copy into the terminal
nextflow_parts = (
    "nextflow run ravenlocke/nf-needleall-ava --infile",
    fasta_path,
    "--outdir",
    comp_output_path_ssn,
    "--threshold 0.4",
)
print(" ".join(nextflow_parts))

nextflow run ravenlocke/nf-needleall-ava --infile /home/neil/data/sif/natProts/zhen_sdrs/sdr_pdg_filtered_short.fasta --outdir /home/neil/data/sif/natProtsComp/ssn/sdr_pdg_filtered_short/follows-study-374520 --threshold 0.4
