# SIF workflow for characterising synthetic proteins

# This is the development version 0.1

### First define the data sources: the synthetic sequences and the natural sequences must each be supplied as a single FASTA file.

In [1]:
import os

# --- Dataset names (stems of the FASTA files, without the ".fasta" suffix) ---
synName = 'ISP_1111_A1_short'
natName = 'sdr_pdg_filtered_short'

# --- Input sequence folders ---
# Folder holding the synthetic protein input data
synProts = '/home/neil/data/sif/synProts/ISP_1111_A1/'
# Folder holding the natural protein input data
natProts = '/home/neil/data/sif/natProts/zhen_sdrs/'

# --- Structure folders ---
# Folder for the synthetic protein structure data
synStrucs = '/home/neil/data/sif/synStrucs/'
# Folder for the natural protein structure data
natStrucs = '/home/neil/data/sif/natStrucs/'

# Export the same values as environment variables of this process
os.environ.update({
    'SYNPROTS_NAME': synName,
    'NATPROTS_NAME': natName,
    'SYNPROTS': synProts,
    'NATPROTS': natProts,
    'SYNSTRUCS': synStrucs,
    'NATSTRUCS': natStrucs,
})




In [2]:
#Natural language library
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting click (from nltk)
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.3.1-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regex>=2021.8.3 (from nltk)
  Using cached regex-2023.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.3 joblib-1.3.1 nltk-3.8.1 regex-2023.6.3 tqdm-4.65.0


In [3]:
# Import necessary libraries
import nltk
import random
import uuid
from collections import Counter

# Download the Brown corpus (nltk reports "already up-to-date" when it is
# present locally)
nltk.download('brown')

# All tokens of the corpus. Note that this includes punctuation tokens such as
# ',', ';', '(' and '``', not only words.
word_list = nltk.corpus.brown.words()

# Count the occurrences of each token
word_counter = Counter(word_list)

# Keep the 5000 most common *purely alphabetic ASCII* tokens.
# The words end up in the workflow ID, which is later used as a directory name
# and interpolated into shell commands, so punctuation tokens (';', '(', '--',
# quotes, ...) must never be selectable.
common_words = [
    word
    for word, _ in word_counter.most_common()
    if word.isascii() and word.isalpha()
][:5000]

def generate_human_memorable_id():
    """Return an identifier of the form ``<word>-<word>-<6 digits>``.

    Both words are drawn independently at random from the module-level
    ``common_words`` list. The digits are the last six characters of the
    decimal representation of a freshly generated version-4 UUID.
    """
    # Decimal rendering of a random UUID; only its tail is kept for brevity
    uuid_digits = str(uuid.uuid4().int)
    suffix = uuid_digits[-6:]

    # Two independent draws, so the same word may be picked twice
    first_word = random.choice(common_words)
    second_word = random.choice(common_words)

    return f'{first_word}-{second_word}-{suffix}'

# Create the identifier for this workflow run, then report it
wf_id = generate_human_memorable_id()
print("Workflow ID is:", wf_id)


[nltk_data] Downloading package brown to /home/neil/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Workflow ID is: conspiracy-chances-283886


### Structural Predictions using esmFold

**Fold the synthetic proteins**

In [4]:
!pip install py3Dmol
!pip install biopython

# Build the complete paths first
fasta_path = f"{synProts}{synName}.fasta"
output_path = f"{synStrucs}{wf_id}/{synName}"

# Use the paths in the command
!python /home/neil/projects-dep/esmFold/esmFold.py {fasta_path} {output_path}
print("/home/neil/projects-dep/esmFold/esmFold.py", fasta_path, output_path)

Collecting py3Dmol
  Using cached py3Dmol-2.0.3-py2.py3-none-any.whl (12 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.0.3
Collecting biopython
  Downloading biopython-1.81-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy (from biopython)
  Using cached numpy-1.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
Installing collected packages: numpy, biopython
Successfully installed biopython-1.81 numpy-1.25.0
Traceback (most recent call last):
  File "/home/neil/projects-dep/esmFold/esmFold.py", line 3, in <module>
    from transformers.utils import send_example_telemetry
ModuleNotFoundError: No module named 'transformers'
/home/neil/projects-dep/esmFold/esmFold.py /home/neil/data/sif/synProts/ISP_1111_A1/ISP_1111_A1_short.fasta /home/neil/data/sif/synStrucs/

**Fold the natural proteins**

In [None]:
# Build the complete paths first
fasta_path = f"{natProts}{natName}.fasta"
output_path = f"{natStrucs}{wf_id}/{natName}"

# Use the paths in the command
!python /home/neil/projects-dep/esmFold/esmFold.py {fasta_path} {output_path}
print("/home/neil/projects-dep/esmFold/esmFold.py", fasta_path, output_path)

# Building the SSN. This step is run manually in a terminal session on TBone, as it can take a while.

1) SSH into TBone e.g. ssh neil@92.40.34.250
2) conda activate ssn
3) run the command generated below so that things end up in the right directory
4) export NXF_VER=22.10.0
5) nextflow run ravenlocke/nf-needleall-ava --infile /home/neil/data/sif/synProts/ISP_1111_A1/ISP_1111_A1_short.fasta --outdir /home/neil/data/sif/synProtComps/ssn/ --threshold 0.4

In [None]:


# Emits a single empty line of output
print("")