# Testing Environment

In [1]:
!pip install numpy
!pip install fair-esm==2.0.0
!pip install torch
!pip install pandas

Collecting numpy
  Downloading numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl (20.1 MB)
[K     |████████████████████████████████| 20.1 MB 4.6 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.25.1
Collecting fair-esm==2.0.0
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 1.4 MB/s eta 0:00:01
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting torch
  Downloading torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl (143.4 MB)
[K     |████████████████████████████████| 143.4 MB 148 kB/s  eta 0:00:01    |███████████████▉                | 71.1 MB 16.2 MB/s eta 0:00:05     |█████████████████████████████▉  | 133.5 MB 9.2 MB/s eta 0:00:02
Collecting jinja2
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 21.3 MB/s eta 0:00:01
[?25hCollecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[K     |

In [96]:
" AND ".join(a + ["dasda"])

'hello AND dasda AND dasda'

In [84]:
!pip install fair-esm==2.0.0


Collecting matplotlib
  Using cached matplotlib-3.7.2-cp310-cp310-macosx_10_12_x86_64.whl (7.4 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl (243 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl (65 kB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.41.1-cp310-cp310-macosx_10_9_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pyparsing<3.1,>=2.3.1
  Using cached pyparsing-3.0.9-py3-none-any.whl (98 kB)
Installing collected packages: pyparsing, kiwisolver, fonttools, cycler, contourpy, matplotlib
Successfully installed contourpy-1.1.0 cycler-0.11.0 fonttools-4.41.1 kiwisolver-1.4.4 matplotlib-3.7.2 pyparsing-3.0.9


In [None]:
from esm.data import BatchConverter

In [42]:
# Impobrt packages
import torch
import esm

# Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # disables dropout for deterministic results

# Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4)
data = [
    ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
    ("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
]

batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# Extract per-residue representations (on CPU)
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[33], return_contacts=True)
token_representations = results["representations"][33]

# Extract sequence representations
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Per protein embeddings
protein1_embedding = sequence_representations[0]
protein2_embedding = sequence_representations[1]

In [43]:
protein1_embedding

tensor([ 0.0614, -0.0687,  0.0430,  ..., -0.1642, -0.0678,  0.0446])

In [44]:
protein2_embedding

tensor([ 0.0553, -0.0757,  0.0414,  ..., -0.3117, -0.0026,  0.1683])

In [37]:
from typing import Sequence


class MyBatchConverter():
    
    def __init__(self, alphabet, truncation_seq_length) -> None:
        self.alphabet = alphabet
        self.truncation_seq_length = truncation_seq_length

    def __call__(self, raw_batch: Sequence[str]):

        batch_size = len(raw_batch)

        seq_encoded_list = [self.alphabet.encode(seq_str) for seq_str in raw_batch]
        if self.truncation_seq_length:
            seq_encoded_list = [seq_str[:self.truncation_seq_length] for seq_str in seq_encoded_list]
        
        max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
        tokens = torch.empty(
            (
                batch_size,
                max_len + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
            ),
            dtype=torch.int64,
        )
        tokens.fill_(self.alphabet.padding_idx)
        strs = []

        for i, (label, seq_str, seq_encoded) in enumerate(
            zip(batch_labels, raw_batch, seq_encoded_list)
        ):
            strs.append(seq_str)
            if self.alphabet.prepend_bos:
                tokens[i, 0] = self.alphabet.cls_idx
            seq = torch.tensor(seq_encoded, dtype=torch.int64)
            tokens[
                i,
                int(self.alphabet.prepend_bos) : len(seq_encoded)
                + int(self.alphabet.prepend_bos),
            ] = seq
            if self.alphabet.append_eos:
                tokens[i, len(seq_encoded) + int(self.alphabet.prepend_bos)] = self.alphabet.eos_idx

        return strs, tokens

In [46]:
from esm.data import BatchConverter
class MyBatchConverter(BatchConverter):
    
    def __init__(self, alphabet, labels: bool = False, truncation_seq_length: int = None):
        super().__init__(alphabet=alphabet, truncation_seq_length=truncation_seq_length)
        self.label = False

    def __call__(self, raw_batch):

        if not self.label:
            raw_batch = [(f"id{i}", seq_str) for i, seq_str in enumerate(raw_batch)]

        labels, strs, tokens = super().__call__(raw_batch)
        return labels, strs, tokens
    

In [50]:
data = [
    "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
    "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
]

batch_converter = MyBatchConverter(alphabet=alphabet)
labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# Extract per-residue representations (on CPU)
with torch.no_grad():
    results = model(batch_tokens, repr_layers=[33], return_contacts=True)
token_representations = results["representations"][33]

# Extract sequence representations
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Per protein embeddings
protein1_embedding = sequence_representations[0]
protein2_embedding = sequence_representations[1]

In [61]:
import numpy as np
test = np.zeros(len(sequence_representations))
new = np.column_stack((test, sequence_representations))

In [64]:
new.shape

(2, 1281)

In [51]:
protein1_embedding

tensor([ 0.0614, -0.0687,  0.0430,  ..., -0.1642, -0.0678,  0.0446])

In [52]:
protein2_embedding

tensor([ 0.0553, -0.0757,  0.0414,  ..., -0.3117, -0.0026,  0.1683])

In [10]:
import io

buffer = io.BytesIO()
torch.save(protein1_embedding, buffer)

serialized_tensor = buffer.getvalue()

buffer = io.BytesIO(serialized_tensor)

loaded_tensor = torch.load(buffer)

In [8]:
print(protein1_embedding)
print(loaded_tensor)

tensor([ 0.0614, -0.0687,  0.0430,  ..., -0.1642, -0.0678,  0.0446])
tensor([ 0.0614, -0.0687,  0.0430,  ..., -0.1642, -0.0678,  0.0446])


In [29]:
test[protein2_embedding]

'dasd'

In [46]:
import numpy as np
tensor1 = np.array([1,2,3])
tensor2 = np.array([1,2,3])

In [47]:
tensor2 == tensor1

array([ True,  True,  True])

In [41]:
hash(tensor1) == hash(tensor2)

TypeError: unhashable type: 'numpy.ndarray'

In [34]:
hash(tensor1)

4756870704

In [35]:
hash(tensor2)

4757249936

In [36]:
a = dict()
a[tensor1] = 0
a[tensor2] = 1

In [38]:
a[tensor1]

0

In [29]:
sequence_representations = []
for i, tokens_len in enumerate(batch_lens):
    sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

# Look at the unsupervised self-attention map contact predictions
import matplotlib.pyplot as plt
for (_, seq), tokens_len, attention_contacts in zip(data, batch_lens, results["contacts"]):
    plt.matshow(attention_contacts[: tokens_len, : tokens_len])
    plt.title(seq)
    plt.show()

ModuleNotFoundError: No module named 'pyparsing'

In [110]:
test = ["day154-rep2-IgK-seqC", "day154-rep2-IgK-seqC", "day154-rep2-IgM-seqB", "day154-rep2-IgM-seqB", "day154-rep2-IgG-seqB", "day154-rep2-IgG-seqB", "day154-rep1-IgL-seqC", "day154-rep1-IgL-seqC", "day154-rep2-IgK-seqA", "day154-rep2-IgK-seqA"]
for i in test + [5]:
    print(i)

day154-rep2-IgK-seqC
day154-rep2-IgK-seqC
day154-rep2-IgM-seqB
day154-rep2-IgM-seqB
day154-rep2-IgG-seqB
day154-rep2-IgG-seqB
day154-rep1-IgL-seqC
day154-rep1-IgL-seqC
day154-rep2-IgK-seqA
day154-rep2-IgK-seqA
5


["day154-rep2-IgK-seqC", "day154-rep2-IgK-seqC", "day154-rep2-IgM-seqB", "day154-rep2-IgM-seqB", "day154-rep2-IgG-seqB", "day154-rep2-IgG-seqB", "day154-rep1-IgL-seqC", "day154-rep1-IgL-seqC", "day154-rep2-IgK-seqA", "day154-rep2-IgK-seqA"]

In [9]:
import re

test = ["day154-rep2-IgK-seqC", "day154-rep2-IgK-seqC", 
        "day154-rep2-IgM-seqB", "day154-rep2-IgM-seqB", 
        "day154-rep2-IgG-seqB", "day154-rep2-IgG-seqB", 
        "day154-rep1-IgL-seqC", "day154-rep1-IgL-seqC", 
        "day154-rep2-IgK-seqA", "day154-rep2-IgK-seqA"]

chains = ["IGK", "IGM", "IGG", "IGL"]
dsad = dict()
for rep in test:
    for ch in chains:
        if ch.lower() in rep.lower():
            if not ch in dsad.keys():
                dsad[ch] = list()

            dsad[ch].append(rep)    



In [13]:
len(dsad.values())

4

In [43]:
import re 

text = "What are we aaaa from this message?"

print(re.sub(r"r", '_', text))
print(re.sub(r" h(\w+)", ' _', text))
print(re.sub(r"(\w+)r(\w+)", '_', text))

print(re.match(r"aaaa", text)) # looks only at begining of string
print(re.search(r"aaaa", text))

What a_e we aaaa f_om this message?
What are we aaaa from this message?
What _ we aaaa _ this message?
None
<re.Match object; span=(12, 16), match='aaaa'>


In [5]:
for i in range(10):
    print(i)
    if i == 5:
        break

0
1
2
3
4
5


In [7]:
import sys
sys.stderr.write(f"test{4}")

test4

5

In [3]:
import sqlite3

conn = sqlite3.connect("example.db")
cursor = conn.cursor()

# Create the employees table
cursor.execute('''CREATE TABLE IF NOT EXISTS employees (
                    id INTEGER PRIMARY KEY,
                    name TEXT,
                    age INTEGER,
                    department TEXT
                );''')

# Insert some sample data
employees_data = [
    ("Alice", 25, "Engineering"),
    ("Bob", 30, "HR"),
    ("Charlie", 28, "Engineering"),
    ("David", 22, "Finance"),
    ("Eva", 35, "Engineering"),
    ("Frank", 40, "Finance"),
    ("Grace", 29, "HR")
]
cursor.executemany('INSERT INTO employees (name, age, department) VALUES (?, ?, ?);', employees_data)

# Commit changes and close the connection
conn.commit()
conn.close()

In [11]:
department_name = "Engineering"
max_age = 30

conn = sqlite3.connect("example.db")
conn.row_factory = sqlite3.Row  # Set row_factory to return columns as dictionaries

cursor = conn.cursor()

query = "SELECT * FROM employees WHERE department = ? AND age < ?;"
cursor.execute(query, (department_name, max_age))
columns = [col[0] for col in cursor.description]
[print(col) for col in cursor.description]

employee_data = cursor.fetchall()

cursor.close()
conn.close()

('id', None, None, None, None, None, None)
('name', None, None, None, None, None, None)
('age', None, None, None, None, None, None)
('department', None, None, None, None, None, None)


In [14]:
import pandas as pd
df = pd.DataFrame.from_records(employee_data)

NameError: name 'employee_data' is not defined

In [32]:
conn = sqlite3.connect("example.db")
df = pd.read_sql_query(query, conn, params=(department_name, max_age))
conn.close()
df

Unnamed: 0,id,name,age,department
0,1,Alice,25,Engineering
1,3,Charlie,28,Engineering
2,8,Alice,25,Engineering
3,10,Charlie,28,Engineering


In [46]:
df.to_csv("test.tsv", sep="\t")

In [47]:
df2 = pd.read_table("test.tsv", sep="\t")

In [2]:
import pandas as pd
clns = pd.read_table("~/Desktop/clones.tsv", sep="\t")

In [65]:
clns

Unnamed: 0,cloneId,readCount,bestVGene,bestDGene,bestJGene,aaSeqImputedVDJRegion,nSeqImputedVDJRegion,aaSeqImputedVDJRegion+CRegion,nSeqImputedVDJRegion+CRegion
0,0,4958.0,IGKV2-28,,IGKJ2,divmtqsplslpvtpgepasiscrssqsllhsngynyldwylqkpg...,gatattgtgatgactcagtctccactctccctgcccgtcacccctg...,region_not_covered,gatattgtgatgactcagtctccactctccctgcccgtcacccctg...
1,1,4522.0,IGKV3-15,,IGKJ2,eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprl...,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...,region_not_covered,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...
2,2,4239.0,IGKV1-12,,IGKJ3,diqmtqspssvsasvgdrvtitcrasqgisswlawyqqkpgkapkl...,gacatccagatgacccagtctccatcttccgtgtctgcatctgtag...,region_not_covered,gacatccagatgacccagtctccatcttccgtgtctgcatctgtag...
3,3,3267.0,IGKV3-15,,IGKJ1,eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprl...,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...,region_not_covered,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...
4,4,3131.0,IGKV3-15,,IGKJ1,eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprl...,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...,region_not_covered,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...
...,...,...,...,...,...,...,...,...,...
21082,24220,1.0,IGKV6-21,,IGKJ2,eivltqspdfqsvtpkekvtitcrasqsigsslhwyqqkpdqspkl...,gaaattgtgctgactcagtctccagactttcagtctgtgactccaa...,region_not_covered,region_not_covered
21083,24221,1.0,IGKV3-15,,IGKJ4,eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprl...,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...,region_not_covered,gaaatagtgatgacgcagtctccagccaccctgtctgtgtctccag...
21084,24224,1.0,IGKV2-26,,IGKJ3,eivmtqtplslsitpgeqasmscrssqsllhsdgytylywflqkar...,gagattgtgatgacccagactccactctccttgtctatcacccctg...,region_not_covered,region_not_covered
21085,24226,1.0,IGKV1D-17,,IGKJ1,niqmtqspsamsasvgdrvtitcrarqgisnylawfqqkpgkvpkh...,aacatccagatgacccagtctccatctgccatgtctgcatctgtag...,region_not_covered,region_not_covered


In [3]:
seqs = clns[["aaSeqImputedVDJRegion"]].to_numpy().tolist()
len(seqs)

21087

In [78]:
seqs.tolist()

[['divmtqsplslpvtpgepasiscrssqsllhsngynyldwylqkpgqspqlliylgsnrasgvpdrfsgsgsgtdftlkisrveaedvgvyyCMQALQTPYTFgqgtkleik_'],
 ['eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprlliygastratgiparfsgsgsgteftltisslqsedfavyyCQQYNNWPPYTFgqgtkleik_'],
 ['diqmtqspssvsasvgdrvtitcrasqgisswlawyqqkpgkapklliyaasslqsgvpsrfsgsgsgtdftltisslqpedfatyyCQQANSFPRTFgpgtkvdik_'],
 ['eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprlliygastratgiparfsgsgsgteftltisslqsedfavyyCQQYNNWPRTFgqgtkveik_'],
 ['eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprlliygastratgiparfsgsgsgteftltisslqsedfavyyCQQYNNWPPWTFgqgtkveik_'],
 ['eivmtqspatlsvspgeratlscrasqsvssnlawyqqkpgqaprlliygastratgiparfsgsgsgteftltisslqsedfavyyCQQYNNWPRTFgqgtkveik_'],
 ['dvvmtqsplslpvtlgqpasiscrssqslvysdgntylnwfqqrpgqsprrliykvsnrdsgvpdrfsgsgsgtdftlkisrveaedvgvyyCMQGTHWPRTFgqgtkveik_'],
 ['eivltqspatlslspgeratlscrasqsvssylawyqqkpgqaprlliydasnratgiparfsgsgsgtdftltisslepedfavyyCQQRANWPPITFgqgtrleik_'],
 ['divmtqtplslsvtpgqpasisckssqsllhsdgktylywylqkpgqspqlliyevssrfsgvp

In [82]:
extract = ["", ""]
("?,"*len(extract)+1).rstrip(",")

TypeError: can only concatenate str (not "int") to str

In [60]:
test = np.zeros(len(seqs))
new = np.column_stack((seqs, test))

In [23]:
import sys
test = ['a']
try:
    test.index('a')
except ValueError:
    sys.stdout.write(f"\nNo ESM-2 embeddings will be generated.")

In [39]:
from datetime import datetime

kength = 10
for i in range(kength):

    # Get the current date and time
    current_date_time = datetime.now()

    # Format the date using f-string
    formatted_date = f"seq_pred_{current_date_time:%b%d}_job#{i+1:0{4}}.fasta".lower()

    print(formatted_date)

seq_pred_jul26_job#0001.fasta
seq_pred_jul26_job#0002.fasta
seq_pred_jul26_job#0003.fasta
seq_pred_jul26_job#0004.fasta
seq_pred_jul26_job#0005.fasta
seq_pred_jul26_job#0006.fasta
seq_pred_jul26_job#0007.fasta
seq_pred_jul26_job#0008.fasta
seq_pred_jul26_job#0009.fasta
seq_pred_jul26_job#0010.fasta


In [43]:
with open("test.txt", 'w') as file:
    print(f"test12", file=file)
    print(f"test12", file=file)

In [98]:
import numpy as np
test = np.zeros(5)
test[5] = 5

IndexError: index 5 is out of bounds for axis 0 with size 5