#### 1. Install ESM (Evolutionary Scale Modeling) – Transformer protein language models

In [1]:
!pip install fair-esm



#### 2. Embedding Proteins – default

In [3]:
!python scripts/extract.py esm1b_t33_650M_UR50S examples/data/some_proteins.fasta examples/data/some_proteins_emb_esm1b/ --repr_layers 0 32 33 --include mean per_tok

Read examples/data/some_proteins.fasta with 15 sequences
Processing 1 of 3 batches (9 sequences)
Processing 2 of 3 batches (4 sequences)
Processing 3 of 3 batches (2 sequences)


In [5]:
!wget https://scop.berkeley.edu/downloads/scopeseq-2.08/astral-scopedom-seqres-sel-gs-bib-95-2.08.fa

--2022-08-13 09:05:59--  https://scop.berkeley.edu/downloads/scopeseq-2.08/astral-scopedom-seqres-sel-gs-bib-95-2.08.fa
Resolving scop.berkeley.edu (scop.berkeley.edu)... 128.32.236.13
Connecting to scop.berkeley.edu (scop.berkeley.edu)|128.32.236.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9862103 (9.4M)
Saving to: ‘astral-scopedom-seqres-sel-gs-bib-95-2.08.fa’


2022-08-13 09:06:12 (1.43 MB/s) - ‘astral-scopedom-seqres-sel-gs-bib-95-2.08.fa’ saved [9862103/9862103]



In [21]:
!python scripts/extract.py esm1b_t33_650M_UR50S examples/data/astral-seg.fa examples/data/astral_esm1b/ --repr_layers 0 32 33 --include mean per_tok

Read examples/data/astral-seg.fa with 4 sequences
Processing 1 of 1 batches (4 sequences)


In [None]:
# >d1ux8a_a
# napyeaigeellsqlvdtfyervashpllkpifpsdltetarkqkqfltqylggpplyteehghpmlrarhlpfpitneradawlscmkdamdhvglegeireflfgrleltarhmvnq
# >d2gkma_a
# gllsrlrkrepisiydkiggheaievvvedffvrvladdqlsaffsgtnmsrlkgkqveffaaalggpepytgapmkqvhqgrgitmhhfslvaghladaltaagvpsetiteilgviaplavdvts
# >d1ngka_a
# ksfydavggaktfdaivsrfyaqvaedevlrrvypeddlagaeerlrmfleqywggprtyseqrghprlrmrhapfrislierdawlrcmhtavasidsetlddehrrelldylemaahslvnspf
# >d1dlya_a
# slfaklggreaveaavdkfynkivadptvstyfsntdmkvqrskqfaflayalggasewkgkdmrtahkdlvphlsdvhfqavarhlsdtltelgvppeditdamavvastrtevlnmpqq

In [18]:
v = 'slfaklggreaveaavdkfynkivadptvstyfsntdmkvqrskqfaflayalggasewkgkdmrtahkdlvphlsdvhfqavarhlsdtltelgvppeditdamavvastrtevlnmpqq'
v.upper()

'SLFAKLGGREAVEAAVDKFYNKIVADPTVSTYFSNTDMKVQRSKQFAFLAYALGGASEWKGKDMRTAHKDLVPHLSDVHFQAVARHLSDTLTELGVPPEDITDAMAVVASTRTEVLNMPQQ'

In [1]:
# ANSI escape sequences wrapped around the error message raised by fetchX.
CRED = '\033[91m'  # switch the foreground colour to bright red
CEND = '\033[0m'  # reset all text attributes

def readMultipleFASTA(fileName):
    '''
    Collect the sequences of a multi-record FASTA file.

    Every line starting with '>' opens a new record. The stripped lines that
    follow it are concatenated and upper-cased to form that record's sequence.
    Lines that appear before the first '>' line are ignored.

    :param fileName: path to the FASTA text file (anyName.txt, anyName.fa or anyName.fasta)
    :return: list with one upper-cased sequence string per header, in file order
    '''

    sequences = []
    pieces = None  # stays None until the first header line has been seen
    with open(fileName, 'r') as handle:
        for rawLine in handle:
            if rawLine.startswith('>'):
                # A header closes the record that was being collected, if any.
                if pieces is not None:
                    sequences.append(''.join(pieces).upper())
                pieces = []
            elif pieces is not None:
                pieces.append(rawLine.strip())
    # Flush the record that was still open when the file ended.
    if pieces is not None:
        sequences.append(''.join(pieces).upper())
    return sequences
#end-def

def check(x, elements):
    '''
    Tell whether a sequence is built only from allowed characters.

    :param x: a single sequence
    :param elements: set of characters allowed for DNA/RNA/PROT
    :return: True when "elements" is a superset of the characters in x, otherwise False
    '''

    return bool(elements >= set(x))
#end-def

def standardElements(seqType):
    '''
    Look up the accepted alphabet for a sequence type.

    :param seqType: 'DNA', 'RNA' or 'PROT'
    :return: a new set of accepted characters, or None for any other seqType
    '''

    if seqType == 'DNA':
        return set('ACGT')
    #end-if
    if seqType == 'RNA':
        return set('ACGU')
    #end-if
    if seqType == 'PROT':
        # The letters ACDEFGHIKLMNPQRSTVWY plus the extra codes B, O, J, U, X and Z.
        return set('ACDEFGHIKLMNPQRSTVWYBOJUXZ')
    #end-if
    return None
#end-def

def ensureBadElements(X, seqType):
    '''
    Verify that no sequence contains a character outside the alphabet of seqType.

    :param X: iterable of sequences
    :param seqType: DNA/RNA/PROT, passed to standardElements to obtain the alphabet
    :return: True when every sequence passes check (also when X is empty), otherwise False
    '''

    allowed = standardElements(seqType)
    # all() stops at the first failing sequence, like an early return from a loop.
    return all(check(sequence, allowed) for sequence in X)
#end-def


def fetchX(fileName, seqType):
    '''
    Read all sequences from a FASTA file and reject the file if any sequence
    uses a character outside the alphabet of the requested sequence type.

    :param fileName: anyName.txt or, anyName.fa or, anyName.fasta
    :param seqType: DNA, RNA, PROT
    :return: (Without bad elements) DNA/RNA/Protein/Peptide sequences, as
             returned by readMultipleFASTA
    :raises ValueError: if seqType is not one of DNA/RNA/PROT, or if any
             sequence contains a character that is not accepted for seqType
    '''

    allowed = standardElements(seqType)
    if allowed is None:
        # standardElements returns None for an unknown type. Without this guard
        # the failure would surface later as an unrelated TypeError from the
        # set comparison (or not at all for a file without sequences).
        raise ValueError(CRED+'Unknown sequence type \'{}\'. Expected one of: DNA, RNA, PROT.'.format(seqType)+CEND)
    #end-if

    X = readMultipleFASTA(fileName)

    # Check the bad/evil elements ...
    if not ensureBadElements(X, seqType):
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError(CRED+'Please remove the bad elements from the given \'{}\' sequences. We only accept {} as characters.'.format(seqType, sorted(allowed))+CEND)
    #end-if

    return X
#end-def

In [2]:
# Input FASTA file used by the cells below.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
# Alternative inputs are kept commented out:
# dataSource = '/home/rafsanjani/backup-research/esm/examples/data/some_proteins.fasta'
# dataSource = '/home/rafsanjani/backup-research/esm/examples/data/astral.fa'
# dataSource = '/home/rafsanjani/backup-research/esm/examples/data/P62593.fasta'
dataSource = '/home/rafsanjani/backup-research/esm/astral-common.fa'


# Load every sequence; fetchX raises if any of them contains a character
# outside the 'PROT' alphabet.
X = fetchX(dataSource, 'PROT')

In [3]:
X[0]

'NAPYEAIGEELLSQLVDTFYERVASHPLLKPIFPSDLTETARKQKQFLTQYLGGPPLYTEEHGHPMLRARHLPFPITNERADAWLSCMKDAMDHVGLEGEIREFLFGRLELTARHMVNQ'

In [4]:
import numpy as np
# Rebind X from the Python list returned by fetchX to a NumPy array of strings.
X = np.array(X)

In [5]:
X.shape

(35494,)

In [6]:
import esm

In [7]:
# Tally how many sequences share each classification prefix.
# For every header the second whitespace-separated token is taken, its last
# dot-separated component is dropped, and the remainder is the key
# (e.g. a token 'a.1.1.1' is counted under 'a.1.1').
# NOTE(review): assumes every header has at least two tokens — a header
# without a second token raises IndexError; confirm against the input file.
d = {}
for header, _seq in esm.data.read_fasta(dataSource):
    classification = header.split()[1]
    # rpartition keeps everything before the final '.', and yields '' when
    # the token contains no '.' at all.
    group = classification.rpartition('.')[0]
    d[group] = d.get(group, 0) + 1

In [123]:
# Preview only: print the first N_PREVIEW headers of the FASTA file.
# This cell deliberately does not assign to `d`, so the per-group counts that
# the following cells display are not wiped when the notebook is run top to bottom.
N_PREVIEW = 10
for position, (header, _seq) in enumerate(esm.data.read_fasta(dataSource), start=1):
    print(header)
    if position == N_PREVIEW:
        break

>0|beta-lactamase_P20P|1.581033423
>1|beta-lactamase_D207D|1.42563125
>2|beta-lactamase_A215A|1.422813331
>3|beta-lactamase_C75C|1.4155315119999998
>4|beta-lactamase_N134N|1.39696596
>5|beta-lactamase_L137L|1.355533136
>6|beta-lactamase_L28L|1.3516090040000002
>7|beta-lactamase_L199L|1.3516090040000002
>8|beta-lactamase_F149F|1.32191175
>9|beta-lactamase_A200A|1.295473865


In [8]:
d

{'a.1.1': 253,
 'a.1.2': 8,
 'a.2.1': 3,
 'a.2.2': 6,
 'a.2.3': 24,
 'a.2.5': 2,
 'a.2.6': 3,
 'a.2.7': 6,
 'a.2.8': 1,
 'a.2.9': 2,
 'a.2.10': 5,
 'a.2.11': 59,
 'a.2.12': 4,
 'a.2.13': 3,
 'a.2.14': 1,
 'a.2.15': 2,
 'a.2.16': 2,
 'a.2.17': 7,
 'a.2.18': 1,
 'a.2.19': 3,
 'a.2.20': 6,
 'a.2.21': 2,
 'a.3.1': 128,
 'a.4.1': 227,
 'a.4.2': 4,
 'a.4.3': 11,
 'a.4.5': 374,
 'a.4.6': 46,
 'a.4.7': 6,
 'a.4.8': 4,
 'a.4.9': 2,
 'a.4.10': 6,
 'a.4.11': 3,
 'a.4.12': 6,
 'a.4.13': 18,
 'a.4.14': 5,
 'a.4.15': 1,
 'a.5.1': 3,
 'a.5.2': 49,
 'a.5.3': 8,
 'a.5.4': 1,
 'a.5.6': 2,
 'a.5.7': 5,
 'a.5.8': 3,
 'a.5.9': 1,
 'a.5.10': 1,
 'a.6.1': 23,
 'a.7.1': 14,
 'a.7.2': 4,
 'a.7.3': 6,
 'a.7.4': 3,
 'a.7.5': 2,
 'a.7.6': 2,
 'a.7.7': 6,
 'a.7.8': 9,
 'a.7.10': 1,
 'a.7.11': 2,
 'a.7.12': 6,
 'a.7.13': 2,
 'a.7.14': 7,
 'a.7.15': 2,
 'a.7.16': 2,
 'a.7.17': 2,
 'a.8.1': 37,
 'a.8.2': 1,
 'a.8.3': 5,
 'a.8.4': 5,
 'a.8.5': 3,
 'a.8.6': 2,
 'a.8.7': 1,
 'a.8.8': 1,
 'a.8.9': 3,
 'a.8.10': 2,
 'a.8.

In [9]:
print(d.values())

dict_values([253, 8, 3, 6, 24, 2, 3, 6, 1, 2, 5, 59, 4, 3, 1, 2, 2, 7, 1, 3, 6, 2, 128, 227, 4, 11, 374, 46, 6, 4, 2, 6, 3, 6, 18, 5, 1, 3, 49, 8, 1, 2, 5, 3, 1, 1, 23, 14, 4, 6, 3, 2, 2, 6, 9, 1, 2, 6, 2, 7, 2, 2, 2, 37, 1, 5, 5, 3, 2, 1, 1, 3, 2, 3, 11, 5, 1, 11, 15, 2, 3, 9, 1, 17, 2, 1, 3, 7, 33, 98, 1, 2, 3, 1, 3, 2, 1, 1, 6, 29, 4, 4, 2, 2, 15, 13, 4, 6, 14, 4, 9, 7, 2, 1, 5, 1, 1, 2, 1, 1, 1, 3, 1, 2, 2, 4, 154, 15, 10, 2, 1, 1, 71, 20, 45, 10, 20, 69, 55, 5, 3, 6, 3, 1, 2, 1, 2, 4, 2, 1, 2, 1, 7, 6, 1, 1, 1, 1, 2, 1, 5, 4, 2, 3, 5, 1, 1, 75, 6, 2, 14, 1, 223, 20, 2, 1, 54, 1, 1, 5, 12, 29, 171, 1, 8, 1, 3, 11, 1, 5, 1, 1, 3, 2, 2, 4, 1, 1, 7, 2, 25, 18, 1, 22, 12, 2, 1, 5, 62, 19, 4, 10, 2, 8, 10, 11, 3, 1, 2, 7, 2, 3, 2, 2, 1, 13, 6, 2, 12, 1, 23, 6, 1, 28, 2, 2, 3, 1, 2, 3, 2, 1, 15, 30, 4, 5, 33, 2, 6, 14, 2, 20, 1, 2, 11, 13, 1, 13, 3, 28, 1, 42, 1, 3, 15, 10, 2, 10, 78, 6, 58, 5, 7, 28, 2, 1, 25, 159, 7, 1, 2, 6, 2, 6, 3, 17, 2, 88, 10, 1, 1, 6, 16, 87, 23, 2, 2, 1, 4, 1, 

In [12]:
print(sum(d.values()))

35494


In [10]:
len(d)

# dict1 = {'a':1,'b':2,'c':3}
# print(len(dict1))

2065

In [11]:
# Scratch check: joining the parts with '.' rebuilds a dotted key such as 'a.1.1'.
v = ['a', '1', '1']
v = '.'.join(v)  # v is rebound from the list to the joined string
print(v)


a.1.1
