In [1]:
import os
from pathlib import Path
import sys
curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

import pandas as pd
from src.utils.data import read_fasta
from src.data.datasets import ProteinDataset

In [21]:
train = read_fasta('../data/swissprot/proteinfer_splits/random/train_GO.fasta')
val = read_fasta('../data/swissprot/proteinfer_splits/random/dev_GO.fasta')
test = read_fasta('../data/swissprot/proteinfer_splits/random/test_GO.fasta')

In [22]:
train = [(k[0],j," ".join(k[1:])) for j,k in train]
test = [(k[0],j," ".join(k[1:])) for j,k in test]
val = [(k[0],j," ".join(k[1:])) for j,k in val]

df = train + val + test

In [23]:
df = pd.DataFrame(df,columns=['id','sequence','labels'])

In [24]:
num_sequences = len(df)
print('number of sequences:',num_sequences)

number of sequences: 548264


In [25]:
from collections import Counter
labels = Counter()
df['labels'].apply(lambda x: labels.update(x.split(" ")))

0         None
1         None
2         None
3         None
4         None
          ... 
548259    None
548260    None
548261    None
548262    None
548263    None
Name: labels, Length: 548264, dtype: object

In [26]:
vocab = set()
amino_freq = Counter()
for i in df['sequence']:
    aa_list = list(i)
    vocab.update(aa_list)
    amino_freq.update(aa_list)

In [30]:
amino_freq

Counter({'L': 19162898,
         'A': 16400419,
         'G': 14030164,
         'V': 13626228,
         'E': 13372871,
         'S': 13132144,
         'I': 11758825,
         'K': 11535558,
         'R': 10988445,
         'D': 10842858,
         'T': 10606287,
         'P': 9376845,
         'N': 8043462,
         'Q': 7803265,
         'F': 7659601,
         'Y': 5779957,
         'M': 4793137,
         'H': 4509581,
         'C': 2723764,
         'W': 2170949})

In [35]:
print('# GO Terms:',len(labels.keys()))

# GO Terms: 32103


In [36]:
print('GO Terms distribution')
pd.Series(labels.values()).describe()

GO Terms distribution


count     32103.000000
mean        778.025543
std        9115.702309
min           1.000000
25%           4.000000
50%          17.000000
75%          84.000000
max      462356.000000
dtype: float64

In [2]:
PD = ProteinDataset('../data/swissprot/proteinfer_splits/random/train_GO.fasta')

In [4]:
PD.get_max_seq_len()

35213

In [5]:
len(PD)

438522

In [10]:
next(iter(PD))

([10,
  15,
  8,
  7,
  7,
  3,
  19,
  2,
  3,
  16,
  0,
  14,
  14,
  0,
  7,
  3,
  0,
  5,
  17,
  11,
  16,
  9,
  0,
  2,
  0,
  17,
  14,
  17,
  16,
  9,
  5,
  12,
  14,
  5,
  14,
  6,
  17,
  17,
  9,
  0,
  8,
  0,
  4,
  5,
  5,
  12,
  0,
  17,
  16,
  11,
  2,
  5,
  17,
  16,
  17,
  0,
  14,
  3,
  7,
  2,
  9,
  3,
  2,
  12,
  4,
  3,
  11,
  9,
  5,
  0,
  13,
  9,
  17,
  8,
  15,
  17,
  0,
  16,
  8,
  16,
  11,
  2,
  17,
  0,
  5,
  2,
  5,
  16,
  16,
  16,
  0,
  16,
  17,
  9,
  0,
  13,
  0,
  9,
  17,
  8,
  5,
  5,
  9,
  14,
  9,
  17,
  0,
  0,
  5,
  0,
  11,
  12,
  7,
  3,
  9,
  5,
  0,
  5,
  7,
  15,
  8,
  0,
  0,
  2,
  0,
  17,
  15,
  3,
  0,
  9,
  9,
  0,
  15,
  0,
  16,
  12,
  17,
  15,
  5,
  8,
  2,
  0,
  7,
  0,
  13,
  17,
  0,
  16,
  17,
  15,
  15,
  14,
  2,
  13,
  17,
  9,
  5,
  3,
  9,
  17,
  5,
  3,
  0,
  10,
  16,
  8,
  17,
  5,
  17,
  2,
  5,
  17,
  17,
  15,
  17,
  3,
  3,
  15,
  15,
  16,
  9,
  11,
  16,
  3,
  