In [8]:
import os
from pathlib import Path
import sys
# Put the parent of the current working directory on sys.path; the `src.*`
# imports further down in this cell presumably rely on it being importable.
curdir = Path.cwd()
parent_dir = curdir.parent.absolute()
sys.path.append(str(parent_dir))

import pandas as pd
import matplotlib.pyplot as plt
from src.utils.data import read_fasta, multihot_to_ints
from src.data.datasets import ProteinDataset

In [2]:
# Load the three FASTA splits (train / dev / test) with the project's read_fasta
# helper. The files are read in the same order as the names on the left.
train, val, test = (
    read_fasta(fasta_path)
    for fasta_path in (
        '../data/swissprot/proteinfer_splits/random/train_GO.fasta',
        '../data/swissprot/proteinfer_splits/random/dev_GO.fasta',
        '../data/swissprot/proteinfer_splits/random/test_GO.fasta',
    )
)

In [3]:
def _to_records(entries):
    """Reshape (sequence, header) pairs into (id, sequence, labels) tuples.

    Each entry is unpacked as a pair whose second element is indexable: its
    first item becomes the id and the remaining items are joined with single
    spaces into one label string.
    NOTE(review): assumes read_fasta yields (sequence, [id, *labels]) pairs --
    confirm against src.utils.data.read_fasta.
    """
    return [
        (header[0], sequence, " ".join(header[1:]))
        for sequence, header in entries
    ]


# NOTE: this rebinds train/val/test in place, so the cell expects the raw
# read_fasta output and is not meant to be re-run on its own result.
train = _to_records(train)
val = _to_records(val)
test = _to_records(test)

# Plain concatenated list of every record across the three splits.
df = train + val + test

In [4]:
# Tabulate the combined records: one row per record, columns in tuple order.
df = pd.DataFrame(data=df, columns=['id', 'sequence', 'labels'])

In [5]:
# One row per record, so the row count is the number of sequences.
num_sequences = len(df.index)
print('number of sequences:', num_sequences)

number of sequences: 522607


In [6]:
from collections import Counter

# Corpus-wide tallies, rebuilt from scratch each time the cell runs.
labels = Counter()      # label token -> number of occurrences across all rows
vocab = set()           # distinct characters seen in any sequence
amino_freq = Counter()  # character -> total occurrences across all sequences

# Walk the three columns in lockstep rather than via df.iterrows(): iterrows
# materialises a Series per row, which is needlessly slow for a frame this size.
for seq_id, sequence, row_labels in zip(df['id'], df['sequence'], df['labels']):
    # split() with no argument returns [] for an empty or blank string, whereas
    # split(" ") returns [''] and would register the empty string as a label,
    # inflating the distinct-label count.
    row_terms = row_labels.split()
    if not row_terms:
        # Surface records that carry no labels at all.
        print(seq_id, row_labels)
    # Iterating a str yields its characters, so no intermediate list is needed.
    vocab.update(sequence)
    amino_freq.update(sequence)
    labels.update(row_terms)

In [7]:
# Number of distinct label tokens collected in the `labels` counter.
print('# GO Terms:', len(labels))

# GO Terms: 32102


In [8]:
print('GO Terms distribution')
# Summary statistics over how many times each distinct label occurs.
label_counts = pd.Series(labels.values())
label_counts.describe()

GO Terms distribution


count     32102.000000
mean        777.250545
std        9114.786603
min           1.000000
25%           4.000000
50%          17.000000
75%          84.000000
max      462356.000000
dtype: float64

In [9]:
print('Sequence length distribution')

# Per-row length of the 'sequence' column, then its summary statistics.
sequence_lengths = df['sequence'].map(len)
sequence_lengths.describe()

Sequence length distribution


count    522607.000000
mean        368.042215
std         334.721845
min           2.000000
25%         179.000000
50%         303.000000
75%         456.000000
max       35213.000000
Name: sequence, dtype: float64

In [2]:
# Build the project's ProteinDataset over the training split, pointing it at
# the amino-acid and GO-label vocabulary files.
PD = ProteinDataset(
    data_path='../data/swissprot/proteinfer_splits/random/train_GO.fasta',
    sequence_vocabulary_path='../data/vocabularies/amino_acid_vocab.json',
    label_vocabulary_path='../data/vocabularies/GO_label_vocab.json',
)

In [3]:
# Dataset size alongside the value reported by get_max_seq_len().
(len(PD), PD.get_max_seq_len())

(418015, 35213)

In [4]:
# Spot-check the vocabulary: show the entry stored under 'M' in PD.aminoacid2int
# (presumably the integer index assigned to that residue -- confirm in ProteinDataset).
PD.aminoacid2int['M']

10

In [20]:
idx = 10

# Round-trip check on labels: map the indices returned by multihot_to_ints for
# item `idx` back through PD.int2label and compare with the raw record, skipping
# its first label-field entry. Compared as sets, so order and duplicates are ignored.
decoded_labels = {PD.int2label[i] for i in multihot_to_ints(PD[idx][1].tolist())}
decoded_labels == set(PD.data[idx][1][1:])

# Round-trip check on the sequence: map the encoded integers back through
# PD.int2aminoacid and compare with the raw sequence. This is also a set
# comparison, so it only verifies that the same symbols appear, not their order.
decoded_residues = {PD.int2aminoacid[i] for i in PD[idx][0].tolist()}
decoded_residues == set(PD.data[idx][0])

True

True