# Notes

This notebook uses a simplified version of the data from the paper "Embeddings from protein language models predict conservation and variant effects" by Marquet et al.

**Data explanation**:
- `seq_and_conservation.txt`: FASTA file with 3 lines per entry: amino acid sequence, continuous conservation scores (ignored) and conservation classes between 1 and 9 (1 = very variable, 9 = very conserved)
- `train_ids.txt`: IDs of the proteins for training (9392 proteins)
- `val_ids.txt`: IDs of the proteins for validation (555 proteins)
- `test_ids.txt`: IDs of the proteins for testing (519 proteins)

**Proposed splits**:
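- `sampled`: Random split of sequences into `train`/`test` with 95/5% probability. The notebook does not re-sample: set membership is read from the provided ID files. Proteins listed in `train_ids.txt` and `val_ids.txt` are assigned to `train` (validation proteins are additionally flagged with `VALIDATION=True`), and proteins listed in `test_ids.txt` are assigned to `test`.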

This is a well-known dataset used to validate the behavior of code and models. Only a `sampled` split is provided for this purpose.

# Configs & Imports

In [1]:
import json
from pathlib import Path

from Bio import SeqIO
from pandas import DataFrame, concat, read_json

%load_ext autoreload
%autoreload 2

In [2]:
# Input locations: the raw conservation data and the ID lists that define the split
data_path = Path('..', 'data', 'conservation')

sequences = data_path.joinpath('seq_and_conservation.txt')
train = data_path.joinpath('train_ids.txt')
validation = data_path.joinpath('val_ids.txt')
test = data_path.joinpath('test_ids.txt')

# Output location: processed split files are deposited below this directory
split_path = Path('..', 'splits', 'conservation')

# Obtain original dataset

In [3]:
# Read every record of the raw file. Each protein spans three consecutive
# records: the amino acid sequence (whose header carries the protein ID), the
# continuous conservation scores (ignored) and the conservation classes.
seqs_and_cons = list(SeqIO.parse(sequences, "fasta"))

# Fail early with a clear message instead of an IndexError in the middle of parsing.
if len(seqs_and_cons) % 3 != 0:
    raise ValueError(
        "Expected 3 records per protein in {}, but found {} records".format(
            sequences, len(seqs_and_cons)
        )
    )

# Collect all rows first and build the DataFrame once: `DataFrame.append` was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
protein_rows = [
    {
        # The first character of the header is dropped and spaces become dashes.
        # NOTE(review): presumably the header starts with a marker character
        # that is not part of the ID - confirm against the raw file.
        "id": sequence_record.description[1:].replace(' ', '-'),
        "sequence": str(sequence_record.seq),
        "conservations": str(classes_record.seq),
    }
    # Records 0, 3, 6, ... hold the sequences; records 2, 5, 8, ... hold the classes.
    for sequence_record, classes_record in zip(seqs_and_cons[0::3], seqs_and_cons[2::3])
]

dataset = DataFrame(protein_rows, columns=["id", "sequence", "conservations"])

In [4]:
# Show the parsed dataset: one row per protein with its id, sequence and conservations.
dataset

Unnamed: 0,id,sequence,conservations
0,3p6z-C,AHHHHHHVGTWENLYFQSIPDDDEDSYEIFEPPESTVMATRKMHDR...,"6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,7,7,3,4,7,3,..."
1,6fb3-A,TGSLVSLIRGQVVTTDGTPLVGVNVSFVKYPKYGYTITRQDGMFDL...,"6,4,9,8,6,7,9,6,9,9,7,9,3,8,1,7,7,6,8,8,6,9,8,..."
2,2oqg-A,QGMTVGTYAELASVFAALSDETRWEILTELGRADQSASSLATRLPV...,"1,6,6,7,5,5,4,1,6,3,4,6,2,6,8,5,9,8,7,9,5,8,9,..."
3,3ix7-A,SNAPRGGKVLDTSVLVDGRVAEVAAVGFLEGPLWVPHFVLKELQHF...,"6,4,1,1,1,1,1,8,5,8,9,9,9,7,8,9,9,9,9,8,3,7,6,..."
4,5y9q-A,GTISIGCSSLIGQTLLPEVLSLYNAQFPNVEIQVQVGSTEQIKANH...,"9,8,9,7,7,8,7,9,8,7,6,8,7,8,5,9,9,1,6,9,6,1,7,..."
...,...,...,...
10502,1ws8-A,MATVHKVGDSTGWTTLVPYDYAKWASSNKFHVGDSLLFNYNNKFHN...,"8,9,6,3,6,4,9,9,6,2,1,7,9,3,2,4,3,2,3,5,7,3,3,..."
10503,4qa8-A,GAMDPEFGKKPTTASSPSPGSPSPEAQQILQDSSKATKGLHSVHVV...,"7,6,4,4,6,4,1,5,1,1,1,1,3,3,3,3,1,1,2,1,1,1,6,..."
10504,1v2z-A,STAFFFRRMSPADKRKLLDELRSIYRTIVLEYFNTDAKVNERIDEF...,"6,7,1,1,6,2,6,3,7,1,1,1,4,3,1,1,1,1,1,1,7,4,1,..."
10505,3h7h-B,GSHMDPNLWTVKCKIGEERATAISLMRKFIAYQFTDTPLQIKSVVA...,"6,9,1,3,9,8,6,8,8,5,7,7,8,5,2,9,6,9,6,2,6,6,2,..."


# Splits

In [5]:
# Write all sequences to a single sequences.fasta file
# (one record per protein: header line with the id, followed by the sequence).
sequences_fasta = split_path / 'splits' / 'sequences.fasta'

# Create the output directory if needed so the cell also runs on a fresh checkout;
# open(..., 'w') raises FileNotFoundError when the parent directory is missing.
sequences_fasta.parent.mkdir(parents=True, exist_ok=True)

with open(sequences_fasta, 'w') as sequences_file:
    for protein_id, protein_sequence in zip(dataset['id'], dataset['sequence']):
        sequences_file.write('>{}\n'.format(protein_id))
        sequences_file.write('{}\n'.format(protein_sequence))

## sampled

In [6]:
# Load the protein IDs of each subset (train / validation / test).
# Every line of an ID file is one entry; trailing whitespace, including the
# newline, is stripped.
with open(train) as train_file:
    train_ids = list(map(str.rstrip, train_file))

with open(validation) as validation_file:
    validation_ids = list(map(str.rstrip, validation_file))

with open(test) as test_file:
    test_ids = list(map(str.rstrip, test_file))

In [7]:
# Let's create the split dataset
sampled = DataFrame(columns = ["id", "conservations", "set", "validation"])

# Add train samples
tmp = dataset[dataset["id"].isin(train_ids)][["id", "conservations"]]
tmp.insert(2, "set", ["train"]*len(tmp))
sampled = sampled.append(tmp)

# Add validation samples
tmp = dataset[dataset["id"].isin(validation_ids)][["id", "conservations"]]
tmp.insert(2, "set", ["train"]*len(tmp))
tmp.insert(2, "validation", [True]*len(tmp))
sampled = sampled.append(tmp)

# Add test samples
tmp = dataset[dataset["id"].isin(test_ids)][["id", "conservations"]]
tmp.insert(2, "set", ["test"]*len(tmp))
sampled = sampled.append(tmp)

# Target to str
sampled["conservations"] = sampled["conservations"].apply(lambda x: ''.join(map(str, x)))
sampled["conservations"] = sampled["conservations"].apply(lambda x: x.replace(',', ''))

sampled = sampled.reset_index(drop = True)

In [8]:
# Show the split table: id, conservations, set and validation for every protein.
sampled

Unnamed: 0,id,conservations,set,validation
0,3p6z-C,6666666666666666577347367897785511671549996336...,train,
1,6fb3-A,6498679699793817768869898782213149394992896997...,train,
2,2oqg-A,1667554163462685987958983762584125368548813516...,train,
3,3ix7-A,6411111858999789999837631668458154782998199928...,train,
4,5y9q-A,9897787987687859916961751179286768397595572334...,train,
...,...,...,...,...
10461,1vyb-A,6552654567486978867643884563436544457746699897...,test,
10462,4co6-A,8647745748566562341194165155674439135615526992...,test,
10463,3biy-A,6317638516287116919881171251111531926988675591...,test,
10464,1sdi-A,6886125876999767987179666953932411331455196525...,test,


In [9]:
# Write the split as FASTA: the header carries the protein id plus its
# SET / VALIDATION annotations, the body holds the conservation classes.
sampled_fasta = split_path / 'splits' / 'sampled.fasta'

# Create the output directory if needed so the cell also runs on a fresh checkout.
sampled_fasta.parent.mkdir(parents=True, exist_ok=True)

with open(sampled_fasta, 'w') as sampled_file:
    for protein_id, conservation_classes, set_name, validation_value in zip(
        sampled['id'], sampled['conservations'], sampled['set'], sampled['validation']
    ):
        # The flag gets its own name: rebinding `validation` here would clobber
        # the path to val_ids.txt and break a re-run of the ID-loading cell.
        # `== True` is deliberate: a missing value (NaN) must be written as
        # 'False', whereas a plain truthiness test would treat NaN as True.
        validation_flag = 'True' if validation_value == True else 'False'
        sampled_file.write('>{}\n'.format('{} SET={} VALIDATION={}'.format(protein_id, set_name, validation_flag)))
        sampled_file.write('{}\n'.format(conservation_classes))