# Open ESKAPE Data

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split

import os
from pathlib import Path

eskape = '/home/ec2-user/repos/project-delta/data/ESKAPE'

# Placeholder annotations that carry no functional information. A record is
# dropped when its FASTA header is exactly "<record id> <placeholder>".
uninformative_annotations = (
    "hypothetical protein",
    "unannotated protein",
    "unknown function",
)

records_data = []

# Sort the paths: glob order depends on the filesystem, and a stable row
# order keeps everything derived from the DataFrame reproducible.
for ffn_path in sorted(Path(eskape).glob("*.ffn")):
    for record in SeqIO.parse(ffn_path, "fasta"):

        # Skip hypothetical / unannotated proteins
        if any(record.description == f"{record.id} {annotation}"
               for annotation in uninformative_annotations):
            continue

        # Skip if not a multiple of 3 (not a whole number of codons)
        if len(record.seq) % 3 != 0:
            continue

        # Translate nucleotide to amino acid; to_stop=True ends the protein
        # at the first in-frame stop codon.
        try:
            aa_seq = record.seq.translate(to_stop=True)
        except Exception as e:
            # Best effort: report the record and move on to the next one.
            print(f"Translation error in {record.id}: {e}")
            continue

        records_data.append({
            "Sequence ID": record.id,
            # Header text after the first space, i.e. the description
            # without the leading record ID.
            "Label": record.description.partition(" ")[2],
            "Sequence": str(aa_seq),
        })

# Convert to DataFrame. Naming the columns explicitly keeps the schema intact
# even when no record survives the filters (otherwise the frame would have no
# "Label" column at all).
df = pd.DataFrame(records_data, columns=["Sequence ID", "Label", "Sequence"])

In [None]:
# explore most abundant protein functional annotations
# pyplot is needed for plt.show() below; pandas' own plotting already
# requires matplotlib to be installed.
import matplotlib.pyplot as plt

counts = df.Label.value_counts()
# keep only annotations that make up more than 0.1% of all records
counts = counts[counts > 0.001 * counts.sum()]

# bar plot of the 30 most frequent annotations
# (value_counts returns them sorted by descending frequency)
ax = counts.head(30).plot(kind='bar')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_title('Value Counts')
plt.show()

# list for prompt: print every retained annotation wrapped in double quotes
for label in counts.index:
    print(f'"{label}"')