## Explore INPHARED

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO

from load_data_inphared import load_dataset, associate_label_from_metadata

In [None]:
# Load the dataset through the loader imported from load_data_inphared.
full_dataset = load_dataset()
# NOTE: this is not the last expression of the cell, so in a notebook the
# preview is computed but not displayed.
full_dataset.head()
# Associate a "Host" label taken from the metadata, with digitizing disabled.
associate_label_from_metadata(
    full_dataset,
    choice="Host",
    label_name="Host",
    digitize=False,
)

In [None]:
# Same call as in the previous cell, but keep the returned frame this time.
host_df = associate_label_from_metadata(
    full_dataset, choice="Host", label_name="Host", digitize=False
)
# First 15 entries of the host frequency table (value_counts sorts by
# descending count by default, so these are the most frequent hosts).
host_df["Host"].value_counts().head(15)

In [None]:
# Directory containing the local copy of the INPHARED files.
inphared = "/home/ec2-user/repos/project-delta/data/INPHARED"
# The metadata file is tab-separated; read_table uses '\t' as its default
# separator.
metadata = pd.read_table(inphared + '/14Apr2025_data_excluding_refseq.tsv')
# Show the available metadata columns.
metadata.columns

In [None]:
# Number of distinct values found in the "Classification" column.
np.unique(metadata["Classification"].values).size

In [None]:
def Simpson(df):
    """Return the Simpson concentration index of the values in *df*.

    The index is ``sum(p_i ** 2)``, where ``p_i`` is the relative frequency
    of the i-th distinct value. It equals 1.0 when every value is identical
    and approaches 0 as the values spread over more categories.

    Parameters
    ----------
    df : pandas.Series
        Values to summarise (any object exposing ``value_counts`` works).

    Returns
    -------
    float
        The Simpson index computed over the non-missing values, or ``nan``
        when there is no non-missing value at all.
    """
    counts = df.value_counts()
    # value_counts() drops missing values, so normalise by the number of
    # values that were actually counted. Dividing by len(df) would make the
    # frequencies sum to less than 1 and bias the index downwards whenever
    # the input contains NaN/None.
    total = int(counts.sum())
    if total == 0:
        # Empty (or all-missing) input: the index is undefined.
        return np.nan
    return np.sum(counts ** 2) / (total ** 2)
# Per-column diversity summary; one row is collected per metadata column.
rows = []

# NOTE(review): the columns are selected by position (14 to 18 inclusive);
# confirm against metadata.columns that these are the intended label columns.
col_names = metadata.columns[14:19]

for c in col_names:
    column = metadata[c]
    # nunique(dropna=False) gives the same count as len(np.unique(...)) on
    # clean data, but does not need to sort the values, so it does not raise
    # TypeError on an object column that mixes strings with missing values.
    n_all = column.nunique(dropna=False)

    # Drop the "Unclassified" placeholder before measuring diversity.
    classified = column[column != "Unclassified"]
    n_classified = classified.nunique(dropna=False)

    rows.append(
        [c, n_all, n_classified, len(classified), Simpson(classified)]
    )

# Build the DataFrame once at the end instead of growing it row by row.
explore = pd.DataFrame(rows, columns=["Label", "N", "Ntrue", "size", "Simpson"])
explore

In [None]:
# Read every record of the genome FASTA as an (id, sequence string) pair.
fasta_records = [
    (record.id, str(record.seq))
    for record in SeqIO.parse(inphared+'/14Apr2025_genomes_excluding_refseq.fa', "fasta")
]
sequence_ids = [seq_id for seq_id, _ in fasta_records]
sequences = [seq for _, seq in fasta_records]

# One row per FASTA record.
genomes = pd.DataFrame({"Sequence ID": sequence_ids, "Sequence": sequences})

In [None]:
import matplotlib.pyplot as plt

# Histogram of the sequence lengths (characters per sequence string).
plt.figure()
_ = plt.hist(genomes["Sequence"].map(len))

In [None]:
# Run the label helper on the genomes frame with its default arguments.
# The result is not assigned, so in a notebook it is only shown as the
# cell output.
associate_label_from_metadata(genomes)

In [None]:
# NOTE: this rebinds `load_dataset`, replacing the function of the same name
# imported from `load_data_inphared` earlier in this file.
from get_data import load_dataset

df = load_dataset() # proteins by default

In [None]:
import sys
# Extend the import search path before importing from `ttd` below.
# NOTE(review): hard-coded, machine-specific path; presumably the project
# checkout that contains the `ttd` package. Confirm for your environment.
sys.path.append('/home/ec2-user/SageMaker/project-delta')

from ttd.datasets import load_dataset_eskape

# Rebinds `df` to the frame returned by the ESKAPE loader.
df = load_dataset_eskape()

In [None]:
# Sorted distinct values of the "Host" column of the current `df`.
np.unique(df["Host"])

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split

import os
from pathlib import Path

# Directory scanned for ESKAPE gene FASTA files (*.ffn).
eskape = '/home/ec2-user/repos/project-delta/data/ESKAPE'

# One dict per retained record: id, full FASTA description, translated sequence.
records_data = []

for ffn_path in Path(eskape).glob("*.ffn"):
    for record in SeqIO.parse(ffn_path, "fasta"):

        # Descriptions that carry no functional information for this record.
        bad_annotations = [f"{record.id} hypothetical protein",
                           f"{record.id} unannotated protein",
                           f"{record.id} unknown function"]

        # Skip records without a usable annotation.
        if record.description in bad_annotations:
            continue

        # Skip sequences whose length is not a whole number of codons.
        if len(record.seq) % 3 != 0:
            continue

        # Translate nucleotides to amino acids, stopping at the first stop
        # codon (to_stop=True).
        try:
            aa_seq = record.seq.translate(to_stop=True)
        except Exception as e:
            # Best effort: report the failing record and keep going.
            print(f"Translation error in {record.id}: {e}")
            continue

        records_data.append({
            "Sequence ID": record.id,
            "Label": record.description,
            "Sequence": str(aa_seq),
        })

# Convert to DataFrame. The explicit column list keeps the frame well-formed
# when no record was collected (missing directory, no *.ffn file, everything
# filtered out); without it the empty frame has no "Label" column and the
# next statement raises AttributeError.
df = pd.DataFrame(records_data, columns=["Sequence ID", "Label", "Sequence"])
# Drop the leading token of the description (up to the first space), keeping
# only the annotation text that follows it.
df["Label"] = df["Label"].apply(lambda x: x.partition(" ")[2])

In [None]:
# Explore the most abundant protein functional annotations: keep only the
# labels that account for more than 0.1% of all records and print each one
# wrapped in double quotes.
counts = df["Label"].value_counts()
counts = counts[counts > 0.001 * counts.sum()]
for label in counts.index:
    print(f"\"{label}\"")

# Bar plot of the first 30 labels (currently disabled):
# counts[:30].plot(kind='bar')
# plt.xlabel('Category')
# plt.ylabel('Count')
# plt.title('Value Counts')
# plt.show()
pass