In [10]:
import pandas as pd
from plotly import express as px
from Bio import SeqIO

In [31]:
# Parse the ORF FASTA file lazily; `fasta` is an iterator of records.
fasta = SeqIO.parse("../../seq/NC_005966.1_all_orf.fa", format="fasta")

# One row per record: its name under "contig" and its sequence object under "seq".
orf_rows = ({"contig": record.name, "seq": record.seq} for record in fasta)
df_orfs = pd.DataFrame.from_records(orf_rows)
df_orfs

Unnamed: 0,contig,seq
0,lcl|ORF1_NC_005966.1:898:993,"(M, T, F, T, C, W, R, V, K, K, P, V, W, S, N, ..."
1,lcl|ORF2_NC_005966.1:1264:1377,"(M, W, C, V, N, P, L, K, M, Y, W, R, F, V, R, ..."
2,lcl|ORF3_NC_005966.1:2281:2982,"(M, A, V, Q, D, V, R, F, Y, L, T, G, T, L, L, ..."
3,lcl|ORF4_NC_005966.1:2998:4074,"(M, Q, I, T, R, L, N, I, E, R, V, R, N, L, K, ..."
4,lcl|ORF5_NC_005966.1:6805:6897,"(M, L, A, L, S, S, L, I, I, F, G, K, L, D, F, ..."
...,...,...
326,lcl|ORF327_NC_005966.1:4569:4492,"(M, V, N, C, S, L, L, D, K, A, L, T, T, D, T, ..."
327,lcl|ORF328_NC_005966.1:4458:4357,"(M, V, R, I, T, S, A, A, E, T, P, S, S, G, W, ..."
328,lcl|ORF329_NC_005966.1:4356:4114,"(M, V, T, I, I, S, S, Q, C, P, A, K, A, S, S, ..."
329,lcl|ORF330_NC_005966.1:2943:2863,"(M, C, R, I, L, N, Q, N, R, L, I, G, L, S, H, ..."


In [32]:
# Expand each sequence into one row per residue; the original row index
# (and the "contig" value) is repeated for every residue of that ORF.
df_orfs_exploded = df_orfs.explode(column="seq")
df_orfs_exploded

Unnamed: 0,contig,seq
0,lcl|ORF1_NC_005966.1:898:993,M
0,lcl|ORF1_NC_005966.1:898:993,T
0,lcl|ORF1_NC_005966.1:898:993,F
0,lcl|ORF1_NC_005966.1:898:993,T
0,lcl|ORF1_NC_005966.1:898:993,C
...,...,...
330,lcl|ORF331_NC_005966.1:759:628,Q
330,lcl|ORF331_NC_005966.1:759:628,Q
330,lcl|ORF331_NC_005966.1:759:628,P
330,lcl|ORF331_NC_005966.1:759:628,S


In [33]:
# Tally residues across all ORFs: absolute counts first, then the same
# tally expressed as a fraction of all residues.
residues = df_orfs_exploded["seq"]
df_aa_counts = residues.value_counts()
df_aa_counts_normalized = residues.value_counts(normalize=True)

In [34]:
# Bar chart of each residue's share of all residues found in the ORFs.
#
# Build an explicit two-column frame with fixed column names so the plot does
# not depend on how pandas names the value_counts() result (the Series is only
# called "proportion" from pandas 2.0 onward; older versions reuse the source
# column name, which would make y='proportion' fail).
aa_proportions = (
    df_aa_counts_normalized
    .rename("proportion")
    .rename_axis("amino_acid")
    .reset_index()
)

px.bar(
    aa_proportions,
    x="amino_acid",
    y="proportion",
    # Human-readable axis titles so the figure stands on its own.
    labels=dict(
        amino_acid="Amino acid (one-letter code)",
        proportion="Share of residues",
    ),
).update_layout(
    yaxis=dict(tickformat=',.0%'),  # render fractions as whole percentages
    title='Amino Acids in Putative Genes (by open-reading-frame) of ADP1',
)

In [None]:
# It indeed appears that R (arginine) is ~5% of putatively expressed AAs, i.e. roughly 1/20 = 0.05.