In [1]:
from Bio import SeqIO

---

#### Check what the raw file looks like

In [14]:
%%bash

head data/S9_Viral_FA/Amalgaviridae.fa

>refseq|NC_035189|3387bp|Antonospora locustae virus 1, complete genome Amalgaviridae
ACAAATGGAAGCGAGTGTCTTCGTTAGCCGTCTCATGGCCATCGAGCCATCTGACTTCAG
CCAGCACTTTTCTTTACCAATGTCAATCTGCGACCATGGTCTTAAGATGATCAGAATAAG
CGCAGAGCAGACTCCGGCGTTCGTTAAAGCATTCACAACTCAGTATGCTTTAAGTAATGA
ATGTCCGCAAACAGAGTTTTTCAACGCTTTATTCGACTACATCATGGATGGTAAGATCAA
ACAGGCCATAGGTGAAGCAAGTGGTCGACTGAGGAAGGAGCAAAAGCTGCTTCAGTTCCA
GGAACTACACGGCTTTAGCAAAGACATTGAAGATGACTTCAATGAGGCTTTAATCCTCTA
TAACAAGGAGCGAGCTGATTTGAGGTCAGTCAAAGATAAGTTCGACGAGTTGACCAAAGA
CCTAAATGGCATCTTCAAGGTTCTTGAAGAGGAGCTGTCTCAAAGGTGGTCAGCACTTGA
GACTATTATCAATGAAAAGAAAACAAATGCTGCTGAAGCACAGAAGAGAGCTCGGACTAA


---

#### How can Biopython parse this data?

In [23]:
for seq_record in SeqIO.parse("data/S9_Viral_FA/Amalgaviridae.fa", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

refseq|NC_035189|3387bp|Antonospora
Seq('ACAAATGGAAGCGAGTGTCTTCGTTAGCCGTCTCATGGCCATCGAGCCATCTGA...GCC', SingleLetterAlphabet())
3387
refseq|NC_014593|3431bp|Blueberry
Seq('GTATTTTTATTTTCGGACACCGAGGTTCCTTCTGCGTGCGCTATCTGATAGTCT...TCC', SingleLetterAlphabet())
3431
refseq|NC_014481|3427bp|Rhododendron
Seq('GTATTAATTTATATCAAATACAAAAGACTGCAGGTGACTGATCTGCGATCAAAT...GTC', SingleLetterAlphabet())
3427
refseq|NC_011591|3437bp|Southern
Seq('GATAAATTTAGTAAGCTACCTAGCGGAATTAAGAACTTTCACAAAAGGGTGAGG...TTC', SingleLetterAlphabet())
3437
refseq|NC_035070|3420bp|Spinach
Seq('CCATATAATTGTTTTGCCATCGAGAAAAACGTCAAGAGGAAGAAGACTTCAGCA...ACC', SingleLetterAlphabet())
3420
refseq|NC_034614|3383bp|Zostera
Seq('GTTAAACACAGGCGTTGCGAACGGAAAAGGTAAGTACGTTCGTAAAGCACGTGC...GCG', SingleLetterAlphabet())
3383
refseq|NC_034615|3316bp|Zostera
Seq('GCAAGTTAATCAGGTAGTGAGACACAGCAGGCAGCTGACTGTGCAAATTTGTGA...TTC', SingleLetterAlphabet())
3316
refseq|NC_003874|3157bp|Zygosaccharomyces
Seq('GTAAAAGAACAAGCGTTTGTTCATTTTATCTTTTTAAG

---

#### How do we turn this into a Pandas Dataframe that we can work with?

In [17]:
import pandas as pd

In [27]:
# List of filenames 
list_of_filenames = ['Amalgaviridae','Badnavirus',
                     'Birnaviridae','Caulimovirus',
                     'Reptarenavirus','Soymovirus',
                     'Spumavirus']

# Initialize list to store info from each file
list_of_files_info = []

# Go through every file
for filename in list_of_filenames:
    
    # Go through every record in our file and parse it
    for seq_record in SeqIO.parse(f'data/S9_Viral_FA/{filename}.fa', 'fasta'):
        
        # Create a temporary dict to store following info
        temp_dict = {}

        # Store the Genus (from filename)
        temp_dict['Genus'] = filename
        
        # Grab the Species from 4th part of the sequence record id
        temp_dict['Species'] = seq_record.id.split('|')[3]
        
        # Grab the sequence length (may not be needed)
        temp_dict['Sequence_Length'] = len(seq_record)
        
        # Grab the sequence
        temp_dict['Sequence'] = repr(seq_record.seq)
        
        # Append this file info to our list
        list_of_files_info.append(temp_dict)

# Create the pandas DataFrame from our list of files info
df = pd.DataFrame(list_of_files_info)
df

Unnamed: 0,Genus,Species,Sequence_Length,Sequence
0,Amalgaviridae,Antonospora,3387,Seq('ACAAATGGAAGCGAGTGTCTTCGTTAGCCGTCTCATGGCCA...
1,Amalgaviridae,Blueberry,3431,Seq('GTATTTTTATTTTCGGACACCGAGGTTCCTTCTGCGTGCGC...
2,Amalgaviridae,Rhododendron,3427,Seq('GTATTAATTTATATCAAATACAAAAGACTGCAGGTGACTGA...
3,Amalgaviridae,Southern,3437,Seq('GATAAATTTAGTAAGCTACCTAGCGGAATTAAGAACTTTCA...
4,Amalgaviridae,Spinach,3420,Seq('CCATATAATTGTTTTGCCATCGAGAAAAACGTCAAGAGGAA...
...,...,...,...,...
91,Spumavirus,Bovine,12002,Seq('TGTGGTGGGAAGACCACCCGGAAATAAGCAAGGGCCAGCCC...
92,Spumavirus,Equine,12035,Seq('TGTCATGGAATGAGGATCCAGAGACTAAGAATATAGCTATT...
93,Spumavirus,Feline,10479,Seq('GAGCTCTTCTCACAGACTTGGCTGCGTCCAGGGTGAGATTG...
94,Spumavirus,Macaque,12972,Seq('TGTGGCAGACAGCCACTAAATGTATAGGACCAGAGGAGGAA...


### Make a histogram of all the sequences placed into buckets of 100 base pairs apart