## Pandas Lab 1
For this lab, you will need to read in the provided FASTA file and generate a DataFrame containing the following information for each sequence record:
1. Sequence Length
2. GC content
3. Begins with start codon?
4. Ends in stop codon?
5. Complete frame? (That is, is the length a multiple of 3?)

The row index should be the sequence label, and columns should be labeled with descriptions of these 5 criteria I've given you. Be mindful of the datatype you choose for descriptions 3-5.

In [1]:
import pandas as pd

#function to find GC content
def GC_content(sequence):
    """Return the proportion of ``sequence`` made up of G and C bases.

    Bases are matched case-insensitively, so lowercase ``g``/``c`` are
    counted the same as ``G``/``C``.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to analyse.

    Returns
    -------
    float
        GC fraction in the range 0-1, rounded to 3 decimal places.
        An empty sequence yields ``0.0`` instead of dividing by zero.
    """
    # Guard against ZeroDivisionError for records that carry no sequence.
    if not sequence:
        return 0.0
    gc_bases = ("G", "C", "g", "c")
    gc_count = sum(1 for base in sequence if base in gc_bases)
    return round(gc_count / len(sequence), 3)

#function to return all data needed for data frame
def pandas_lists(filename):
    """Parse a FASTA file and compute summary statistics for each record.

    Parameters
    ----------
    filename : str
        Path to the FASTA file to read.

    Returns
    -------
    tuple
        ``(sequence_labels, data)`` where ``sequence_labels`` is a list of
        record labels (the first whitespace-delimited token of each header
        line, with leading ``>`` characters removed) and ``data`` is a
        parallel list of rows of the form
        ``[length, gc_fraction, starts_with_ATG, ends_with_stop, length_multiple_of_3]``.

    Notes
    -----
    Records are keyed by their full header line, so if the same header line
    appears more than once only the last sequence seen for it is kept.
    Sequences are upper-cased before analysis so lowercase bases are treated
    the same as uppercase ones. A record with no sequence lines gets a GC
    fraction of ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    # Read the file, grouping sequence lines under the most recent header.
    all_seqs = {}
    header = None
    chunks = []
    with open(filename) as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the previous record, if there was one.
                if header:
                    all_seqs[header] = "".join(chunks)
                header = line
                chunks = []
            else:
                chunks.append(line)
        # Store the final record, which has no following header to close it.
        if header:
            all_seqs[header] = "".join(chunks)

    # Stop codons recognised at the end of a sequence.
    stop_codons = ("TAA", "TAG", "TGA")

    sequence_labels = []
    data = []
    for header, seq in all_seqs.items():
        # The label is the first token of the header, minus the ">" marker.
        sequence_labels.append(header.split()[0].lstrip(">"))

        # Normalise case so GC content and codon checks agree on lowercase input.
        seq = seq.upper()
        length = len(seq)

        # Skip the GC helper for empty records; it would divide by zero.
        gc = GC_content(seq) if seq else 0.0

        data.append([
            length,
            gc,
            seq.startswith("ATG"),      # begins with the start codon
            seq.endswith(stop_codons),  # ends with any stop codon
            length % 3 == 0,            # complete frame: whole number of codons
        ])

    return sequence_labels, data
        

In [3]:
# Parse the FASTA file, then assemble one DataFrame row per sequence record,
# indexed by sequence label.
lists = pandas_lists("Mdomestica-pandasLab.fa")
sequence_labels, data = lists

df = pd.DataFrame(
    data,
    index=sequence_labels,
    columns=[
        "Sequence Length",
        "GC%",
        "Valid Start",
        "Valid Stop",
        "Complete Frame",
    ],
)
df

#this worked and gave the intended table with sequence labels and correct columns
# a "True" in the "Valid Start" column means the sequence begins with a start codon, "False" means it does not
# a "True" in the "Valid Stop" column means the sequence ends with a stop codon, "False" means it does not
# a "True" in the "Complete Frame" column means the sequence length is a multiple of 3, "False" means it is not

# Sequence Length	GC%	Valid Start	Valid Stop	Complete Frame
# MD10G1276500	2940	0.461	False	True	True
# MD10G1110200	1731	0.464	True	True	True
# MD10G1036500	468	0.545	True	True	True
# MD10G1170700	1728	0.447	True	True	True
# MD10G1250900	1278	0.403	True	True	True
# ...	...	...	...	...	...
# MD14G1066400	423	0.530	True	True	True
# MD14G1225900	594	0.503	True	True	True
# MD14G1120900	939	0.481	True	True	True
# MD14G1057700	483	0.524	True	True	True
# MD14G1237500	3270	0.442	True	True	True
# 7496 rows × 5 columns

Unnamed: 0,Sequence Length,GC%,Valid Start,Valid Stop,Complete Frame
MD10G1276500,2940,0.461,False,True,True
MD10G1110200,1731,0.464,True,True,True
MD10G1036500,468,0.545,True,True,True
MD10G1170700,1728,0.447,True,True,True
MD10G1250900,1278,0.403,True,True,True
...,...,...,...,...,...
MD14G1066400,423,0.530,True,True,True
MD14G1225900,594,0.503,True,True,True
MD14G1120900,939,0.481,True,True,True
MD14G1057700,483,0.524,True,True,True
