In [162]:
import pandas as pd
import numpy as np
from Bio import AlignIO
from glob import glob

In [235]:
amino_acids = pd.read_csv('AminoAcids.csv', index_col=0, squeeze=True).T
# Since this is a CSV we need to evaluate the codon string
for amino in amino_acids:
    amino_acids[amino]["codons"] = eval(amino_acids[amino]["codons"])
    
amino_letters = sorted(list(amino_acids.T.index))[1:]

In [236]:
amino_letters.insert(0,"-")
len(amino_letters)

21

In [237]:
amino_letters

['-',
 'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y']

In [163]:
# Found this on youtube
for filename in glob("*.fasta"):
    with open(filename) as f:
        output = str(filename)
        output += "-aligned.fasta"
        in_file = str(filename)
        from Bio.Align.Applications import MafftCommandline
        mafft_cline = MafftCommandline(input= in_file)
        print(mafft_cline)
        stdout, stderr = mafft_cline()
        with open(output, "w") as handle:
            handle.write(stdout)

mafft rcsb_model.fasta


In [164]:
%%bash

ls

AminoAcids.csv
Fasta_Alignment.ipynb
rcsb_model.fasta
rcsb_model.fasta-aligned.fasta


In [165]:
align = AlignIO.read("rcsb_model.fasta-aligned.fasta", "fasta")
entries = []
for i in align:
    entries.append(i)

In [166]:
vars(entries[0])

{'_seq': Seq('------------------------------------------------------...---', SingleLetterAlphabet()),
 'id': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'name': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'description': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'dbxrefs': [],
 'annotations': {},
 '_per_letter_annotations': {},
 'features': []}

In [170]:
entries[0].description.split("|")[0]

'1HC1:A'

In [174]:
for i in entries:
    if len(i.seq) != len(entries[0].seq):
        print(len(i.seq))

In [222]:
protein_dic = {}

# All of these entries should be the same length
for entry in entries:
    protein_dic[entry.id.split("|")[0]] = {i:entry.seq[i] for i in range(len(entries[0].seq))}
    protein_dic[entry.id.split("|")[0]]["protein"] = entry.id.split("|")[0].split(":")[0]
    protein_dic[entry.id.split("|")[0]]["class"] = entry.id.split("|")[0].split(":")[1]

In [223]:
protein_dic['1HC1:A']["class"]

'A'

In [224]:
hc = pd.DataFrame.from_dict(protein_dic).T

In [225]:
hc_count = {}
for i in range(len(entries[0].seq)):
    hc_count[i] = dict(hc[i].value_counts())
    for amino in amino_letters:
        if amino not in hc_count[i]:
            hc_count[i][amino] = 0
            
hcc = pd.DataFrame.from_dict(hc_count).T

In [226]:
hcc = hcc.fillna(0).astype(int)

In [227]:
hcc

Unnamed: 0,-,Y,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,X
0,89,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,89,0,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,89,0,0,0,0,0,0,10,0,0,...,0,0,0,0,0,0,0,0,0,0
3,89,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,10,0,0,0
4,89,0,0,0,0,0,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,89,0,0,0,0,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3542,89,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3543,89,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3544,89,0,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [228]:
hcc[hcc["-"] < 3].T

Unnamed: 0,1674,1675,1677,1678,1680,1681,1683,1684,1685,1686,...,1741,1742,1743,1777,1778,1779,1780,1781,1782,1783
-,2,2,2,2,2,2,2,2,2,2,...,0,0,0,2,2,2,2,2,2,2
Y,0,0,0,0,0,0,97,0,55,0,...,0,28,0,0,27,26,0,35,0,0
A,0,0,0,0,0,0,0,2,0,0,...,0,0,18,0,0,0,0,0,0,0
C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
D,0,43,0,15,0,0,0,5,0,73,...,0,12,0,0,0,0,0,0,47,0
E,0,29,0,0,0,0,0,23,0,24,...,0,0,0,0,0,0,0,0,35,0
F,49,0,0,0,62,0,0,0,0,0,...,25,2,0,0,0,32,0,8,0,0
G,0,0,0,3,0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
H,3,0,2,6,0,57,0,0,18,0,...,0,0,0,0,0,39,27,0,0,42
I,0,0,0,0,0,0,0,0,0,0,...,40,0,31,0,0,0,0,0,0,55


In [230]:
alignment = [x for x in range(1674,1783)]
alignment.extend(["protein","class"])
hc[alignment]

Unnamed: 0,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,...,1775,1776,1777,1778,1779,1780,1781,1782,protein,class
1HC1:A,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,A
1HC1:B,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,B
1HC1:C,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,C
1HC1:D,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,D
1HC1:E,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6R83:5a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,5a
6R83:6a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,6a
6R83:7a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,7a
6R83:8a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,8a


In [239]:
from Bio import SeqIO
for record in SeqIO.parse("rcsb_model.fasta", "fasta"):
    print(record.id)

1HC1:A|PDBID|CHAIN|SEQUENCE
1HC1:B|PDBID|CHAIN|SEQUENCE
1HC1:C|PDBID|CHAIN|SEQUENCE
1HC1:D|PDBID|CHAIN|SEQUENCE
1HC1:E|PDBID|CHAIN|SEQUENCE
1HC1:F|PDBID|CHAIN|SEQUENCE
1HCY:A|PDBID|CHAIN|SEQUENCE
1HCY:B|PDBID|CHAIN|SEQUENCE
1HCY:C|PDBID|CHAIN|SEQUENCE
1HCY:D|PDBID|CHAIN|SEQUENCE
1HCY:E|PDBID|CHAIN|SEQUENCE
1HCY:F|PDBID|CHAIN|SEQUENCE
1JS8:A|PDBID|CHAIN|SEQUENCE
1JS8:B|PDBID|CHAIN|SEQUENCE
1LLA:A|PDBID|CHAIN|SEQUENCE
1LNL:A|PDBID|CHAIN|SEQUENCE
1LNL:B|PDBID|CHAIN|SEQUENCE
1LNL:C|PDBID|CHAIN|SEQUENCE
1NOL:A|PDBID|CHAIN|SEQUENCE
1OXY:A|PDBID|CHAIN|SEQUENCE
2N1C:A|PDBID|CHAIN|SEQUENCE
2N30:A|PDBID|CHAIN|SEQUENCE
3IXV:A|PDBID|CHAIN|SEQUENCE
3IXV:C|PDBID|CHAIN|SEQUENCE
3IXV:D|PDBID|CHAIN|SEQUENCE
3IXV:E|PDBID|CHAIN|SEQUENCE
3IXV:F|PDBID|CHAIN|SEQUENCE
3IXV:G|PDBID|CHAIN|SEQUENCE
3IXV:H|PDBID|CHAIN|SEQUENCE
3IXV:I|PDBID|CHAIN|SEQUENCE
3IXV:J|PDBID|CHAIN|SEQUENCE
3IXV:K|PDBID|CHAIN|SEQUENCE
3IXV:L|PDBID|CHAIN|SEQUENCE
3IXV:M|PDBID|CHAIN|SEQUENCE
3IXW:A|PDBID|CHAIN|SEQUENCE
3IXW:C|PDBID|CHAIN|S

In [238]:
align = AlignIO.read("rcsb_model.fasta", "fasta")
entries = []
for i in align:
    entries.append(i)

ValueError: Sequences must all be the same length