In [2]:
import pandas as pd
import numpy as np
from Bio import AlignIO
from glob import glob


In [2]:
amino_acids = pd.read_csv('AminoAcids.csv', index_col=0, squeeze=True).T
# Since this is a CSV we need to evaluate the codon string
for amino in amino_acids:
    amino_acids[amino]["codons"] = eval(amino_acids[amino]["codons"])
    
amino_letters = sorted(list(amino_acids.T.index))[1:]

In [3]:
amino_letters.insert(0,"-")
len(amino_letters)

21

In [4]:
amino_letters

['-',
 'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y']

In [5]:
# Found this on youtube
for filename in glob("*.fasta"):
    with open(filename) as f:
        output = str(filename)
        output += "-aligned.fasta"
        in_file = str(filename)
        from Bio.Align.Applications import MafftCommandline
        mafft_cline = MafftCommandline(input= in_file)
        print(mafft_cline)
        stdout, stderr = mafft_cline()
        with open(output, "w") as handle:
            handle.write(stdout)

mafft rcsb_model.fasta-aligned.fasta
mafft rcsb_model.fasta


In [6]:
%%bash

ls

AminoAcids.csv
Fasta_Alignment.ipynb
rcsb_model.fasta
rcsb_model.fasta-aligned.fasta
rcsb_model.fasta-aligned.fasta-aligned.fasta


In [7]:
align = AlignIO.read("rcsb_model.fasta-aligned.fasta", "fasta")
entries = []
for i in align:
    entries.append(i)

In [8]:
vars(entries[0])

{'_seq': Seq('------------------------------------------------------...---', SingleLetterAlphabet()),
 'id': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'name': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'description': '1HC1:A|PDBID|CHAIN|SEQUENCE',
 'dbxrefs': [],
 'annotations': {},
 '_per_letter_annotations': {},
 'features': []}

In [9]:
entries[0].description.split("|")[0]

'1HC1:A'

In [10]:
for i in entries:
    if len(i.seq) != len(entries[0].seq):
        print(len(i.seq))

In [11]:
protein_dic = {}

# All of these entries should be the same length
for entry in entries:
    protein_dic[entry.id.split("|")[0]] = {i:entry.seq[i] for i in range(len(entries[0].seq))}
    protein_dic[entry.id.split("|")[0]]["protein"] = entry.id.split("|")[0].split(":")[0]
    protein_dic[entry.id.split("|")[0]]["class"] = entry.id.split("|")[0].split(":")[1]

In [12]:
protein_dic['1HC1:A']["class"]

'A'

In [13]:
hc = pd.DataFrame.from_dict(protein_dic).T

In [14]:
hc_count = {}
for i in range(len(entries[0].seq)):
    hc_count[i] = dict(hc[i].value_counts())
    for amino in amino_letters:
        if amino not in hc_count[i]:
            hc_count[i][amino] = 0
            
hcc = pd.DataFrame.from_dict(hc_count).T

In [15]:
hcc = hcc.fillna(0).astype(int)

In [16]:
hcc

Unnamed: 0,-,Y,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,X
0,89,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,89,0,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,89,0,0,0,0,0,0,10,0,0,...,0,0,0,0,0,0,0,0,0,0
3,89,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,10,0,0,0
4,89,0,0,0,0,0,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,89,0,0,0,0,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3542,89,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3543,89,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3544,89,0,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
hcc[hcc["-"] < 3].T

Unnamed: 0,1674,1675,1677,1678,1680,1681,1683,1684,1685,1686,...,1741,1742,1743,1777,1778,1779,1780,1781,1782,1783
-,2,2,2,2,2,2,2,2,2,2,...,0,0,0,2,2,2,2,2,2,2
Y,0,0,0,0,0,0,97,0,55,0,...,0,28,0,0,27,26,0,35,0,0
A,0,0,0,0,0,0,0,2,0,0,...,0,0,18,0,0,0,0,0,0,0
C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
D,0,43,0,15,0,0,0,5,0,73,...,0,12,0,0,0,0,0,0,47,0
E,0,29,0,0,0,0,0,23,0,24,...,0,0,0,0,0,0,0,0,35,0
F,49,0,0,0,62,0,0,0,0,0,...,25,2,0,0,0,32,0,8,0,0
G,0,0,0,3,0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
H,3,0,2,6,0,57,0,0,18,0,...,0,0,0,0,0,39,27,0,0,42
I,0,0,0,0,0,0,0,0,0,0,...,40,0,31,0,0,0,0,0,0,55


In [18]:
alignment = [x for x in range(1674,1783)]
alignment.extend(["protein","class"])
hc[alignment]

Unnamed: 0,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,...,1775,1776,1777,1778,1779,1780,1781,1782,protein,class
1HC1:A,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,A
1HC1:B,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,B
1HC1:C,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,C
1HC1:D,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,D
1HC1:E,K,K,H,T,D,S,F,P,P,Y,...,-,-,R,V,H,R,L,N,1HC1,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6R83:5a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,5a
6R83:6a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,6a
6R83:7a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,7a
6R83:8a,F,D,Y,Q,N,V,L,H,-,Y,...,F,D,R,L,F,R,Y,E,6R83,8a


In [19]:
from Bio import SeqIO

proteins = {}
for record in SeqIO.parse("rcsb_model.fasta", "fasta"):
    proteins[record.id.split("|")[0]] = str(record.seq)

In [57]:
# Make a dictionary for all blocks of letters in protein

protein_frames = {}
for protein, sequence in proteins.items():
    protein_frames[protein] = {}
    s_length = len(sequence)
    for i in range (1,15):
        for j in range(i, s_length - i):
            protein_frames[protein]["{}_{}".format(i,j - i)] = sequence[j - i:j]

In [58]:
protein_frames["1HC1:D"]["5_150"]

'IDKAY'

In [59]:
df = pd.DataFrame.from_dict(protein_frames).T.fillna("-")

In [60]:
frame_count = {}
for protein in df.T:
    frame_count[protein] = df.T[protein].value_counts().to_dict()

In [61]:
frame_count.keys()

dict_keys(['1HC1:A', '1HC1:B', '1HC1:C', '1HC1:D', '1HC1:E', '1HC1:F', '1HCY:A', '1HCY:B', '1HCY:C', '1HCY:D', '1HCY:E', '1HCY:F', '1JS8:A', '1JS8:B', '1LLA:A', '1LNL:A', '1LNL:B', '1LNL:C', '1NOL:A', '1OXY:A', '2N1C:A', '2N30:A', '3IXV:A', '3IXV:C', '3IXV:D', '3IXV:E', '3IXV:F', '3IXV:G', '3IXV:H', '3IXV:I', '3IXV:J', '3IXV:K', '3IXV:L', '3IXV:M', '3IXW:A', '3IXW:C', '3IXW:D', '3IXW:E', '3IXW:F', '3IXW:G', '3IXW:H', '3IXW:I', '3IXW:J', '3IXW:K', '3IXW:L', '3IXW:M', '3J32:A', '3J32:B', '3L6W:A', '3L6W:B', '3QJO:A', '3QJO:B', '4BED:A', '4BED:B', '4BED:C', '4BED:D', '4YD9:A', '4YD9:B', '4YD9:C', '4YD9:D', '4YD9:E', '4YD9:F', '4YD9:G', '4YD9:H', '4YD9:I', '4YD9:J', '4YD9:K', '4YD9:L', '4YD9:M', '4YD9:N', '4YD9:O', '4YD9:P', '4YD9:Q', '4YD9:R', '4YD9:S', '4YD9:T', '4YD9:U', '4YD9:V', '4YD9:W', '4YD9:X', '4YD9:Y', '4YD9:Z', '4YD9:a', '4YD9:b', '4YD9:c', '4YD9:d', '6L8S:A', '6L8S:B', '6L8S:C', '6R83:10a', '6R83:1a', '6R83:2a', '6R83:3a', '6R83:4a', '6R83:5a', '6R83:6a', '6R83:7a', '6R83:8a',

In [62]:
frames = pd.DataFrame.from_dict(frame_count, orient = 'index')

In [82]:
for i in range(98):
    print(frames[amino_letters].iloc[i])

-    37198.0
A       30.0
C        6.0
D       53.0
E       49.0
F       37.0
G       47.0
H       37.0
I       39.0
K       31.0
L       49.0
M       15.0
N       38.0
P       31.0
Q       18.0
R       32.0
S       35.0
T       33.0
V       38.0
W        9.0
Y       28.0
Name: 1HC1:A, dtype: float64
-    37198.0
A       30.0
C        6.0
D       53.0
E       49.0
F       37.0
G       47.0
H       37.0
I       39.0
K       31.0
L       49.0
M       15.0
N       38.0
P       31.0
Q       18.0
R       32.0
S       35.0
T       33.0
V       38.0
W        9.0
Y       28.0
Name: 1HC1:B, dtype: float64
-    37198.0
A       30.0
C        6.0
D       53.0
E       49.0
F       37.0
G       47.0
H       37.0
I       39.0
K       31.0
L       49.0
M       15.0
N       38.0
P       31.0
Q       18.0
R       32.0
S       35.0
T       33.0
V       38.0
W        9.0
Y       28.0
Name: 1HC1:C, dtype: float64
-    37198.0
A       30.0
C        6.0
D       53.0
E       49.0
F       37.0
G       47.0
H  

In [63]:
frames.T.dropna()

Unnamed: 0,1HC1:A,1HC1:B,1HC1:C,1HC1:D,1HC1:E,1HC1:F,1HCY:A,1HCY:B,1HCY:C,1HCY:D,...,6R83:10a,6R83:1a,6R83:2a,6R83:3a,6R83:4a,6R83:5a,6R83:6a,6R83:7a,6R83:8a,6R83:9a
D,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,...,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
E,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,...,211.0,211.0,211.0,211.0,211.0,211.0,211.0,211.0,211.0,211.0
L,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,...,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
G,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,...,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
I,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,...,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
N,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,...,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
V,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,...,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0
H,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,...,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
F,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,...,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0
K,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,...,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0


In [24]:
df.columns

Index(['2_0', '2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9',
       ...
       '14_3276', '14_3277', '14_3278', '14_3279', '14_3280', '14_3281',
       '14_3282', '14_3283', '14_3284', '14_3285'],
      dtype='object', length=42874)

In [25]:
entries = set()
for i in df.values:
    for j in i:
        entries.add(j)
        
entries = list(entries)
entries.sort(key=lambda item: (len(item), item))

In [26]:
df_count = {}
for column in df.columns:
    for entry in df[column]:
        if entry in df_count:
            df_count[entry] += 1
        else:
            df_count[entry] = 1

In [27]:
df_count

{'DA': 455,
 'AI': 471,
 'TL': 407,
 'GH': 236,
 'FE': 766,
 'XF': 1,
 'TV': 341,
 'DN': 342,
 'IL': 327,
 'EN': 353,
 'VM': 206,
 'NL': 452,
 'YA': 339,
 'SM': 217,
 'SS': 428,
 'AS': 419,
 'AL': 1065,
 'II': 190,
 'LH': 677,
 'HR': 410,
 'ED': 600,
 'VA': 471,
 'NV': 421,
 'LV': 378,
 'MI': 36,
 'LI': 279,
 'AG': 503,
 'MV': 142,
 'ST': 368,
 'LL': 569,
 'LG': 642,
 'IR': 211,
 'HD': 264,
 'RN': 189,
 'DL': 300,
 'AD': 467,
 'VV': 279,
 'VR': 341,
 'GT': 439,
 'TA': 460,
 'LR': 410,
 'RK': 403,
 'DK': 344,
 'LP': 649,
 'TF': 470,
 'AH': 235,
 'TG': 218,
 'KN': 325,
 'KQ': 205,
 'PN': 373,
 'KD': 389,
 'FA': 556,
 'HK': 103,
 'GN': 176,
 'QI': 137,
 'NF': 312,
 'QA': 367,
 'NI': 338,
 'KS': 306,
 'DI': 509,
 'AV': 513,
 'NA': 347,
 'VN': 247,
 'FG': 279,
 'AR': 338,
 'DV': 584,
 'IH': 457,
 'SV': 373,
 'IT': 433,
 'VD': 483,
 'VL': 455,
 'QQ': 153,
 'AQ': 157,
 'NS': 461,
 'RI': 388,
 'RL': 833,
 'VS': 347,
 'HS': 540,
 'VE': 376,
 'TQ': 144,
 'DT': 368,
 'QN': 146,
 'QK': 423,
 'SL':

In [28]:
df_count = pd.DataFrame.from_dict(df_count, orient = 'index')

In [29]:
df_count = df_count.sort_values([0], ascending = False)

In [30]:
df_count["length"] = df_count.index
df_count["length"] = df_count["length"].apply(lambda x: len(x))

In [31]:
df_count[df_count["length"] == 10]

Unnamed: 0,0,length
KDRVFAGFLL,60,10
VLGGETEMPW,60,10
GMATFPHWHR,46,10
MATFPHWHRL,46,10
HGMATFPHWH,46,10
...,...,...
DDLEFAGMVV,1,10
VAGMEYHLFV,1,10
GNVAGMEYHL,1,10
NVAGMEYHLF,1,10
