In [1]:
from pathlib import Path
import os


__file__ = os.getcwd()
data_path = Path(__file__) / 'data' / 'CAZyDB.07312018.fa.txt'

In [6]:
import tqdm
import hashlib


class Protein:
    def __init__(self):
        self.name = ''
        self.families = []
        self.seqence = ''
        self.hashed = ''
        
    def hash_seqence(self):
        self.hashed = hashlib.sha1(self.seqence.encode('ascii')).hexdigest()
        
    def __str__(self):
        return f'''\r{self.name}
        \r{self.families}
        \r{self.seqence[:20]}...{len(self.seqence)}...{self.seqence[-5:]}
        \r{self.hashed}'''

        
data_file = open(data_path, 'r')

proteins = []
temp_protein = None
count = 0
for line in data_file:
    
    count += 1
    if count > 200:
        break
        
    line = line.strip().upper()
    
    if line[0] == '>':
        if temp_protein is not None:
            temp_protein.hash_seqence()
            proteins.append(temp_protein)
        
        temp_protein = Protein()
        name, *families = line.split('|')
        name = name[1:]
        
        temp_protein.name = name
        for family in families:
            temp_protein.families.append(family.split('_'))
        
        temp_protein.families = sorted(temp_protein.families)
        
    else:
        temp_protein.seqence += line

print(f'total: {len(proteins)}\n')
for protein in proteins[:10]:
    print(protein)
    print()
    
print('''.
.
.
''')

for protein in proteins[-1:]:
    print(protein)

total: 99

AWI06117.1
[['GT2']]
MIVQSTNSNRIGENLLKLGY...745...GSLDV
bf32e340655bd58c0bd427390d1b5f2a9ccd1855

AWI06118.1
[['GT2']]
MLSVVVPVYNEEKNVEELVK...373...YRRSL
ff7121fd9fbe29aba56b4f02bd55e33bd48ee134

AWH92887.1
[['GH29']]
MSQSPGWARFAGRELPTWYD...489...IGLRR
fe2fc141f73c22d4e3cf08127627cc78ed8cc784

AWH89110.1
[['GT2']]
MGELYPVLKVSVVIPVYNEE...327...QEENE
f806ec0f56b963439aa98f74b0e9e4ab53bb850e

AWH84670.1
[['GH97']]
MKKLIALACFLLAVVNAANA...714...GLKNL
735d887ab7f7cecf2546017209141a122a1ddc68

AWI89010.1
[['GT2'], ['GT4']]
MIKLPKLFRPKRRDAATAAP...1005...EETRV
29f2f8d3e46a8bd4808e67315734c98b344f4e28

AWG68601.1
[['GT4']]
MSTSRVAIVHERFTEFGGSE...381...GAVRA
b8f8130dda955e6cb191779df116488adbbf1da5

AWI87414.1
[['GT21']]
MIALPESGAPAALALCAALV...386...GPSAA
8bde76c18b11d9b81398c4432ba4978513c1ee47

AWH98111.1
[['GH16']]
MLSIAIFCFFCLPTSIVSYQ...315...KIYKK
ea7b6c4cb40f688413f98156fc09285bbb50ec75

AWI56298.1
[['GH94']]
MHVPVRDSVKVMRLAVRNDG...674...RAILG
31184d4c588b9a014cf42d40d5cbff38231a

In [12]:
import pprint


def append_key(family_dict, family_levels, protein):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 1,
                '_element' : [],
            }
            
        else:
            cw_dict[family_levels[i]]['_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_element'].append(protein)

            
family_dict = {}

for protein in proteins:
    for family_levels in protein.families:
        append_key(family_dict, family_levels, protein)

pprint.pprint(family_dict)
# print(family_dict)

{'CBM32': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f1b10bd7f40>]},
 'CBM48': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f1b10dc16a0>]},
 'CBM5': {'_count': 1,
          '_element': [<__main__.Protein object at 0x7f1b10bd7790>]},
 'CBM50': {'_count': 5,
           '_element': [<__main__.Protein object at 0x7f1b10dc37c0>,
                        <__main__.Protein object at 0x7f1b10dc9820>,
                        <__main__.Protein object at 0x7f1b10dccf40>,
                        <__main__.Protein object at 0x7f1b10bd79d0>,
                        <__main__.Protein object at 0x7f1b10c463a0>]},
 'CBM51': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f1b10bd7f40>]},
 'CBM67': {'_count': 2,
           '_element': [<__main__.Protein object at 0x7f1b10dc1ac0>,
                        <__main__.Protein object at 0x7f1b10b7f5b0>]},
 'CBM73': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f1b10bd7