In [1]:
from pathlib import Path
import os


__file__ = os.getcwd()
data_path = Path(__file__) / 'data' / 'CAZyDB.07312018.fa.txt'

In [2]:
import tqdm
import hashlib


class Protein:
    def __init__(self):
        self.name = ''
        self.families = []
        self.seqence = ''
        self.hashed = ''
        
    def hash_seqence(self):
        self.hashed = hashlib.sha1(self.seqence.encode('ascii')).hexdigest()
        
    def __str__(self):
        return f'''\r{self.name}
        \r{self.families}
        \r{self.seqence[:20]}...{len(self.seqence)}...{self.seqence[-5:]}
        \r{self.hashed}'''

        
data_file = open(data_path, 'r')

proteins = []
temp_protein = None

for line in data_file:
    line = line.strip().upper()
    
    if line[0] == '>':
        if temp_protein is not None:
            temp_protein.hash_seqence()
            proteins.append(temp_protein)
        
        temp_protein = Protein()
        name, *families = line.split('|')
        name = name[1:]
        
        temp_protein.name = name
        for family in families:
            temp_protein.families.append(family.split('_'))
        
        temp_protein.families = sorted(temp_protein.families)
        
    else:
        temp_protein.seqence += line

print(f'total: {len(proteins)}\n')
for protein in proteins[:3]:
    print(protein)
    print()
    
print('''.
.
.
''')

for protein in proteins[-2:]:
    print(protein)

total: 99

AWI06117.1
[['GT2']]
MIVQSTNSNRIGENLLKLGY...745...GSLDV
bf32e340655bd58c0bd427390d1b5f2a9ccd1855

AWI06118.1
[['GT2']]
MLSVVVPVYNEEKNVEELVK...373...YRRSL
ff7121fd9fbe29aba56b4f02bd55e33bd48ee134

AWH92887.1
[['GH29']]
MSQSPGWARFAGRELPTWYD...489...IGLRR
fe2fc141f73c22d4e3cf08127627cc78ed8cc784

.
.
.

ASO21229.1
[['GH1']]
MSGTGTGFRERYESGLRSIR...294...RGNEA
55be006aceebf75dafbc76a74df4b4943f9e5d83
AWH34124.1
[['GT2'], ['GT4']]
MAMSLAEWRYLLNRLTGLAQ...699...RVFFR
b341be3068b3862ace8584eda6e2042f8e7bc1db


In [3]:
import pprint


def append_key(family_dict, family_levels, protein):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 1,
                '_element' : [],
            }
            
        else:
            cw_dict[family_levels[i]]['_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_element'].append(protein)

            
family_dict = {}

for protein in proteins:
    for family_levels in protein.families:
        append_key(family_dict, family_levels, protein)

pprint.pprint(family_dict)
# print(family_dict)

{'CBM32': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f304173b2e0>]},
 'CBM48': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f30417a2610>]},
 'CBM5': {'_count': 1,
          '_element': [<__main__.Protein object at 0x7f30417b0fd0>]},
 'CBM50': {'_count': 5,
           '_element': [<__main__.Protein object at 0x7f30417a2b50>,
                        <__main__.Protein object at 0x7f30417a2e50>,
                        <__main__.Protein object at 0x7f30417b0310>,
                        <__main__.Protein object at 0x7f304173b0d0>,
                        <__main__.Protein object at 0x7f304173b550>]},
 'CBM51': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f304173b2e0>]},
 'CBM67': {'_count': 2,
           '_element': [<__main__.Protein object at 0x7f30417a2820>,
                        <__main__.Protein object at 0x7f30417b0be0>]},
 'CBM73': {'_count': 1,
           '_element': [<__main__.Protein object at 0x7f30417b0