In [1]:
from pathlib import Path
import os


__file__ = os.getcwd()
data_path = Path(__file__) / 'data' / 'CAZyDB.07312018.fa.txt'

In [2]:
from tqdm import tqdm
import hashlib


class Protein:
    def __init__(self):
        self.name = ''
        self.families = []
        self.seqence = ''
        self.hashed = ''
        
    def hash_seqence(self):
        self.hashed = hashlib.sha1(self.seqence.encode('ascii')).hexdigest()
        
    def __str__(self):
        return f'''\r{self.name}
        \r{self.families}
        \r{self.seqence[:20]}...{len(self.seqence)}...{self.seqence[-5:]}
        \r{self.hashed}'''

        
data_file = open(data_path, 'r')

proteins = []
temp_protein = None

# count = 0

for line in tqdm(data_file):
    
#     count += 1
#     if count > 500:
#         break
    
    line = line.strip().upper()
    
    if line[0] == '>':
        if temp_protein is not None:
            temp_protein.hash_seqence()
            proteins.append(temp_protein)
        
        temp_protein = Protein()
        name, *families = line.split('|')
        name = name[1:]
        
        temp_protein.name = name
        for family in families:
            temp_protein.families.append(family.split('_'))
        
        temp_protein.families = sorted(temp_protein.families)
        
    else:
        temp_protein.seqence += line

print(f'total: {len(proteins)}\n')
for protein in proteins[:3]:
    print(protein)
    print()
    
print('''.
.
.
''')

for protein in proteins[-2:]:
    print(protein)
    print()

2134124it [00:08, 243901.21it/s]

total: 1066326

AWI06117.1
[['GT2']]
MIVQSTNSNRIGENLLKLGY...745...GSLDV
bf32e340655bd58c0bd427390d1b5f2a9ccd1855

AWI06118.1
[['GT2']]
MLSVVVPVYNEEKNVEELVK...373...YRRSL
ff7121fd9fbe29aba56b4f02bd55e33bd48ee134

AWH92887.1
[['GH29']]
MSQSPGWARFAGRELPTWYD...489...IGLRR
fe2fc141f73c22d4e3cf08127627cc78ed8cc784

.
.
.

EEE52851.1
[['GH106']]
MSVALYARGGGCGAKAAAAR...562...QDGMM
d0dd4ecf8f2287686b4888a63ac200089df2418d

NP_175672.2
[['GH106']]
MIGSPVKPLFVFVLTFSLLL...439...CEFEL
66e37dc226b04e4115b77e497f1d81e6c9585560






In [3]:
from tqdm import tqdm
import pickle
import gzip
import pprint


def append_key(family_dict, family_levels, protein):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
                '_elements' : [],
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1
            cw_dict['_elements'].append(protein)
            

def append_key_count(family_dict, family_levels):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1

            
family_dict = {}
family_count_dict = {}

for protein in tqdm(proteins):
    for family_levels in protein.families:
        append_key(family_dict, family_levels, protein)
        append_key_count(family_count_dict, family_levels)

pprint.pprint(family_count_dict)

100%|██████████| 1066326/1066326 [00:02<00:00, 488294.98it/s]


{'1.-.-.-': {'_count': 56, '_sub_count': 56},
 '1.1.3.-': {'_count': 4, '_sub_count': 4},
 '1.1.3.10': {'_count': 5, '_sub_count': 5},
 '1.1.3.13': {'_count': 10, '_sub_count': 10},
 '1.1.3.16': {'_count': 1, '_sub_count': 1},
 '1.1.3.38': {'_count': 1, '_sub_count': 1},
 '1.1.3.4': {'_count': 3, '_sub_count': 3},
 '1.1.3.7': {'_count': 2, '_sub_count': 2},
 '1.1.3.9': {'_count': 6, '_sub_count': 6},
 '1.1.99.18': {'_count': 14, '_sub_count': 14},
 '1.1.99.29': {'_count': 3, '_sub_count': 3},
 '1.10.3.-': {'_count': 1, '_sub_count': 1},
 '1.10.3.2': {'_count': 71, '_sub_count': 71},
 '1.11.1.-': {'_count': 1, '_sub_count': 1},
 '1.11.1.13': {'_count': 11, '_sub_count': 11},
 '1.11.1.14': {'_count': 6, '_sub_count': 6},
 '1.11.1.16': {'_count': 8, '_sub_count': 8},
 '1.2.3.15': {'_count': 2, '_sub_count': 2},
 '1.6.5.6': {'_count': 3, '_sub_count': 3},
 '2.-.-.-': {'_count': 6, '_sub_count': 6},
 '2.3.1.122': {'_count': 1, '_sub_count': 1},
 '2.3.1.20': {'_count': 1, '_sub_count': 1},
 

In [4]:
out_filename = 'CAZy_family_count.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_count_dict, out_file)
out_file.close()

In [5]:
out_filename = 'CAZy_family.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_dict, out_file)
out_file.close()