In [3]:
from pathlib import Path
import os


__file__ = os.getcwd()
data_path = Path(__file__) / 'data' / 'CAZyDB.07312018.fa.txt'

print(data_path)

/home/sunmoon/workspace/eCAMI/data/CAZyDB.07312018.fa.txt


In [25]:
from tqdm import tqdm
import pandas as pd
import hashlib


class Protein:
    def __init__(self):
        self.name = ''
        self.families = []
        self.seqence = ''
        self.hashed = ''
        
    def hash_seqence(self):
        self.hashed = hashlib.sha1(self.seqence.encode('ascii')).hexdigest()
        
    def __str__(self):
        return f'''\r{self.name}
        \r{self.families}
        \r{self.seqence[:20]}...{len(self.seqence)}...{self.seqence[-5:]}
        \r{self.hashed}'''

    def __format__(self, format):
        if format == 'list':
            return [self.name, self.families, self.seqence, self.hashed]
        return str(self)

        
data_file = open(data_path, 'r')

proteins = []
temp_protein = None

for line in tqdm(data_file):
    line = line.strip().upper()
    
    if line[0] == '>':
        if temp_protein is not None:
            temp_protein.hash_seqence()
            proteins.append(temp_protein)
        
        temp_protein = Protein()
        name, *families = line.split('|')
        name = name[1:]
        
        temp_protein.name = name
        for family in families:
            temp_protein.families.append(family.split('_'))
        
        temp_protein.families = sorted(temp_protein.families)
        
    else:
        temp_protein.seqence += line

print(f'total: {len(proteins)}\n')

unique_proteins = []
for protein in tqdm(proteins):
    overlapped = False
    for unique_protein in unique_proteins:
        if protein.hashed == unique_protein.hashed:
            overlapped = True
            break
    if not overlapped:
        unique_proteins.append(protein)

print(f'total: {len(unique_proteins)}\n')

for unique_protein in unique_proteins[:3]:
    print(unique_protein)
    print()
    
print('''.
.
.
''')

for unique_protein in unique_proteins[-2:]:
    print(unique_protein)
    print()

5000it [00:00, 302829.09it/s]
 57%|█████▋    | 1434/2499 [00:00<00:00, 14336.70it/s]total: 2499

100%|██████████| 2499/2499 [00:00<00:00, 8354.86it/s] total: 2284

AWI06117.1
[['GT2']]
MIVQSTNSNRIGENLLKLGY...745...GSLDV
bf32e340655bd58c0bd427390d1b5f2a9ccd1855

AWI06118.1
[['GT2']]
MLSVVVPVYNEEKNVEELVK...373...YRRSL
ff7121fd9fbe29aba56b4f02bd55e33bd48ee134

AWH92887.1
[['GH29']]
MSQSPGWARFAGRELPTWYD...489...IGLRR
fe2fc141f73c22d4e3cf08127627cc78ed8cc784

.
.
.

AWI09682.1
[['GH43', '26'], ['GH50']]
MRGRAQGAPHKMEIYIMKRT...1579...GGASE
648bdcd569252566d13faecc164f12c6a2a05d3b

AWI79846.1
[['GT4']]
MNILMISDVFFPRINGVSTS...395...QLSAA
1e59dd8ed5abd7a58cedb0345dd7aa97d720c4de




In [None]:
out_filename = 'proteins.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(proteins, out_file)
out_file.close()

In [None]:
out_filename = 'unique_proteins.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(unique_proteins, out_file)
out_file.close()

In [24]:
from tqdm import tqdm
import pickle
import pprint


def append_key(family_dict, family_levels, protein):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
                '_elements' : [],
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1
            cw_dict['_elements'].append(protein)
            

def append_key_count(family_dict, family_levels):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1

            
family_dict = {}
family_count_dict = {}

for protein in tqdm(unique_proteins):
    for family_levels in protein.families:
        append_key(family_dict, family_levels, protein)
        append_key_count(family_count_dict, family_levels)

pprint.pprint(family_count_dict)

35': {'_count': 20, '_sub_count': 20},
          '36': {'_count': 243, '_sub_count': 243},
          '37': {'_count': 185, '_sub_count': 185},
          '38': {'_count': 380, '_sub_count': 380},
          '39': {'_count': 397, '_sub_count': 397},
          '4': {'_count': 474, '_sub_count': 474},
          '40': {'_count': 399, '_sub_count': 399},
          '41': {'_count': 100, '_sub_count': 100},
          '42': {'_count': 68, '_sub_count': 68},
          '5': {'_count': 2609, '_sub_count': 2609},
          '6': {'_count': 258, '_sub_count': 258},
          '7': {'_count': 138, '_sub_count': 138},
          '8': {'_count': 913, '_sub_count': 913},
          '9': {'_count': 6899, '_sub_count': 6899},
          '_count': 4895,
          '_sub_count': 60205},
 'GH130': {'_count': 1518, '_sub_count': 1518},
 'GH131': {'_count': 46, '_sub_count': 46},
 'GH132': {'_count': 652, '_sub_count': 652},
 'GH133': {'_count': 627, '_sub_count': 627},
 'GH134': {'_count': 139, '_sub_count': 139},
 

In [26]:
out_filename = 'family.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_dict, out_file)
out_file.close()

In [25]:
out_filename = 'family_count.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_count_dict, out_file)
out_file.close()