In [2]:
from pathlib import Path
import pprint
import os


__file__ = os.getcwd()
data_paths = [Path(__file__) / 'data' / 'dbCAN2' / 'CAZyDB.07312018.fa']

pprint.pprint([str(path) for path in data_paths])

['/home/sunmoon/workspace/eCAMI/data/dbCAN2/CAZyDB.07312018.fa']


In [3]:
from tqdm import tqdm
import pandas as pd
import hashlib


class Protein:
    def __init__(self, name='', families='', sequence='', sequence_hash=''):
        self.name = name
        self.families = families
        self.sequence = sequence
        self.sequence_hash = sequence_hash
        
    def hash_sequence(self):
        self.sequence_hash = hashlib.sha1(self.sequence.encode('ascii')).hexdigest()

    def to_list(self):
        return [self.name, self.families, self.sequence, self.sequence_hash]
        
    def __repr__(self):
        return f'''\r{self.name}
        \r{self.families}
        \r{self.sequence[:20]}...{len(self.sequence)}...{self.sequence[-5:]}
        \r{self.sequence_hash}'''


proteins = []
temp_protein = None

for data_path in data_paths:
    data_file = open(data_path, 'r')

    for line in tqdm(data_file):
        line = line.strip().upper()

        if line[-1] == '|':
            line = line[:-1]
        
        if line[0] == '>':
            if temp_protein is not None:
                temp_protein.hash_sequence()
                proteins.append(temp_protein.to_list())
            
            temp_protein = Protein()

            try:
                name, families = line.split('|', 1)
            except:
                print(f'\n[ERROR] format error: {line}')
                name = line
                families = ''
            
            name = name[1:]
            
            temp_protein.name = name
            temp_protein.families = families
            
        else:
            temp_protein.sequence += line

columns = ['name', 'families', 'sequence', 'sequence_hash']
protein_df = pd.DataFrame(proteins, columns=columns)
print(f'Protein DF shape: {protein_df.shape}')

protein_df = protein_df.drop_duplicates(subset=['sequence_hash'], keep='first')
print(f'Unique protein DF shape: {protein_df.shape}')

2134124it [00:06, 341723.84it/s]
Protein DF shape: (1066326, 4)
Unique protein DF shape: (636599, 4)


In [7]:
from tqdm import tqdm
import pickle
import json
import operator
import pprint


def append_key(family_dict, family_levels, protein):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
                '_elements' : [],
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1
            cw_dict['_elements'].append(protein)

def append_key_count(family_dict, family_levels):
    total_levels = len(family_levels)
    
    cw_dict = family_dict
    
    for i in range(total_levels):
        if family_levels[i] not in cw_dict:
            cw_dict[family_levels[i]] = {
                '_count' : 0,
                '_sub_count' : 1,
            }
            
        else:
            cw_dict[family_levels[i]]['_sub_count'] += 1
            
        cw_dict = cw_dict[family_levels[i]]
        
        if i == total_levels-1 :
            cw_dict['_count'] += 1
            
family_dict = {}
family_count_dict = {}

for _, protein in tqdm(protein_df.iterrows()):
    family_levels_list = [family.replace('.', '_').split('_') for family in protein['families'].split('|')]

    for family_levels in family_levels_list:
        append_key(family_dict, family_levels, Protein(name=protein['name'],
                                                       families=family_levels_list,
                                                       sequence=protein['sequence'],
                                                       sequence_hash=protein['sequence_hash']))
        append_key_count(family_count_dict, family_levels)

family_dict = sorted(family_dict.items())
family_count_dict = sorted(family_count_dict.items())

out_filename = 'family_count.json'
out_file = open(out_filename, 'w')
json.dump(family_count_dict, out_file)
out_file.close()

print('JSON created')

636599it [00:59, 10773.31it/s]JSON created



In [8]:
out_filename = 'family.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_dict, out_file)
out_file.close()

In [9]:
out_filename = 'family_count.pickle'
out_file = open(out_filename, 'wb')
pickle.dump(family_count_dict, out_file)
out_file.close()