### Protein Cleaner
This notebook cleans and parses the protein information from a CSV file

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json
file_path = "8kProteins.csv"
df = pd.read_csv(file_path)

Each protein has 3 components: uniprot ID, entry name, a list of protein names, and gene names associated with the protein

In [9]:
df.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names
0,P16860,ANFB_HUMAN,Natriuretic peptides B (Gamma-brain natriureti...,NPPB
1,P12821,ACE_HUMAN,Angiotensin-converting enzyme (ACE) (EC 3.2.1....,ACE DCP DCP1
2,Q14524,SCN5A_HUMAN,Sodium channel protein type 5 subunit alpha (H...,SCN5A
3,Q12809,KCNH2_HUMAN,Potassium voltage-gated channel subfamily H me...,KCNH2 ERG ERG1 HERG
4,P02741,CRP_HUMAN,C-reactive protein [Cleaved into: C-reactive p...,CRP PTX1


#### Cleans and parses protein names

In [10]:
def filter(line):
    proteins = set()
    line = str(line)
    line = line.lower()

    '''for lines without () or [] terms'''
    if "(" not in line or "[" not in line:
        proteins.add(line.strip().replace(' ', '_'))


    '''for line including () terms'''    
    if '(' in line:
        start = 0
        open_in = line.find('(')
        tmp = line[start:open_in].strip().replace(' ', '_')
        proteins.add(tmp)
        while open_in >=0:
            start = open_in+1
            end = line.find(')', start)
            proteins.add(line[start:end].strip().replace(' ', '_'))
            open_in = line.find('(', end)

    '''for lines including [] trems'''
    if '[' in line:
        raw = line[line.find('['):line.find(']')]
        #print("THIS IS RAW:", raw[15:-1])
        raw = raw[15:-1]
        lraw = raw.split("; ")
        for item in lraw:
            #print(item)
            if '(' in item:
                start = 0
                open_in = item.find('(')
                tmp = item[start:open_in].strip().replace(' ', '_')
                proteins.add(tmp)
            else:
                proteins.add(item.strip().replace(' ', '_'))
    return proteins

#### Example

In [11]:
allProteins = []
i = 0
for u,p in zip(df['Entry'],df['Protein names']):
    print(u,"|",p)
    print("------------")
    print(u,"|",filter(p))
    print("===================================================")
    i += 1
    if i>1:
        break

P16860 | Natriuretic peptides B (Gamma-brain natriuretic peptide) [Cleaved into: Brain natriuretic peptide 32 (BNP(1-32)) (BNP-32); BNP(1-30); BNP(1-29); BNP(1-28); BNP(2-31); BNP(3-32); BNP(3-30); BNP(3-29); BNP(4-32); BNP(4-31); BNP(4-30); BNP(4-29); BNP(4-27); BNP(5-32); BNP(5-31); BNP(5-29)]
------------
P16860 | {'gamma-brain_natriuretic_peptide', 'natriuretic_peptides_b', '4-32', '3-30', 'brain_natriuretic_peptide_32', '5-29', '1-29', '1-30', '2-31', '1-28', 'bnp', '4-30', '4-31', 'bnp-32', '4-27', '3-29', '3-32', '5-31', '4-29', '5-32', 'bnp(1-32'}
P12821 | Angiotensin-converting enzyme (ACE) (EC 3.2.1.-) (EC 3.4.15.1) (Dipeptidyl carboxypeptidase I) (Kininase II) (CD antigen CD143) [Cleaved into: Angiotensin-converting enzyme, soluble form]
------------
P12821 | {'ec_3.2.1.-', 'cd_antigen_cd143', 'ec_3.4.15.1', 'dipeptidyl_carboxypeptidase_i', 'kininase_ii', 'angiotensin-converting_enzyme,_soluble_for', 'angiotensin-converting_enzyme', 'ace'}


#### Cleans and parses gene names

In [12]:
def parse_genes(line):
    line = str(line)
    line = line.lower()
    
    genes = line.split()

    return genes

#### Creates protein dictionary
- Using the parsing methods, a dictionary for each protein is created
- In the format:
                        [{"uniprot_ID": XXXX,\
                          "names" : [xxx,xxx,xxx,xxx],\
                          "genes": [xxx,xxx]}]
- A list of dictionaries (aka list of proteins) is written to a file

In [13]:
allProteins = []
for u,p,g in zip(df['Entry'],df['Protein names'], df['Gene names']):
    allProteins.append({"id":u, "names":list(filter(p)), "genes":parse_genes(g)})

In [17]:
with open("protein_dict.json", 'w') as pd:
    json.dump(allProteins, pd)