In [2]:
#pip install bioservices
from bioservices import UniProt
import pandas as pd
import numpy as np


'''
@param query is the query search on uniprot
@return df is the dataframe with entryname, uniprotid, genes, protein names, proteome as columns
'''
def search(query): 
    u = UniProt()

    #get all of the columns
    entry_name = u.search(query,  frmt='tab', columns= 'entry name')
    unid = u.search(query, frmt='tab', columns= 'id')
    genes = u.search(query, frmt='tab', columns= 'genes') 
    protein_names = u.search(query, frmt='tab', columns= 'protein names')
    proteome = u.search(query, frmt='tab', columns= 'proteome')
    pathways = u.search(query,  frmt='tab', columns= 'pathway')
    
    #process the strings
    entry_name, unid, genes, protein_names, proteome, pathways = entry_name.split('\n'), unid.split('\n'), genes.split('\n'), protein_names.split('\n'), proteome.split('\n'), pathways.split('\n')
    
    #make genes into a list
    for i in range(len(genes)):
        genes[i] = genes[i].split()
    
    #create the df
    data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])
    df = pd.DataFrame(data, index = ['unid', 'entry_name', 'genes', 'other_names', 'proteome', 'pathways'])
    df = df.transpose()
    df = df.drop(0)
    
    #split the names into a list
    #starting at 1 because we dropped df[0]
    for i in range(1, len(df['other_names'])): 
        df['other_names'][i] = df['other_names'][i].split('(')
    
    #remove the end lines and the extra ')'
    for i in range(1, len(df['other_names'])):  #CHANGE THIS TO 0 
        for j in range(len(df['other_names'][i])): 
            df['other_names'][i][j] = df['other_names'][i][j].rstrip()
            df['other_names'][i][j] = df['other_names'][i][j].strip(')')
            
    #drop last row, as it is empty        
    df = df[:-1]
    
    #get firstnames
    firstnames = []
    for i in range(len(df)):
        firstnames.append(df.iloc[i]['other_names'][0])
        df.iloc[i]['other_names'].pop(0)

    #add firstnames
    df['name'] = firstnames
    return df

In [3]:
query = 'coronary heart disease AND organism:"Homo sapiens (Human) [9606]"'
coronaryHeartDisease = search(query)
coronaryHeartDisease

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,P21980,TGM2_HUMAN,[TGM2],"[EC 2.3.2.13, Erythrocyte transglutaminase, He...",UP000005640: Chromosome 20,,Protein-glutamine gamma-glutamyltransferase 2
2,O95477,ABCA1_HUMAN,"[ABCA1, ABC1, CERP]","[EC 7.6.2.1, ATP-binding cassette sub-family A...",UP000005640: Chromosome 9,,Phospholipid-transporting ATPase ABCA1
3,P46531,NOTC1_HUMAN,"[NOTCH1, TAN1]","[Notch 1, hN1, Translocation-associated notch ...",UP000005640: Chromosome 9,,Neurogenic locus notch homolog protein 1
4,Q02078,MEF2A_HUMAN,"[MEF2A, MEF2]",[Serum response factor-like protein 1],UP000005640: Chromosome 15,,Myocyte-specific enhancer factor 2A
5,P08254,MMP3_HUMAN,"[MMP3, STMY1]","[SL-1, EC 3.4.24.17, Matrix metalloproteinase-...",UP000005640: Chromosome 11,,Stromelysin-1
...,...,...,...,...,...,...,...
64,Q8N119,MMP21_HUMAN,[MMP21],"[MMP-21, EC 3.4.24.-]",UP000005640: Chromosome 10,,Matrix metalloproteinase-21
65,P37231,PPARG_HUMAN,"[PPARG, NR1C3]","[PPAR-gamma, Nuclear receptor subfamily 1 grou...",UP000005640: Chromosome 3,,Peroxisome proliferator-activated receptor gamma
66,P22415,USF1_HUMAN,"[USF1, BHLHB11, USF]","[Class B basic helix-loop-helix protein 11, bH...",UP000005640: Chromosome 1,,Upstream stimulatory factor 1
67,P13010,XRCC5_HUMAN,"[XRCC5, G22P2]","[EC 3.6.4.-, 86 kDa subunit of Ku antigen, ATP...",UP000005640: Chromosome 2,,X-ray repair cross-complementing protein 5


## Cardiovascular Disease Types
    Reference: https://www.nhs.uk/conditions/cardiovascular-disease/, https://www.webmd.com/heart-disease/guide/diseases-cardiovascular
    - Cardiomyopathy
    - Coronary Heart Disease/Coronary Artery Disease
    - Peripheral arterial disease
    - Aortic disease
    - Congenital Heart Disease
    - Congestive Heart Failure
    - Heart Protiens


## Cardiomyopathy:

In [4]:
#construct a list out of the arguments
listofqueries = []

query = 'cardiomyopathy AND organism:"Homo sapiens (Human) [9606]"'
cardiomyopathy = search(query)
listofqueries.append(cardiomyopathy)
cardiomyopathy

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,Q8N3K9,CMYA5_HUMAN,"[CMYA5, C5orf10, DTNBP2, SPRYD2, TRIM76]","[Dystrobrevin-binding protein 2, Genethonin-3,...",UP000005640: Chromosome 5,,Cardiomyopathy-associated protein 5
2,Q702N8,XIRP1_HUMAN,"[XIRP1, CMYA1, XIN]",[Cardiomyopathy-associated protein 1],UP000005640: Chromosome 3,,Xin actin-binding repeat-containing protein 1
3,A4UGR9,XIRP2_HUMAN,"[XIRP2, CMYA3]","[Beta-xin, Cardiomyopathy-associated protein 3...",UP000005640: Chromosome 2,,Xin actin-binding repeat-containing protein 2
4,Q5VU43,MYOME_HUMAN,"[PDE4DIP, CMYA2, KIAA0454, KIAA0477, MMGL]","[Cardiomyopathy-associated protein 2, Phosphod...",UP000005640: Chromosome 1,,Myomegalin
5,P12883,MYH7_HUMAN,"[MYH7, MYHCB]","[Myosin heavy chain 7, Myosin heavy chain slow...",UP000005640: Chromosome 14,,Myosin-7
...,...,...,...,...,...,...,...
372,A0A7R6M8Q3,A0A7R6M8Q3_HUMAN,[LDB3],[Fragment],,,LIM domain binding 3
373,A0A1B1HY28,A0A1B1HY28_HUMAN,[MYOZ2],[Fragment],,,Myozenin 2
374,Q9UI47,CTNA3_HUMAN,[CTNNA3],"[Alpha T-catenin, Cadherin-associated protein]",UP000005640: Chromosome 10,,Catenin alpha-3
375,Q5TAQ9,DCAF8_HUMAN,"[DCAF8, H326, WDR42A]",[WD repeat-containing protein 42A],UP000005640: Chromosome 1,Protein modification; protein ubiquitination.,DDB1- and CUL4-associated factor 8


## Coronary Heart Disease

In [5]:
query = 'coronary heart disease AND organism:"Homo sapiens (Human) [9606]"'
coronaryHeartDisease = search(query)
listofqueries.append(coronaryHeartDisease)
coronaryHeartDisease

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,P21980,TGM2_HUMAN,[TGM2],"[EC 2.3.2.13, Erythrocyte transglutaminase, He...",UP000005640: Chromosome 20,,Protein-glutamine gamma-glutamyltransferase 2
2,O95477,ABCA1_HUMAN,"[ABCA1, ABC1, CERP]","[EC 7.6.2.1, ATP-binding cassette sub-family A...",UP000005640: Chromosome 9,,Phospholipid-transporting ATPase ABCA1
3,P46531,NOTC1_HUMAN,"[NOTCH1, TAN1]","[Notch 1, hN1, Translocation-associated notch ...",UP000005640: Chromosome 9,,Neurogenic locus notch homolog protein 1
4,Q02078,MEF2A_HUMAN,"[MEF2A, MEF2]",[Serum response factor-like protein 1],UP000005640: Chromosome 15,,Myocyte-specific enhancer factor 2A
5,P08254,MMP3_HUMAN,"[MMP3, STMY1]","[SL-1, EC 3.4.24.17, Matrix metalloproteinase-...",UP000005640: Chromosome 11,,Stromelysin-1
...,...,...,...,...,...,...,...
64,Q8N119,MMP21_HUMAN,[MMP21],"[MMP-21, EC 3.4.24.-]",UP000005640: Chromosome 10,,Matrix metalloproteinase-21
65,P37231,PPARG_HUMAN,"[PPARG, NR1C3]","[PPAR-gamma, Nuclear receptor subfamily 1 grou...",UP000005640: Chromosome 3,,Peroxisome proliferator-activated receptor gamma
66,P22415,USF1_HUMAN,"[USF1, BHLHB11, USF]","[Class B basic helix-loop-helix protein 11, bH...",UP000005640: Chromosome 1,,Upstream stimulatory factor 1
67,P13010,XRCC5_HUMAN,"[XRCC5, G22P2]","[EC 3.6.4.-, 86 kDa subunit of Ku antigen, ATP...",UP000005640: Chromosome 2,,X-ray repair cross-complementing protein 5


## Coronary Artery Disease

In [6]:
query = 'coronary artery disease AND organism:"Homo sapiens (Human) [9606]"'
coronaryarterydisease = search(query)
listofqueries.append(coronaryarterydisease)
coronaryarterydisease

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,Q9P266,JCAD_HUMAN,"[JCAD, KIAA1462]",[JCAD],UP000005640: Chromosome 10,,Junctional protein associated with coronary ar...
2,O95477,ABCA1_HUMAN,"[ABCA1, ABC1, CERP]","[EC 7.6.2.1, ATP-binding cassette sub-family A...",UP000005640: Chromosome 9,,Phospholipid-transporting ATPase ABCA1
3,Q02078,MEF2A_HUMAN,"[MEF2A, MEF2]",[Serum response factor-like protein 1],UP000005640: Chromosome 15,,Myocyte-specific enhancer factor 2A
4,P16050,LOX15_HUMAN,"[ALOX15, LOG15]","[12/15-lipoxygenase, Arachidonate 12-lipoxygen...",UP000005640: Chromosome 17,Lipid metabolism; hydroperoxy eicosatetraenoic...,Polyunsaturated fatty acid lipoxygenase ALOX15
5,P16581,LYAM2_HUMAN,"[SELE, ELAM1]","[CD62 antigen-like family member E, Endothelia...",UP000005640: Chromosome 1,,E-selectin
...,...,...,...,...,...,...,...
90,P13010,XRCC5_HUMAN,"[XRCC5, G22P2]","[EC 3.6.4.-, 86 kDa subunit of Ku antigen, ATP...",UP000005640: Chromosome 2,,X-ray repair cross-complementing protein 5
91,E7E3A2,E7E3A2_HUMAN,[ATP6],[],,,ATP synthase subunit a
92,Q0ZFD6,Q0ZFD6_HUMAN,"[CYB, CYTB, cytb, MTCYB]",[],,,Cytochrome b
93,E7E2R2,E7E2R2_HUMAN,[ND5],[EC 7.1.1.2],,,NADH-ubiquinone oxidoreductase chain 5


## Aortic Disease

In [7]:
query = 'aortic disease AND organism:"Homo sapiens (Human) [9606]"'
aorticdisease = search(query)
listofqueries.append(aorticdisease)
aorticdisease

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,P62736,ACTA_HUMAN,"[ACTA2, ACTSA, ACTVS, GIG46]","[Alpha-actin-2, Cell growth-inhibiting gene 46...",UP000005640: Chromosome 10,,"Actin, aortic smooth muscle"
2,Q8IUX7,AEBP1_HUMAN,"[AEBP1, ACLP]","[AE-binding protein 1, Aortic carboxypeptidase...",UP000005640: Chromosome 7,,Adipocyte enhancer-binding protein 1
3,Q15772,SPEG_HUMAN,"[SPEG, APEG1, KIAA1297]","[EC 2.7.11.1, Aortic preferentially expressed ...",UP000005640: Chromosome 2,,Striated muscle preferentially expressed prote...
4,Q04656,ATP7A_HUMAN,"[ATP7A, MC1, MNK]","[EC 7.2.2.8, Copper pump 1, Menkes disease-ass...",UP000005640: Chromosome X,,Copper-transporting ATPase 1
5,P61812,TGFB2_HUMAN,[TGFB2],"[Cetermin, Glioblastoma-derived T-cell suppres...",UP000005640: Chromosome 1,,Transforming growth factor beta-2 proprotein
...,...,...,...,...,...,...,...
102,Q12768,WASC5_HUMAN,"[WASHC5, KIAA0196]","[Strumpellin, WASH complex subunit strumpellin]",UP000005640: Chromosome 8,,WASH complex subunit 5
103,Q5XNW2,Q5XNW2_HUMAN,[COL11A1],[Fragment],,,Mutant collagen XI alpha 1 variant isoform A
104,Q4FAC4,Q4FAC4_HUMAN,[COL11A1],[Fragment],,,Collagen XI alpha 1
105,P08123,CO1A2_HUMAN,[COL1A2],"[I) chain, Alpha-2 type I collagen]",UP000005640: Chromosome 7,,Collagen alpha-2


## Congenital Heart Disease

In [8]:
query = 'congenital heart disease AND organism:"Homo sapiens (Human) [9606]"'
congenitalheart = search(query)
listofqueries.append(congenitalheart)
congenitalheart

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,O00258,GET1_HUMAN,"[GET1, CHD5, WRB]","[Congenital heart disease 5 protein, Tail-anch...",UP000005640: Chromosome 21,,Guided entry of tail-anchored proteins factor 1
2,O75072,FKTN_HUMAN,"[FKTN, FCMD]","[EC 2.4.2.-, Fukuyama-type congenital muscular...",UP000005640: Chromosome 9,Protein modification; protein glycosylation.,Fukutin
3,P17302,CXA1_HUMAN,"[GJA1, GJAL]","[Connexin-43, Cx43, Gap junction 43 kDa heart ...",UP000005640: Chromosome 6,,Gap junction alpha-1 protein
4,P48544,KCNJ5_HUMAN,"[KCNJ5, GIRK4]","[GIRK-4, Cardiac inward rectifier, CIR, Heart ...",UP000005640: Chromosome 11,,G protein-activated inward rectifier potassium...
5,P12235,ADT1_HUMAN,"[SLC25A4, AAC1, ANT1]","[ADP,ATP carrier protein 1, ADP,ATP carrier pr...",UP000005640: Chromosome 4,,ADP/ATP translocase 1
...,...,...,...,...,...,...,...
429,B0LXF3,B0LXF3_HUMAN,[GATA4],[Fragment],,,GATA binding protein 4
430,B0LXF0,B0LXF0_HUMAN,[GATA4],[Fragment],,,GATA binding protein 4
431,C1INI6,C1INI6_HUMAN,[TBX20],[Fragment],,,T-box transcription factor TBX20
432,Q9UI47,CTNA3_HUMAN,[CTNNA3],"[Alpha T-catenin, Cadherin-associated protein]",UP000005640: Chromosome 10,,Catenin alpha-3


## Pehipheral Arterial Disease

In [9]:
query = 'peripheral arterial disease AND organism:"Homo sapiens (Human) [9606]"'
pehiperalarterial = search(query)
listofqueries.append(pehiperalarterial)
pehiperalarterial

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,Q9C000,NLRP1_HUMAN,"[NLRP1, CARD7, DEFCAP, KIAA0926, NAC, NALP1]","[EC 3.4.-.-, EC 3.6.4.-, Caspase recruitment d...",UP000005640: Chromosome 17,,"NACHT, LRR and PYD domains-containing protein 1"
2,O95255,MRP6_HUMAN,"[ABCC6, ARA, MRP6]","[EC 7.6.2.-, EC 7.6.2.3, Anthracycline resista...",UP000005640: Chromosome 16,,ATP-binding cassette sub-family C member 6
3,Q9BUM1,G6PC3_HUMAN,"[G6PC3, UGRP]","[G-6-Pase 3, G6Pase 3, EC 3.1.3.9, Glucose-6-p...",UP000005640: Chromosome 17,Carbohydrate biosynthesis; gluconeogenesis.,Glucose-6-phosphatase 3
4,Q03135,CAV1_HUMAN,"[CAV1, CAV]",[],UP000005640: Chromosome 7,,Caveolin-1
5,Q9BXB5,OSB10_HUMAN,"[OSBPL10, ORP10, OSBP9]","[ORP-10, OSBP-related protein 10]",UP000005640: Chromosome 3,,Oxysterol-binding protein-related protein 10
6,P20674,COX5A_HUMAN,[COX5A],[Cytochrome c oxidase polypeptide Va],UP000005640: Chromosome 15,Energy metabolism; oxidative phosphorylation.,"Cytochrome c oxidase subunit 5A, mitochondrial"
7,P02647,APOA1_HUMAN,[APOA1],"[Apo-AI, ApoA-I, Apolipoprotein A1) [Cleaved i...",UP000005640: Chromosome 11,,Apolipoprotein A-I
8,Q9Y5S8,NOX1_HUMAN,"[NOX1, MOX1, NOH1]","[NOX-1, EC 1.-.-.-, Mitogenic oxidase 1, MOX-1...",UP000005640: Chromosome X,,NADPH oxidase 1
9,Q9H1B5,XYLT2_HUMAN,"[XYLT2, XT2, UNQ3058/PRO9878]","[EC 2.4.2.26, Peptide O-xylosyltransferase 1, ...",UP000005640: Chromosome 17,Glycan metabolism; chondroitin sulfate biosynt...,Xylosyltransferase 2
10,P40818,UBP8_HUMAN,"[USP8, KIAA0055, UBPY]","[EC 3.4.19.12, Deubiquitinating enzyme 8, Ubiq...",UP000005640: Chromosome 15,,Ubiquitin carboxyl-terminal hydrolase 8


## Congestive Heart Failure

In [10]:
query = 'congestive heart failure AND organism:"Homo sapiens (Human) [9606]"'
congestiveheart = search(query)
listofqueries.append(congestiveheart)
congestiveheart

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,P02545,LMNA_HUMAN,"[LMNA, LMN1]","[70 kDa lamin, Renal carcinoma antigen NY-REN-...",UP000005640: Chromosome 1,,Prelamin-A/C [Cleaved into: Lamin-A/C
2,P12883,MYH7_HUMAN,"[MYH7, MYHCB]","[Myosin heavy chain 7, Myosin heavy chain slow...",UP000005640: Chromosome 14,,Myosin-7
3,P19429,TNNI3_HUMAN,"[TNNI3, TNNC1]",[Cardiac troponin I],UP000005640: Chromosome 19,,"Troponin I, cardiac muscle"
4,Q14524,SCN5A_HUMAN,[SCN5A],[Sodium channel protein cardiac muscle subunit...,UP000005640: Chromosome 3,,Sodium channel protein type 5 subunit alpha
5,P43694,GATA4_HUMAN,[GATA4],[GATA-binding factor 4],UP000005640: Chromosome 8,,Transcription factor GATA-4
6,O60706,ABCC9_HUMAN,"[ABCC9, SUR2]",[Sulfonylurea receptor 2],UP000005640: Chromosome 12,,ATP-binding cassette sub-family C member 9
7,P16860,ANFB_HUMAN,[NPPB],"[Brain natriuretic factor prohormone, preproBN...",UP000005640: Chromosome 1,,Natriuretic peptides B
8,Q8WZ42,TITIN_HUMAN,[TTN],"[EC 2.7.11.1, Connectin, Rhabdomyosarcoma anti...",UP000005640: Chromosome 2,,Titin
9,P26678,PPLA_HUMAN,"[PLN, PLB]",[PLB],UP000005640: Chromosome 6,,Cardiac phospholamban
10,P17661,DESM_HUMAN,[DES],[],UP000005640: Chromosome 2,,Desmin


## Heart Proteins

In [11]:
query = 'heart AND organism:"Homo sapiens (Human) [9606]"'
heartproteins = search(query)
listofqueries.append(heartproteins)
heartproteins

  data = np.array([unid, entry_name, genes, protein_names, proteome, pathways])


Unnamed: 0,unid,entry_name,genes,other_names,proteome,pathways,name
1,P05413,FABPH_HUMAN,"[FABP3, FABP11, MDGI]","[Fatty acid-binding protein 3, Heart-type fatt...",UP000005640: Chromosome 1,,"Fatty acid-binding protein, heart"
2,O96004,HAND1_HUMAN,"[HAND1, BHLHA27, EHAND]","[Class A basic helix-loop-helix protein 27, bH...",UP000005640: Chromosome 5,,Heart- and neural crest derivatives-expressed ...
3,P61296,HAND2_HUMAN,"[HAND2, BHLHA26, DHAND]","[Class A basic helix-loop-helix protein 26, bH...",UP000005640: Chromosome 4,,Heart- and neural crest derivatives-expressed ...
4,P17302,CXA1_HUMAN,"[GJA1, GJAL]","[Connexin-43, Cx43, Gap junction 43 kDa heart ...",UP000005640: Chromosome 6,,Gap junction alpha-1 protein
5,Q9Y5Q5,CORIN_HUMAN,"[CORIN, CRN, TMPRSS10]","[EC 3.4.21.-, Corin, Heart-specific serine pro...",UP000005640: Chromosome 4,,Atrial natriuretic peptide-converting enzyme
...,...,...,...,...,...,...,...
5330,P52799,EFNB2_HUMAN,"[EFNB2, EPLG5, HTKL, LERK5]",[EPH-related receptor tyrosine kinase ligand 5...,UP000005640: Chromosome 13,,Ephrin-B2
5331,P09471,GNAO_HUMAN,[GNAO1],[o) subunit alpha],UP000005640: Chromosome 16,,Guanine nucleotide-binding protein G
5332,Q9Y692,GMEB1_HUMAN,[GMEB1],"[GMEB-1, DNA-binding protein p96PIF, Parvoviru...",UP000005640: Chromosome 1,,Glucocorticoid modulatory element-binding prot...
5333,Q9P2T1,GMPR2_HUMAN,[GMPR2],"[GMPR 2, EC 1.7.1.7, Guanosine 5'-monophosphat...",UP000005640: Chromosome 14,,GMP reductase 2


In [13]:
#combine the queries
finaldf = pd.concat(listofqueries)

#drop duplicates
finaldf = finaldf.drop_duplicates(subset = ['unid'], keep = 'first')

#reset index
finaldf = finaldf.reset_index(drop = True)

finaldf
finaldf.to_csv('CVDProteins.csv')

AttributeError: 'DataFrame' object has no attribute 'to_xlsx'