In [1]:
import pandas as pd
import subprocess
import os
import time
import csv

import sys
sys.path.append('../GOCAM_Project/dev')
import utils

In [2]:
pd.options.display.max_colwidth = 100

In [3]:
identifiers = pd.read_csv('../data/identifiers.csv')
identifiers = identifiers['identifier']
identifiers= identifiers[identifiers.str.contains('reacto.owl')]
identifiers.reset_index(drop=True, inplace=True)

In [4]:
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-975294'in list(identifiers)

True

In [5]:
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in list(identifiers)

False

In [6]:
def preprocess_complex_sets(set_):
    #reformat physical entities to sets of complexes, where each single polypeptide in the set is treated as a complex with one member
    complexes = set_[set_.record == 'http://purl.obolibrary.org/obo/GO_0032991']
    proteins = set_[set_.record != 'http://purl.obolibrary.org/obo/GO_0032991']
    #proteins = proteins.drop_duplicates(subset=['record'])
    proteins.record_ = proteins.record
    proteins.complex_from_set = proteins.record
    return pd.concat([complexes,proteins]).dropna(subset=['record_']) #not sure why there are NaN values

In [48]:
def handle_complex_sets(c_sets_list,uIDs, members2setID,setID2members):
    rIDs = c_sets_list['crID'].unique()
    c_sets_list = preprocess_complex_sets(c_sets_list) #handles physical entities
    for s in rIDs:
        set_ = c_sets_list[c_sets_list.crID == s]
        #set_ = preprocess_complex_sets(set_) #handles physical entities
        set_['record_'] = set_['record_'].apply(lambda x: x.split('/')[-1]) #just keep the uniprot ID

        num_complexes = len(set_.complex_from_set.unique())
        value_counts = set_.record_.value_counts()
        common_subunits = list(value_counts[value_counts.values == num_complexes].index)
        specific_subunits = list(value_counts[value_counts.values < num_complexes].index)
        
        
        if len(common_subunits) >0:
            temp = pd.DataFrame({'record': common_subunits,'rID':s})
            utils.update_dict(uIDs,temp,'rID','record')
        if len(specific_subunits) >0:
            ss_name = 'ssubSET:'+s.split('REACTO_')[1]
            ss = pd.DataFrame({'set_ID':ss_name,'record':specific_subunits})
            setID2members = utils.update_dict(setID2members,ss,'set_ID','record') 
            members2setID = utils.update_dict(members2setID,ss,'record','set_ID')
            temp = pd.DataFrame({'record': [ss_name],'rID':[s]})
            utils.update_dict(uIDs,temp,'rID','record')
            

    
    

In [49]:
def handle_sets(results_file,uIDs,members2setID, setID2members):
    #HANDLE SETS
    results = pd.read_csv(results_file)
    
    #SORT RESULTS, REMOVE AND PROCESS COMPLEX SETS SEPARATELY
    temp_p_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Protein']
    temp_c_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Complex']
    c_sets_list = temp_c_list[temp_c_list['complex_from_set'].notna()]
    phy_sets_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.PhysicalEntity']
    c_phy_sets = pd.concat([c_sets_list,phy_sets_list])
    temp_p_list = temp_p_list.drop(['complex_from_set','record_'],axis=1)
    temp_c_list = temp_c_list[temp_c_list['component'].notna()]
    temp_c_list = temp_c_list.drop(['complex_from_set','record_'],axis=1)
    handle_complex_sets(c_phy_sets,uIDs,members2setID,setID2members)
    
    temp_p_list['record'] = temp_p_list['record'].apply(lambda x: x.split('/')[-1]) #just keep the uniprot ID
    sets = temp_p_list[temp_p_list['name'].duplicated(keep=False)]
    sets['name'] = sets['name'].apply(lambda x: x.replace('/','^') )#at least one name contains '/', which disrupts later string operations
    sets['set_ID'] = sets['name'].apply(lambda x: 'SET:'+x)
    members2setID = utils.update_dict(members2setID,sets,'record','set_ID') #dict of {uniprot records:{set_IDs}}
    setID2members = utils.update_dict(setID2members,sets,'set_ID','record') #dict of {set_ID:{uniprot records}}
    
    #need to drop entry in a set from the results before mapping records back to sets:
        #consider the situation where a particular record exists as part of a set from querying one crID
        #as well as the solo protein returned from querying another crID. mapping records back to sets would
        #falsely map the solo protein to being part of a set
    temp_p_list = temp_p_list.drop_duplicates(subset='name',keep=False)
    sets['record'] = sets['set_ID']
    sets = sets.drop(['set_ID'],axis=1)
    results = pd.concat([temp_p_list, sets,temp_c_list])
    return results

In [50]:
def map_results_to_rID(rIDs,results):
    #GET MAPPING OF RESULTS BACK TO rID
    crID2rID = pd.Series(rIDs.rID.values,index=rIDs.crID).to_dict()
    duplicates = rIDs[rIDs['crID'].duplicated(keep=False)]
    duplicates_dict = {}
    duplicates_dict = utils.update_dict(duplicates_dict,duplicates,'crID','rID')
    
    
    #APPLY MAPPING OF RESULTS BACK TO rID
    results['rID'] = results['crID'].map(crID2rID)
    results_to_be_duplicated = results[results['crID'].apply(lambda x: x in duplicates_dict)]
    results_to_be_duplicated = results_to_be_duplicated.drop(['rID'], axis=1)
    for index, row in results_to_be_duplicated.iterrows():
        rID_list = list(duplicates_dict.get(row.crID))
        temp = pd.DataFrame([row])
        temp = pd.concat([temp]*len(rID_list), ignore_index=True)
        temp = pd.concat([pd.Series(rID_list,name='rID'),temp],axis=1)
        results = pd.concat([results,temp],axis=0)
    results = results.drop_duplicates()
    return results

In [51]:
def query(rIDs,uIDs,sparql_file,o_file, members2setID, setID2members):
    #rIDs = [rID,crID]
    #FORMAT QUERY VALUES STRING
    queryIDs = rIDs['crID']
    queryIDs = queryIDs.drop_duplicates()
    queryIDs = queryIDs.apply(lambda x: '<'+x+'>') #convert to IRI
    queryIDs = queryIDs.to_string(index=False)
    
        
    #RUN QUERY WITH ROBOT ON REACTO.owl
    # '&' is used for sed, not '/' as a delimiter because the identifier contains '/'
    with open(o_file, "w") as outfile:
        subprocess.run(['sed',"s&REPLACETHIS&"+queryIDs.encode("unicode_escape").decode("utf-8")+"&",sparql_file], stdout=outfile)
    subprocess.run(['robot','query','--input', '../query/reacto.owl', '--query', o_file,'../temp/results.csv'])
    
    #log_results.append(pd.read_csv('../temp/results.csv'))
    results = handle_sets('../temp/results.csv', uIDs, members2setID, setID2members)
    #log_results.append(results)
    #RESULTS ONLY HAS crID, ADD rID column
    results = map_results_to_rID(rIDs,results)
    #log_results.append(results)
    return results

In [52]:
def f_recursive(rIDs,uIDs,depth, members2setID, setID2members):
    #QUERY
    if depth > 10:
        #Possible error
        return 1  #change to return the traceback instead
    results = query(rIDs,uIDs,'../query/query_for_sed-test.txt','../temp/query_for_robot.txt',  members2setID, setID2members )
    
    #SORT RESULTS
    p_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Protein']
    c_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Complex']

    #UPDATE uID DICTIONARY
    p_list['record'] = p_list['record'].apply(lambda x: x.split('/')[-1]) #just keep the uniprot ID
    uIDs = utils.update_dict(uIDs,p_list,'rID','record')
    
    
    #TERMINATING CONDITION (no more components to search)
    #missing = c_list[c_list['component'].isna()]
    #log_c.append(missing)
    log.append(c_list)
    c_list = c_list.dropna() #see http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-1475437 as an example
    if len(c_list.index) == 0:
        return 
    
    #UPDATE rIDs FOR NEXT QUERY
    new_rIDs = c_list[['rID','component']]
    #new_rIDs.dropna(inplace=True) #only keep rows with a value for component
    new_rIDs = new_rIDs.rename(columns={'component':'crID'})
    #log_recursion.append(new_rIDs)
    #RECURSE
    depth = depth+1
    f_recursive(new_rIDs,uIDs, depth,  members2setID, setID2members)
    #how to handle reactome sets? (rSet)
    return  #returns a set

test_list = ['http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-1168601',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-1602359',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-1602468',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-158197',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5359005',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-9606895',
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-3238209'
            ]
test_list

In [53]:
#%%time
#log_c=[] #complex with no component
#log=[] #c_list
#log_rIDs = []
#uIDs = {}
#members2setID= {}
#setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
#rIDs = pd.DataFrame(data={'rID':test_list})#rIDs = identifiers.to_frame(name='rID')
#rIDs['crID']=rIDs.rID
#log_recursion = []
#log_results = []
#f_recursive(rIDs,uIDs,0, members2setID, setID2members)
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')
            

%%time
## testing with 100 random examples

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs = rIDs.sample(n=100,random_state = 1)
rIDs
f_recursive(rIDs,uIDs,0, members2setID, setID2members)
utils.dict2csv(uIDs,'../../Desktop/uIDs_100test2.csv')
#utils.dict2csv(members2setID,'../../Desktop/members2setID_100test2.csv')
utils.dict2csv(setID2members,'../../Desktop/setID2members_100test2.csv')

In [57]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3000:4000]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 1.43 s, sys: 111 ms, total: 1.54 s
Wall time: 4min 59s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [58]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:4000]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 839 ms, sys: 88.7 ms, total: 927 ms
Wall time: 4min 11s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

True

In [59]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:3750]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 635 ms, sys: 81.8 ms, total: 717 ms
Wall time: 4min 6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [60]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:3600]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 432 ms, sys: 82 ms, total: 514 ms
Wall time: 4min 7s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [61]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:3550]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 210 ms, sys: 60.6 ms, total: 271 ms
Wall time: 3min 26s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [62]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:3520]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

CPU times: user 190 ms, sys: 61.4 ms, total: 252 ms
Wall time: 3min 28s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [63]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3500:3510]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

CPU times: user 186 ms, sys: 61.3 ms, total: 248 ms
Wall time: 3min 26s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [64]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3505:3510]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

CPU times: user 173 ms, sys: 60.1 ms, total: 233 ms
Wall time: 3min 26s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [69]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3505:3506]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

CPU times: user 162 ms, sys: 56.7 ms, total: 219 ms
Wall time: 3min 30s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


True

In [71]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3506:3507]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


KeyError: "['rID'] not found in axis"

In [72]:
%%time

log_c=[] #complex with no component
log=[] #c_list
log_rIDs = []
uIDs = {}
members2setID= {}
setID2members= {}
#df rID (the original rID from a GOCAM), crID (the current rID to search with)
#the first rIDs df has the original rIDs duplicated as the component to start the search
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs=rIDs[3507:3508]
rIDs = rIDs.drop_duplicates()

f_recursive(rIDs,uIDs,0, members2setID, setID2members)
'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713' in uIDs
#utils.dict2csv(uIDs,'../data/uIDs.csv')
#utils.dict2csv(members2setID,'../data/members2setID.csv')
#utils.dict2csv(setID2members,'../data/setID2members.csv')

CPU times: user 31.5 ms, sys: 12 ms, total: 43.5 ms
Wall time: 41.5 s


False

In [14]:
rIDs[rIDs.rID=='http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672713']

Unnamed: 0,rID,crID


In [75]:
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs[3505:3506]

Unnamed: 0,rID,crID
3505,http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672720,http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-5672720


In [30]:
len(rIDs)

5315

In [None]:
#uIDs.get('http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_R-HSA-186821')

In [None]:
#len(uIDs.keys())

In [None]:
#len(rIDs.drop_duplicates().rID)

In [None]:
#set(rIDs.drop_duplicates().rID) - set(uIDs.keys()) 

In [None]:
#set(uIDs.keys()) - set(rIDs.drop_duplicates().rID)

In [None]:
#setID2members

In [None]:
#members2setID

In [None]:
results = pd.read_csv('../temp/results.csv')
results = results[results.type == 'BioPAX type: interface org.biopax.paxtools.model.level3.PhysicalEntity']
results
complexes = results[results.record == 'http://purl.obolibrary.org/obo/GO_0032991']
proteins = complexes[complexes.record != 'http://purl.obolibrary.org/obo/GO_0032991']
proteins = proteins.drop_duplicates(['record'])
proteins.record_ = proteins.record
proteins.complex_from_set = None
c_set_list = pd.concat([complexes,proteins])
c_set_list

In [None]:
#HANDLE SETS
results = log_results[0]
members2setID = {}
setID2members = {}

#SORT RESULTS
temp_p_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Protein']
temp_c_list = results[results['type']== 'BioPAX type: interface org.biopax.paxtools.model.level3.Complex']
c_sets_list = temp_c_list[temp_c_list['complex_from_set'].notna()]
temp_c_list = temp_c_list[temp_c_list['component'].notna()]
temp_c_list = temp_c_list.drop(['complex_from_set','record_'],axis=1)

temp_p_list['record'] = temp_p_list['record'].apply(lambda x: x.split('/')[-1]) #just keep the uniprot ID
sets = temp_p_list[temp_p_list['name'].duplicated(keep=False)]
sets['name'] = sets['name'].apply(lambda x: x.replace('/','^') )#at least one name contains '/', which disrupts later string operations
sets['set_ID'] = sets['name'].apply(lambda x: 'SET:'+x)
members2setID = utils.update_dict(members2setID,sets,'record','set_ID') #dict of {uniprot records:{set_IDs}}
setID2members = utils.update_dict(setID2members,sets,'set_ID','record') #dict of {set_ID:{uniprot records}}

#need to drop entry in a set from the results before mapping records back to sets:
    #consider the situation where a particular record exists as part of a set from querying one crID
    #as well as the solo protein returned from querying another crID. mapping records back to sets would
    #falsely map the solo protein to being part of a set
temp_p_list = temp_p_list.drop_duplicates(subset='name',keep=False)
sets['record'] = sets['set_ID']
sets = sets.drop(['set_ID'],axis=1)
#results = pd.concat([temp_p_list, sets,temp_c_list])

In [None]:
uIDs

In [None]:
#utils.dict2csv(uIDs,'../data/uIDs_test2.csv')
#utils.dict2csv(members2setID,'../data/members2setID_test2.csv')
#utils.dict2csv(setID2members,'../data/setID2members_test2.csv')

In [None]:
rIDs = identifiers.to_frame(name='rID')
rIDs['crID']=rIDs.rID
rIDs = rIDs.sample(n=100,random_state = 1)
rIDs = rIDs.drop_duplicates()
rIDs = rIDs.rID
og_rIDs = set(rIDs)

In [None]:
len(uIDs.keys())

In [None]:
mapped_rIDs = set(uIDs.keys())

In [None]:
og_rIDs - mapped_rIDs