### Query:
What genes encode proteins that physically interact with proteins encoded by the 11 Fanconi Anemia core complex genes (set FA-core)? 

### Input:

Hardcoded tsv file from:

https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_1_core_complex.txt

### Goal:
This simple query aims to expand the FA-core gene set based on protein-protein interaction (PPI) network membership.

### Route:
1. Protein-protein interactions from Monarch Solr index, either direct or inferred through orthology.  Sources: [BioGRID](https://thebiogrid.org/) and [STRING](http://string-db.org/)
2. Gene-ortholog associations from Monarch Solr index.  Sources: [Panther](http://www.pantherdb.org/)


In [89]:
import requests
import pandas as pd
import copy

# Solr endpoint that every association lookup in this notebook is sent to.
solr_url = 'https://solr-dev.monarchinitiative.org/solr/golr/select'

# Tab-separated input file listing the FA core complex genes.
core_set = (
    'https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/'
    'master/FA_gene_sets/FA_1_core_complex.txt'
)

# Input genes; the two file columns are read as gene ID and gene symbol.
dataframe = pd.read_csv(core_set, sep='\t', names=['gene', 'symbol'])

# Column layout of the interaction table built from the query results.
columns = [
    'gene',
    'interactor_id',
    'interactor_symbol',
    'qualifier',
    'inferred_gene',
]

def get_solr_results(solr, params):
    """Yield every document matching a Solr query, fetching page by page.

    Args:
        solr: URL of the Solr ``select`` endpoint.
        params: Query parameters. Must contain integer ``rows`` (page size)
            and ``start`` (offset of the first document to fetch). Paging is
            done on a private copy, so the caller's mapping is not modified.

    Yields:
        Each entry of ``response.docs`` from successive pages, in the order
        the server returned them.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Work on a copy so advancing the offset never leaks back to the caller.
    page = dict(params)

    # The total is unknown before the first response, so it is seeded with the
    # page size to let a query starting at offset 0 enter the loop.
    # NOTE: as a consequence, a caller-supplied ``start`` >= ``rows`` yields
    # nothing and issues no request.
    total = page['rows']
    while page['start'] < total:
        reply = requests.get(solr, params=page)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        reply.raise_for_status()
        payload = reply.json()['response']
        total = payload['numFound']
        page['start'] += page['rows']
        yield from payload['docs']

# Template Solr query for interaction lookups.  The per-gene loop deep-copies
# it and appends a gene-specific filter to 'fq'.
interaction_params = dict(
    wt='json',
    rows=100,   # page size
    start=0,    # offset of the first document
    q='*:*',
    # Fields returned for each association document.
    fl=('subject, subject_label, subject_closure, '
        '           object, object_label, object_taxon'),
    # Keep only associations whose relation closure contains RO:0002434.
    fq=['relation_closure: "RO:0002434"'],
)

# Get interactions, both direct and inferred, for every input gene.
#
# Result rows are gathered in a plain list and converted to a DataFrame once
# at the end: DataFrame.append was removed in pandas 2.0, and calling it per
# row copied the whole table on every iteration.
interaction_rows = []

for index, row in dataframe.iterrows():
    # Deep copy so the shared template's 'fq' list is not extended in place.
    params = copy.deepcopy(interaction_params)
    params['fq'].append('subject_closure: "{0}" \
                        OR subject_ortholog_closure: "{0}"'
                        .format(row['gene']))
    for doc in get_solr_results(solr_url, params):
        # The gene itself appearing in the subject closure marks a direct
        # interaction; otherwise the match came through the ortholog closure.
        if row['gene'] in doc['subject_closure']:
            qualifier = "direct"
        else:
            qualifier = "homology"
        # 'inferred_gene' is deliberately left out here; it stays empty (NaN)
        # for rows produced by this step.
        interaction_rows.append({
            'gene': row['symbol'],
            'interactor_id': doc['object'],
            'interactor_symbol': doc['object_label'],
            'qualifier': qualifier,
        })

# Make the results dataframe in one step, with the fixed column layout.
interact_table = pd.DataFrame(interaction_rows, columns=columns)

interact_table.head(10)

Unnamed: 0,gene,interactor_id,interactor_symbol,qualifier,inferred_gene
0,FANCA,NCBIGene:4851,NOTCH1,direct,
1,FANCA,NCBIGene:6117,RPA1,direct,
2,FANCA,MGI:2384790,Fanci,homology,
3,FANCA,NCBIGene:2189,FANCG,direct,
4,FANCA,MGI:1914446,Ube2t,homology,
5,FANCA,MGI:3648788,Gm5239,homology,
6,FANCA,NCBIGene:10915,TCERG1,direct,
7,FANCA,NCBIGene:2783,GNB2,direct,
8,FANCA,MGI:1917178,Cenps,homology,
9,FANCA,MGI:3643360,Gm7808,homology,


In [92]:
# Define function to fetch orthologs given a gene ID
def get_human_ortholog(solr, gene):
    """Yield Solr association documents relating ``gene`` to human genes.

    The query keeps documents whose subject closure contains ``gene``, whose
    relation closure contains ``RO:HOM0000017`` and whose object taxon is
    ``NCBITaxon:9606``.

    Args:
        solr: URL of the Solr ``select`` endpoint.
        gene: Identifier of the gene to look up, e.g. ``MGI:2384790``.

    Yields:
        Documents carrying the ``subject``, ``subject_label``, ``object`` and
        ``object_label`` fields requested in ``fl``.
    """
    filters = [
        'subject_closure: "{0}"'.format(gene),
        'relation_closure: "RO:HOM0000017"',
        'object_taxon: "NCBITaxon:9606"',
    ]
    query = dict(
        wt='json',
        rows=100,
        start=0,
        q='*:*',
        fl='subject, subject_label,object, object_label',
        fq=filters,
    )
    yield from get_solr_results(solr, query)

# Resolve homology-inferred interactors to their human orthologs.
#
# For every row marked "homology", look up the human ortholog(s) of the
# interactor and record them as additional rows, remembering the symbol of the
# original interactor in 'inferred_gene'.  New rows are gathered in a list and
# added with a single concat: DataFrame.append was removed in pandas 2.0 and
# copied the whole table on every call.
ortholog_rows = []

for index, row in interact_table.iterrows():
    if row['qualifier'] != 'homology':
        continue
    for doc in get_human_ortholog(solr_url, row['interactor_id']):
        ortholog_rows.append({
            'gene': row['gene'],
            'interactor_id': doc['object'],
            'interactor_symbol': doc['object_label'],
            'qualifier': "homology",
            'inferred_gene': row['interactor_symbol'],
        })

# Skip the concat when nothing was found so the table is left untouched.
if ortholog_rows:
    interact_table = pd.concat(
        [
            interact_table,
            pd.DataFrame(ortholog_rows, columns=interact_table.columns),
        ],
        ignore_index=True,
    )

interact_table.tail(10)


Unnamed: 0,gene,interactor_id,interactor_symbol,qualifier,inferred_gene
3633,UBE2T,NCBIGene:2187,FANCB,homology,Fancb
3634,UBE2T,NCBIGene:548593,SLX1A,homology,Slx1b
3635,UBE2T,NCBIGene:983,CDK1,homology,cdk1
3636,UBE2T,NCBIGene:57531,HACE1,homology,Hace1
3637,UBE2T,NCBIGene:6047,RNF4,homology,rnf4
3638,UBE2T,NCBIGene:643904,RNF222,homology,rnf4
3639,UBE2T,NCBIGene:26091,HERC4,homology,herc4
3640,UBE2T,NCBIGene:55215,FANCI,homology,Fanci
3641,UBE2T,NCBIGene:643904,RNF222,homology,Rnf4
3642,UBE2T,NCBIGene:6047,RNF4,homology,Rnf4


In [93]:
# Count how many rows of the gene-pair table mention each interactor symbol,
# most frequent first, and show the top 30.

df = interact_table.loc[:, 'interactor_symbol'].value_counts()
df.head(n=30)

FANCI      51
HES1       49
FANCD2     48
FANCL      48
FANCM      47
FAAP24     47
CENPX      47
RPA1       46
CENPS      46
UBE2T      45
ERCC1      44
FANCE      43
ERCC4      43
ATRIP      42
FANCB      42
RPA3       41
FANCG      41
FANCF      41
SLX4       41
FAN1       41
FANCC      40
EME1       40
ATR        39
USP1       38
ZBTB32     38
BRIP1      33
MUS81      32
FAAP100    32
RPA2       28
TOP3A      28
Name: interactor_symbol, dtype: int64

In [94]:
# Remove interactors that are already part of the full FA gene set, as well
# as rows whose symbol was recorded as the 'inferred_gene' of another row
# (the intermediate interactors used for ortholog lookups), then recount.
fa_all = (
    'https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/'
    'master/FA_gene_sets/FA_4_all_genes.txt'
)

all_genes = pd.read_csv(fa_all, sep='\t', names=['gene', 'symbol'])

# Both exclusions are applied as one combined mask over the full table.
filtered_frame = interact_table[
    ~interact_table['interactor_id'].isin(all_genes['gene'].tolist())
    & ~interact_table['interactor_symbol'].isin(
        interact_table['inferred_gene'].tolist()
    )
]


df = filtered_frame.loc[:, 'interactor_symbol'].value_counts()
df.head(n=40)

HES1          49
RPA1          46
ERCC1         44
ATRIP         42
RPA3          41
FAN1          41
EME1          40
ATR           39
ZBTB32        38
USP1          38
MUS81         32
TOP3A         28
POLN          27
UBA52         27
RPS27A        26
EME2          26
CHEK1         25
BLM           25
HES4          19
HES3          19
TOPBP1        18
SLX1A         17
RFC4          17
RAD9A         16
CORT          16
RPA4          16
WDR48         16
RAD17         15
ZBTB16        15
RFC5          14
RAD1          14
PCNA          14
RFC3          14
XRCC3         13
DCLRE1A       12
UBC           12
DCLRE1B       11
RMI1          11
HUS1          11
CENPS-CORT    11
Name: interactor_symbol, dtype: int64