### Query:
What genes encode proteins that physically interact with proteins encoded by the 11 Fanconi Anemia core complex genes (set FA-core)? 

### Input:

Hard-coded TSV file from:

https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_1_core_complex.txt

### Goal:
This simple query aims to expand the FA-core gene set based on PPI network membership.

### Route:
1. Protein-protein interactions from Monarch Solr index, either direct or inferred through orthology.  Sources: [BioGRID](https://thebiogrid.org/) and [STRING](http://string-db.org/)
2. Gene-ortholog associations from Monarch Solr index.  Sources: [Panther](http://www.pantherdb.org/)


In [95]:
import requests
import pandas as pd
import copy

# Solr select endpoint that every query below is sent to.
solr_url = "https://solr-dev.monarchinitiative.org/solr/golr/select"

# Tab-separated FA core complex gene set; column names are supplied here
# as (gene, symbol).
core_set = (
    "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/"
    "master/FA_gene_sets/FA_1_core_complex.txt"
)
dataframe = pd.read_csv(core_set, sep="\t", names=["gene", "symbol"])

# Column layout of the interaction table built from the Solr results.
columns = [
    "gene",
    "interactor_id",
    "interactor_symbol",
    "qualifier",
    "inferred_gene",
]

def get_solr_results(solr, params):
    """Yield every document matching a Solr query, fetching page by page.

    Args:
        solr: URL of the Solr select handler.
        params: Query parameters. Must contain integer ``rows`` (page size)
            and ``start`` (offset of the first document to fetch). The dict
            is left unmodified.

    Yields:
        Each entry of ``response.docs`` from successive pages, in order.

    Raises:
        requests.HTTPError: If Solr answers with an error status code.
    """
    page_size = params['rows']
    if page_size <= 0:
        # A non-positive page size could never advance through the results.
        return

    # Page on a shallow copy so the caller's 'start' offset is not clobbered.
    page_params = dict(params)
    while True:
        solr_request = requests.get(solr, params=page_params)
        # Fail with a clear HTTP error rather than a JSON/KeyError later on.
        solr_request.raise_for_status()
        body = solr_request.json()['response']
        for doc in body['docs']:
            yield doc
        page_params['start'] += page_size
        # Always fetch at least one page, then stop once the offset has
        # moved past the total number of matches reported by Solr.
        if page_params['start'] >= body['numFound']:
            break

# Template Solr query for interaction associations. The per-gene loop below
# deep-copies it and appends its own subject filter to 'fq'.
interaction_params = {
    'wt': 'json',
    'rows': 100,  # page size read by get_solr_results
    'start': 0,  # offset of the first document to fetch
    'q': '*:*',
    # NOTE: the backslash continues the string literal, so the leading
    # whitespace of the following line is part of the 'fl' value.
    'fl': 'subject, subject_label, subject_closure, \
           object, object_label, object_taxon',
    # NOTE(review): RO:0002434 is presumably the "interacts with" relation;
    # confirm against the Relation Ontology.
    'fq': ['relation_closure: "RO:0002434"']
}

# Get interactions, both direct and inferred.
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copied the whole table on every call and was removed
# in pandas 2.0.
interaction_rows = []

for _, row in dataframe.iterrows():
    # Deep copy so that appending to 'fq' never touches the shared template.
    params = copy.deepcopy(interaction_params)
    params['fq'].append('subject_closure: "{0}" \
                        OR subject_ortholog_closure: "{0}"'
                        .format(row['gene']))
    for doc in get_solr_results(solr_url, params):
        # "direct" when the queried gene ID is in the document's subject
        # closure; otherwise the document matched through the ortholog
        # closure part of the filter.
        if row['gene'] in doc['subject_closure']:
            qualifier = "direct"
        else:
            qualifier = "homology"
        interaction_rows.append({
            'gene': row['symbol'],
            'interactor_id': doc['object'],
            'interactor_symbol': doc['object_label'],
            'qualifier': qualifier,
        })

# 'inferred_gene' is not set here, so it is filled with NaN; dtype=object
# keeps that column able to receive gene symbols later.
interact_table = pd.DataFrame(interaction_rows, columns=columns, dtype=object)

interact_table.head(10)

Unnamed: 0,gene,interactor_id,interactor_symbol,qualifier,inferred_gene
0,FANCA,NCBIGene:4851,NOTCH1,direct,
1,FANCA,NCBIGene:6117,RPA1,direct,
2,FANCA,MGI:2384790,Fanci,homology,
3,FANCA,NCBIGene:2189,FANCG,direct,
4,FANCA,MGI:1914446,Ube2t,homology,
5,FANCA,MGI:3648788,Gm5239,homology,
6,FANCA,NCBIGene:10915,TCERG1,direct,
7,FANCA,NCBIGene:2783,GNB2,direct,
8,FANCA,MGI:1917178,Cenps,homology,
9,FANCA,MGI:3643360,Gm7808,homology,


In [96]:
# Define function to fetch orthologs given a gene ID
def get_human_ortholog(solr, gene):
    """Yield Solr association documents linking ``gene`` to human genes.

    The query keeps documents whose subject closure contains ``gene``, whose
    relation closure contains RO:HOM0000017 and whose object taxon is
    NCBITaxon:9606.
    NOTE(review): RO:HOM0000017 is presumably the orthology relation --
    confirm against the Relation Ontology.

    Args:
        solr: URL of the Solr select handler.
        gene: Identifier of the gene to look up.

    Yields:
        Each matching document, as produced by ``get_solr_results``.
    """
    filters = [
        'subject_closure: "{0}"'.format(gene),
        'relation_closure: "RO:HOM0000017"',
        'object_taxon: "NCBITaxon:9606"',
    ]
    query = {
        'wt': 'json',
        'rows': 100,
        'start': 0,
        'q': '*:*',
        'fl': 'subject, subject_label,' 'object, object_label',
        'fq': filters,
    }
    for ortholog_doc in get_solr_results(solr, query):
        yield ortholog_doc

# Resolve homology-inferred interactors to their human orthologs.
# For every row qualified as "homology", look up the interactor's human
# orthologs and add one row per ortholog, keeping the original interactor
# symbol in 'inferred_gene'.
ortholog_rows = []

homology_rows = interact_table[interact_table['qualifier'] == 'homology']
for _, row in homology_rows.iterrows():
    for doc in get_human_ortholog(solr_url, row['interactor_id']):
        ortholog_rows.append({
            'gene': row['gene'],
            'interactor_id': doc['object'],
            'interactor_symbol': doc['object_label'],
            'qualifier': "homology",
            'inferred_gene': row['interactor_symbol'],
        })

# One concat at the end replaces the per-row DataFrame.append, which was
# quadratic and no longer exists in pandas 2.0.
if ortholog_rows:
    interact_table = pd.concat(
        [
            interact_table,
            pd.DataFrame(
                ortholog_rows, columns=interact_table.columns, dtype=object
            ),
        ],
        ignore_index=True,
    )

interact_table.tail(10)


Unnamed: 0,gene,interactor_id,interactor_symbol,qualifier,inferred_gene
3111,UBE2T,NCBIGene:2187,FANCB,homology,Fancb
3112,UBE2T,NCBIGene:548593,SLX1A,homology,Slx1b
3113,UBE2T,NCBIGene:983,CDK1,homology,cdk1
3114,UBE2T,NCBIGene:57531,HACE1,homology,Hace1
3115,UBE2T,NCBIGene:6047,RNF4,homology,rnf4
3116,UBE2T,NCBIGene:643904,RNF222,homology,rnf4
3117,UBE2T,NCBIGene:26091,HERC4,homology,herc4
3118,UBE2T,NCBIGene:55215,FANCI,homology,Fanci
3119,UBE2T,NCBIGene:643904,RNF222,homology,Rnf4
3120,UBE2T,NCBIGene:6047,RNF4,homology,Rnf4


In [97]:
# Count how many rows of the interaction table mention each interactor
# symbol, then show the 30 most frequent symbols.

df = interact_table.loc[:, 'interactor_symbol'].value_counts()
df.head(n=30)

FANCI      37
HES1       34
FANCL      34
FANCD2     33
RPA1       33
CENPX      33
FANCM      33
FAAP24     32
CENPS      32
FANCB      31
FANCE      31
UBE2T      31
ERCC1      31
ERCC4      30
FANCG      30
SLX4       30
FAN1       30
ATRIP      29
FANCF      29
EME1       29
ATR        29
RPA3       29
FANCC      29
USP1       27
ZBTB32     27
MUS81      24
BRIP1      24
FAAP100    24
RPA2       22
UBA52      22
Name: interactor_symbol, dtype: int64

In [98]:
# Remove interactors that already belong to the full FA gene list, then
# remove rows whose symbol appears as the 'inferred_gene' of any row.
fa_all = (
    'https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/'
    'master/FA_gene_sets/FA_4_all_genes.txt'
)
all_genes = pd.read_csv(fa_all, sep='\t', names=['gene', 'symbol'])

filtered_frame = interact_table.loc[
    lambda table: ~table['interactor_id'].isin(all_genes['gene'].tolist())
]
filtered_frame = filtered_frame.loc[
    lambda table: ~table['interactor_symbol'].isin(
        interact_table['inferred_gene'].tolist()
    )
]

# Recount interactor symbols on the filtered rows and show the top 40.
df = filtered_frame.loc[:, 'interactor_symbol'].value_counts()
df.head(n=40)

HES1          34
RPA1          33
ERCC1         31
FAN1          30
ATRIP         29
RPA3          29
ATR           29
EME1          29
USP1          27
ZBTB32        27
MUS81         24
UBA52         22
POLN          21
BLM           21
RPS27A        20
EME2          20
CHEK1         20
TOP3A         19
TOPBP1        17
RFC4          16
RAD9A         15
RAD1          14
RAD17         14
WDR48         13
RFC5          13
RFC3          13
HES3          12
UBC           12
PCNA          12
DCLRE1A       12
HES4          12
SLX1A         11
CENPS-CORT    11
XRCC3         11
HUS1          11
RMI1          11
DCLRE1B       11
UBB           11
RPA4          10
CORT          10
Name: interactor_symbol, dtype: int64