In [1]:
import pandas as pd
import re
import os
import sys 
import glob
import warnings
import time
warnings.filterwarnings('ignore')
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src/violin/')))
%load_ext autoreload
%autoreload 2
from violin.in_out import preprocessing_model, preprocessing_reading, output
from violin.scoring import score_reading
from violin.network import node_edge_list
from violin.visualize_violin import visualize 


In [5]:
## Parameters ## 
approach = '1'
kind_dict = {"strong corroboration" : 2, 
                "empty attribute" : 1,
                "indirect interaction" : 3,
                "path corroboration" : 5,
                "specification" : 7,
                "hanging extension" : 40, 
                "full extension" : 39, 
                "internal extension" : 38,  
                "dir contradiction" : 11,
                "sign contradiction" : 10, 
                "att contradiction" : 9,
                "dir mismatch" : 20,
                "path mismatch" : 19,
                "self-regulation" : 18}
match_dict = {"source present" : 1, 
                "target present" : 100, 
                "both present" : 10, 
                "neither present" : 0.1}
evidence_scoring_cols = ["Regulator Name", "Regulator Type", "Regulator Subtype", "Regulator HGNC Symbol", "Regulator Database", "Regulator ID", "Regulator Compartment", "Regulator Compartment ID",
                        "Regulated Name", "Regulated Type", "Regulated Subtype", "Regulated HGNC Symbol", "Regulated Database", "Regulated ID", "Regulated Compartment", "Regulated Compartment ID",
                        "Sign", "Connection Type", "Mechanism", "Site",
                        "Cell Line", "Cell Type", "Tissue Type", "Organism"]
attributes = ['Regulated Compartment ID', 'Regulator Compartment ID']
#attributes = ['Regulated Compartment ID', 'Regulator Compartment ID', 'Mechanism', 'Cell Line', 'Cell Type', 'Tissue Type', 'Organism']

In [16]:
reader = 'LLAMA'
model_files = ['input/models/SkMel133_biorecipe.xlsx', 'input/models/ModelB_discrete_biorecipe.xlsx']
reading_A_files = glob.glob(f'input/interactions/{reader}/RA*.xlsx')
reading_B_files = glob.glob(f'input/interactions/{reader}/RB*.xlsx')

model_A_df = preprocessing_model(model_files[0])
model_B_df = preprocessing_model(model_files[1])

graph_A = node_edge_list(model_A_df)
graph_B = node_edge_list(model_B_df)

## Data collections

In [17]:
#reader = 'REACH'
for reading_file in reading_A_files:
    output_file = f'output/{reader}' + '/' + reading_file.split('/')[-1].split('_reading_BioRECIPE')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file, 
                                       evidence_score_cols=evidence_scoring_cols, 
                                       atts=attributes)
    counter_A = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df, 
                       model_A_df, 
                       graph_A, 
                       counter=counter_A,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach,
                       )
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

for reading_file in reading_B_files:
    output_file = f'output/{reader}' + '/' + reading_file.split('/')[-1].split('_reading_BioRECIPE')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file, 
                                       evidence_score_cols=evidence_scoring_cols, 
                                       atts=attributes)
    counter_B = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df, 
                       model_B_df, 
                       graph_B, 
                       counter=counter_B,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach)
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_B['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_B['contradiction']))))


output/LLAMA/RA4
13
0.27515578269958496
corroboration in model: 0
contradiction in model: 0
output/LLAMA/RA3
183
1.5431809425354004
corroboration in model: 2
contradiction in model: 4
output/LLAMA/RA2
567
3.1756250858306885
corroboration in model: 3
contradiction in model: 9
output/LLAMA/RB_star_1
163
0.6803100109100342
corroboration in model: 0
contradiction in model: 0
output/LLAMA/RB_star_2
60
0.37079811096191406
corroboration in model: 0
contradiction in model: 0
output/LLAMA/RB1
171
0.7470047473907471
corroboration in model: 0
contradiction in model: 1
output/LLAMA/RB2
58
0.3725299835205078
corroboration in model: 0
contradiction in model: 0
output/LLAMA/RB3
55
0.34453916549682617
corroboration in model: 0
contradiction in model: 0


### TESTING task 1 & 2

#### Convert model format to interactions list to verify the corroborations

In [3]:
from translators.within_biorecipe.md_and_int import get_interactions_from_model

In [12]:
attributes = ['Regulated Compartment ID', 'Regulator Compartment ID', 'Mechanism', 'Cell Line', 'Cell Type', 'Tissue Type', 'Organism']
# Switch format
interactions_A = os.getcwd() + '/input/interactions/translated_SkeMel133_biorecipe.xlsx'
interactions_B = os.getcwd() + '/input/interactions/translated_ModelB_discrete_biorecipe.xlsx'

get_interactions_from_model(os.getcwd() + '/' + model_files[0], interactions_A)
get_interactions_from_model(os.getcwd() + '/' + model_files[1], interactions_B)

interactions_A_df = pd.read_excel(interactions_A, index_col=None)
interactions_B_df = pd.read_excel(interactions_B, index_col=None)

In [13]:
# Test in VIOLIN
reading_A_df = preprocessing_reading(reading=interactions_A, 
                                     evidence_score_cols=evidence_scoring_cols, 
                                     atts=attributes)
counter_A = {'corroboration': [], 'contradiction': []}
scored = score_reading(reading_A_df, 
                       model_A_df, 
                       graph_A, 
                       counter=counter_A,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach,
                       )
output_file = f'test/test_result' + '/' + interactions_A.split('/')[-1].split('_biorecipe')[0]
output(scored, output_file, kind_values=kind_dict)
print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

reading_B_df = preprocessing_reading(reading=interactions_B,
                                     evidence_score_cols=evidence_scoring_cols,
                                     atts=attributes)
counter_B = {'corroboration': [], 'contradiction': []}
scored = score_reading(reading_B_df, 
                       model_B_df, 
                       graph_B, 
                       counter=counter_B,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach)
output_file = f'test/test_result' + '/' + interactions_B.split('/')[-1].split('_biorecipe')[0]
output(scored, output_file, kind_values=kind_dict)
print('corroboration in model: {}'.format(len(set(counter_B['corroboration']))))
print('contradiction in model: {}'.format(len(set(counter_B['contradiction']))))


266
corroboration in model: 264
contradiction in model: 0
72
corroboration in model: 71
contradiction in model: 0


### TESTING Task 3 & 4

#### Randomly sampled several interactions to test if VIOLIN can get corroborations

In [8]:
# Randomly choose some interactions from modelA
import random 
random.seed(10)

random_A_df = reading_A_df.sample(n=50).reset_index()
random_B_df = reading_B_df.sample(n=25).reset_index()

counter_A = {'corroboration': [], 'contradiction': []}
scored = score_reading(random_A_df, 
                       model_A_df, 
                       graph_A, 
                       counter=counter_A,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach,
                       )
output_file = f'test/test_result' + '/' + interactions_A.split('/')[-1].split('_biorecipe')[0]
output(scored, output_file, kind_values=kind_dict)
print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

counter_B = {'corroboration': [], 'contradiction': []}
scored = score_reading(random_B_df, 
                       model_B_df, 
                       graph_B, 
                       counter=counter_B,
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach)
output_file = f'test/test_result' + '/' + interactions_B.split('/')[-1].split('_biorecipe')[0]
output(scored, output_file, kind_values=kind_dict)
print('corroboration in model: {}'.format(len(set(counter_B['corroboration']))))
print('contradiction in model: {}'.format(len(set(counter_B['contradiction']))))


50
corroboration in model: 50
contradiction in model: 0
25
corroboration in model: 25
contradiction in model: 0


### TESTING Task 5 & 6 

#### Use RA2.0.1.1 to verify if VIOLIN can get extensions correctly

In [9]:
# import RA2.0.1.1 to verify the extension
extension_reading = os.getcwd() + '/'+ 'input/interactions/example/RA2_0_1_1_reading_BioRECIPE.xlsx'
extension_A_df = preprocessing_reading(reading=extension_reading, 
                                     evidence_score_cols=evidence_scoring_cols, 
                                     atts=attributes)
counter_A = {'corroboration': [], 'contradiction': []}
scored = score_reading(extension_A_df, 
                       model_A_df, 
                       graph_A, 
                       counter=counter_A, 
                       kind_values=kind_dict, 
                       match_values=match_dict, 
                       attributes=attributes, 
                       classify_scheme=approach, 
                       )
output_file = f'test/test_result' + '/' + extension_reading.split('/')[-1].split('_reading_BioRECIPE')[0]
output(scored, output_file, kind_values=kind_dict)
print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

1006
corroboration in model: 23
contradiction in model: 13


## TESTING Task 7&8

#### FLUTE reading files

In [19]:
import httplib2 as http
import json
import time

try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse 
    
headers = {
    "Accept": 'application/json'
}

h = http.Http()
def get_hgnc_symbol(hgnc_id, url='https://rest.genenames.org/fetch/hgnc_id'):
    response, content = h.request(
    url+f'/{hgnc_id}',
    'GET',
    '',
    headers
    )
    print(hgnc_id)
    data = json.loads(content)
    status_code = False; i = 0; symbol = ''
    while status_code != True and i < 10:
        try:
            i += 1
            response, content = h.request(
                url+f'/{hgnc_id}',
                'GET',
                '',
                headers)
            
            if response['status'] == '200':
                symbol = data['response']['docs'][0]['symbol']
                status_code = True
            else:
                pass
        except Exception as e:
            print(e)
            time.sleep(1)
            
    return symbol

In [20]:
j = get_hgnc_symbol(11803)

11803
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range


In [11]:
re.findall(r'[0-9]+', 'HGNC: 5464')[0]

'5464'

In [None]:
dict_ = {}

In [53]:
### Switch HGNC ID to HGNC symbol
for machine in ['INDRA']:
    for file in glob.glob(f'input/interactions/FLUTE/{machine}/*.xlsx'):
        if 'grd_ints_scores' in file:
            pass
        
        else:
            df = pd.read_excel(file)
            basename = file.split('/')[-1]
            print(basename)
            if 'Regulated HGNC ID' in df.columns:
                list_ = []
                for x in df['Regulated HGNC ID'].to_list():
                    if len(re.findall(r'[0-9]+', str(x))) != 0:
                        x_id = re.findall(r'[0-9]+', str(x))[0]
                        if x_id in list(dict_.keys()):
                            list_.append(dict_[x_id])
                        else:
                            if x_id != '':
                                dict_[x_id] = get_hgnc_symbol(x_id)
                                list_.append(dict_[x_id])
                            else: pass

                    else:
                        list_.append('')
                df['Regulated HGNC Symbol'] = list_
                df.drop('Regulated HGNC ID', inplace=True, axis=1)
            else:
                pass 
            
            if 'Regulator HGNC ID' in df.columns:
                list_ = []
                for x in df['Regulator HGNC ID'].to_list():
                    if len(re.findall(r'[0-9]+', str(x))) != 0:
                        x_id = re.findall(r'[0-9]+', str(x))[0]
                        if x_id in list(dict_.keys()):
                            list_.append(dict_[x_id])
                        else:
                            if x_id != '':
                                dict_[x_id] = get_hgnc_symbol(x_id)
                                list_.append(dict_[x_id])
                            else: pass

                    else:
                        list_.append('')
                df['Regulator HGNC Symbol'] = list_
                df.drop('Regulator HGNC ID', inplace=True, axis=1)
            
            else:
                pass
            
            df.to_excel(file, index=False)

RB2_reading_BioRECIPE_FLUTE_filtered.xlsx
RA4_reading_BioRECIPE_FLUTE_filtered.xlsx
RB3_reading_BioRECIPE_FLUTE_filtered.xlsx
RB1_reading_BioRECIPE_FLUTE_filtered.xlsx
RA1_reading_BioRECIPE_FLUTE_filtered.xlsx
RA2_reading_BioRECIPE_FLUTE_filtered.xlsx
RA3_reading_BioRECIPE_FLUTE_filtered.xlsx
RB_star_2_reading_BioRECIPE_FLUTE_filtered.xlsx
RB_star_1_reading_BioRECIPE_FLUTE_filtered.xlsx


In [None]:
dict_

#### test tab data 

In [38]:
testing_A_readings = ["RA2",
                    "RA21",
                    "RA2_0_1",
                    "RA2_0_1_1"]
testing_A_readings = [f'input/interactions/example/{x}_reading_BioRECIPE.xlsx' for x in testing_A_readings]

testing_B_readings = ["RB2",
                      "RB21",
                      "RB2_0_1"]
testing_B_readings = [f'input/interactions/example/{x}_reading_BioRECIPE.xlsx' for x in testing_B_readings]

for reading_file in testing_A_readings:
    output_file = f'test/test_result' + '/' + reading_file.split('/')[-1].split('_reading_BioRECIPE')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file,
                                       evidence_score_cols=evidence_scoring_cols,
                                       atts=attributes)
    counter_A = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df,
                           model_A_df,
                           graph_A,
                           counter=counter_A,
                           kind_values=kind_dict,
                           match_values=match_dict,
                           attributes=attributes,
                           classify_scheme=approach,
                           )
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

for reading_file in testing_B_readings:
    output_file = f'test/test_result' + '/' + reading_file.split('/')[-1].split('_reading_BioRECIPE')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file,
                                       evidence_score_cols=evidence_scoring_cols,
                                       atts=attributes)
    counter_B = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df,
                           model_B_df,
                           graph_B,
                           counter=counter_B,
                           kind_values=kind_dict,
                           match_values=match_dict,
                           attributes=attributes,
                           classify_scheme=approach)
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_B['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_B['contradiction']))))


test/test_result/RA2
5725
18.219555139541626
corroboration in model: 34
contradiction in model: 32
test/test_result/RA21
918
3.2124831676483154
corroboration in model: 9
contradiction in model: 19
test/test_result/RA2_0_1
2584
8.524222373962402
corroboration in model: 31
contradiction in model: 26
test/test_result/RA2_0_1_1
1006
3.481868028640747
corroboration in model: 23
contradiction in model: 13
test/test_result/RB2
163
0.524526834487915
corroboration in model: 9
contradiction in model: 1
test/test_result/RB21
37
0.2357780933380127
corroboration in model: 0
contradiction in model: 0
test/test_result/RB2_0_1
102
0.4438438415527344
corroboration in model: 5
contradiction in model: 0


#### FLUTE tab data

In [9]:
attributes = ["Regulated Compartment", "Regulator Compartment"]

#for machine in ['gpt', 'INDRA', 'LLAMA', 'REACH']:
for machine in ['gpt']:
    print(f'=================={machine}=================')
    for file in glob.glob(f'input/interactions/FLUTE/{machine}/*.xlsx'):
        if 'grd_ints_scores' in file:
            pass
        
        else:
            output_file = f'output/FLUTE' + f'/{machine}/' + file.split('/')[-1].split('_reading_BioRECIPE')[0]
            df = pd.read_excel(file)
            basename = file.split('/')[-1]
            print(basename)
            # Choose model dataframe
            if re.findall(r'^RA', basename):
                model_df = model_A_df
                graph = graph_A
            else:
                model_df = model_B_df
                graph = graph_B
            print(f'model length: {len(model_df)}')
            # process through VIOLIN 
            time1 = time.time()
            reading_df = preprocessing_reading(reading=file,
                                               evidence_score_cols=evidence_scoring_cols,
                                               atts=attributes)
            counter_ = {'corroboration': [], 'contradiction': []}
            scored = score_reading(reading_df,
                                   model_df,
                                   graph,
                                   counter=counter_,
                                   kind_values=kind_dict,
                                   match_values=match_dict,
                                   attributes=attributes,
                                   classify_scheme=approach)
            output(scored, output_file, kind_values=kind_dict)
            print(time.time() - time1)
            print('corroboration in model: {}'.format(len(set(counter_['corroboration']))))
            print('contradiction in model: {}'.format(len(set(counter_['contradiction']))))1

RB2_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 39
104
0.4778470993041992
corroboration in model: 1
contradiction in model: 1
RA4_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 179
8
0.15981507301330566
corroboration in model: 1
contradiction in model: 0
RB3_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 39
66
0.41270875930786133
corroboration in model: 0
contradiction in model: 2
RB1_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 39
259
0.964310884475708
corroboration in model: 0
contradiction in model: 1
RA2_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 179
1158
7.202918767929077
corroboration in model: 18
contradiction in model: 37
RA3_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 179
254
1.7872989177703857
corroboration in model: 4
contradiction in model: 10
RB_star_2_reading_BioRECIPE_FLUTE_filtered.xlsx
model length: 39
58
0.3278920650482178
corroboration in model: 0
contradiction in model: 0
RB_star_1_reading_BioRECIPE_FLUTE_filtered.xlsx
model

## TESTING TASK 9 & 10

#### Randomly added contradiction interactions to see if there is any influence on mixing

In [13]:
testing_A_readings = ["translated_SkeMel133_biorecipe.xlsx",
                    "translated_SkeMel133_biorecipe_combined_10contradictions.xlsx",
                    "translated_SkeMel133_biorecipe_combined_10randoms.xlsx"]
testing_A_readings = [f'input/interactions/{x}' for x in testing_A_readings]

testing_B_readings = ["translated_ModelB_discrete_biorecipe.xlsx",
                      "translated_ModelB_discrete_biorecipe_combined_10contradictions.xlsx",
                      "translated_ModelB_discrete_biorecipe_combined_10randoms.xlsx"]
testing_B_readings = [f'input/interactions/{x}' for x in testing_B_readings]

for reading_file in testing_A_readings:
    output_file = f'test/test_result' + '/' + reading_file.split('/')[-1].split('.')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file,
                                       evidence_score_cols=evidence_scoring_cols,
                                       atts=attributes)
    counter_A = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df,
                           model_A_df,
                           graph_A,
                           counter=counter_A,
                           kind_values=kind_dict,
                           match_values=match_dict,
                           attributes=attributes,
                           classify_scheme=approach,
                           )
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_A['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_A['contradiction']))))

for reading_file in testing_B_readings:
    output_file = f'test/test_result' + '/' + reading_file.split('/')[-1].split('.')[0]
    print(output_file)
    time1 = time.time()
    reading_df = preprocessing_reading(reading=reading_file,
                                       evidence_score_cols=evidence_scoring_cols,
                                       atts=attributes)
    counter_B = {'corroboration': [], 'contradiction': []}
    scored = score_reading(reading_df,
                           model_B_df,
                           graph_B,
                           counter=counter_B,
                           kind_values=kind_dict,
                           match_values=match_dict,
                           attributes=attributes,
                           classify_scheme=approach)
    output(scored, output_file, kind_values=kind_dict)
    print(time.time() - time1)
    print('corroboration in model: {}'.format(len(set(counter_B['corroboration']))))
    print('contradiction in model: {}'.format(len(set(counter_B['contradiction']))))

test/test_result/translated_SkeMel133_biorecipe
266
1.4218220710754395
corroboration in model: 264
contradiction in model: 0
test/test_result/translated_SkeMel133_biorecipe_combined_10contradictions
276
1.0342469215393066
corroboration in model: 264
contradiction in model: 5
test/test_result/translated_SkeMel133_biorecipe_combined_10randoms
275
1.3475852012634277
corroboration in model: 264
contradiction in model: 1
test/test_result/translated_ModelB_discrete_biorecipe
72
0.28899717330932617
corroboration in model: 71
contradiction in model: 0
test/test_result/translated_ModelB_discrete_biorecipe_combined_10contradictions
83
0.28201889991760254
corroboration in model: 71
contradiction in model: 4
test/test_result/translated_ModelB_discrete_biorecipe_combined_10randoms
83
0.5087399482727051
corroboration in model: 71
contradiction in model: 0
