In [None]:
import pandas as pd
import re
import os
import sys 
import glob
import warnings
import time
# Suppress every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
# Put <parent of the current working directory>/src/violin at the front of the
# module search path so modules in that folder can be imported by bare name.
# NOTE(review): the path is derived from os.getcwd(), so it depends on where the
# notebook is launched from; presumably the `formatting` module imported in a
# later cell lives there - confirm.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src/violin/')))
%load_ext autoreload
%autoreload 2

### NOTES:
##### This notebook checks that the regulator lists in the model file are correct and standardizes the simulation model.

In [None]:
# Short code for each element type. Keys are written in lower case without
# spaces, which is the form the later cells normalise 'Element Type' to
# before looking it up.
type_dict = dict(
    proteinfamily='pf',
    proteincomplex='pf',
    protein='pn',
    chemical='che',
    chemicalfamily='cf',
    gene='gene',
    rna='rna',
    mutation='mut',
    biologicalprocess='bp',
)
# Vocabulary workbook. 'Full Name' is trimmed and lower-cased right after
# loading so that later lookups can compare against a normalised value.
vocab_df = pd.read_excel('input/models/vocab.xlsx', index_col=None).assign(
    **{'Full Name': lambda frame: frame['Full Name'].str.strip().str.lower()}
)

In [None]:
from formatting import get_element

# Load the model and rebuild both '<sign> Regulator List' columns from the
# corresponding '<sign> Regulation Rule' columns.
df = pd.read_excel('input/models/ModelA_biorecipe.xlsx')
# Every cell becomes text from here on; an empty spreadsheet cell (NaN) turns
# into the literal string 'nan', which the rest of the notebook uses as its
# "no value" placeholder.
df = df.astype(str)


def _regulators_from_rule(rule):
    """Return the comma-joined regulators extracted from one regulation rule.

    Parameters
    ----------
    rule : str
        Content of a '<sign> Regulation Rule' cell.

    Returns
    -------
    str
        The names produced by ``get_element(rule, 0)`` joined with ','.
        A missing rule ('' or the 'nan' placeholder) returns 'nan' without
        calling ``get_element``: after ``astype(str)`` a missing rule is never
        '', so comparing against '' alone would never detect it.
    """
    if rule.strip() in ('', 'nan'):
        return 'nan'
    return ','.join(list(get_element(rule, 0)))


for sign in ['Positive', 'Negative']:
    df[f'{sign} Regulator List'] = df[f'{sign} Regulation Rule'].map(_regulators_from_rule)

In [None]:
# Reduce every 'Element Name' to its leading run of capital letters, digits
# and underscores.
_leading_name = re.compile(r'[A-Z0-9\_]+')


def _trim_element_name(name):
    """Return the leading [A-Z0-9_]+ prefix of ``name``.

    When ``name`` does not start with such a character (for example the 'nan'
    placeholder) there is no match; the name is then returned unchanged
    instead of failing on ``None.group(0)`` halfway through the column.
    """
    found = _leading_name.match(name)
    return found.group(0) if found else name


df['Element Name'] = df['Element Name'].map(_trim_element_name)
# Keep the whole frame textual, with 'nan' standing in for missing values.
df = df.fillna('nan').astype(str)


In [None]:
# Build abb_subtype_list: one abbreviated subtype string per row of df, in row
# order, or 'nan' when the row has no subtype.

# Normalised full name -> abbreviation, built once instead of rebuilding the
# name list for every token. setdefault keeps the FIRST vocabulary row for a
# repeated name, which is what a list.index() lookup would return.
# ('Abbreivation' is the column header as spelled in the vocabulary workbook.)
_vocab_abbreviation = {}
for _full_name, _abbreviation in zip(vocab_df['Full Name'], vocab_df['Abbreivation']):
    _vocab_abbreviation.setdefault(_full_name, _abbreviation)

abb_subtype_list = []
for subtype_cell in df['Element Subtype']:
    # Only the part before the first comma is used.
    subtype_string = subtype_cell.split(',')[0]
    if subtype_string in ('', 'nan'):
        abb_subtype_list.append('nan')
        continue

    pieces = []
    # Parentheses act as word separators.
    for token in subtype_string.replace('(', ' ').replace(')', ' ').split(' '):
        # Normalise once, so the vocabulary test and the lookup always see the
        # same text (stray tabs/newlines no longer defeat the match).
        token = token.strip()
        if not token:
            continue
        # Known words are replaced by their abbreviation; unknown words are
        # kept as written.
        pieces.append(_vocab_abbreviation.get(token.lower(), token))
    abb_subtype_list.append(''.join(pieces))


# Rewrite both regulator-list columns so that every regulator, given as a
# 'Variable' value, is replaced by the standardised identifier
#   <Element Name>_<type code>_<abbreviated subtype>_<Compartment ID without ':'>
# of the row that defines that variable.
# NOTE: this cell is not idempotent - once the lists hold standardised
# identifiers, running it again will normally fail because those identifiers
# are not 'Variable' values.


def _type_code(element_type):
    """Return the short code for an 'Element Type' cell.

    The cell is lower-cased and stripped of spaces BEFORE the membership test,
    so 'Protein Family' finds the same type_dict entry as 'proteinfamily'
    (testing the raw cell would skip the abbreviation for any capitalised or
    spaced value). Types missing from type_dict fall back to the lower-case
    letters of the normalised text.
    """
    key = element_type.lower().replace(' ', '')
    if key in type_dict:
        return type_dict[key]
    return ''.join(re.findall(r'[a-z]+', key))


# Standardised identifier of every row, computed once (these columns are not
# modified by this cell).
_standard_names = [
    '{}_{}_{}_{}'.format(name, _type_code(element_type), abb_subtype, compartment.replace(':', ''))
    for name, element_type, abb_subtype, compartment in zip(
        df['Element Name'], df['Element Type'], abb_subtype_list, df['Compartment ID'])
]

# Variable -> position of the first row that declares it.
_row_of_variable = {}
for _position, _variable in enumerate(df['Variable']):
    _row_of_variable.setdefault(_variable, _position)


def _standardize_regulators(cell):
    """Translate one comma-separated regulator list into standardised names.

    Returns '' for an empty list ('' or the 'nan' placeholder).
    Raises ValueError when a regulator is not present in the 'Variable' column.
    """
    if cell.strip() in ('', 'nan'):
        return ''
    names = []
    # Format white space, separate it to list
    for regulator in cell.replace(' ', '').split(','):
        if regulator not in _row_of_variable:
            raise ValueError(f"regulator {regulator!r} is not listed in the 'Variable' column")
        names.append(_standard_names[_row_of_variable[regulator]])
    return ','.join(names)


for sign in ('Positive', 'Negative'):
    df[f'{sign} Regulator List'] = df[f'{sign} Regulator List'].map(_standardize_regulators)

In [None]:
# proofreading
# Mechanism lists: '?' becomes the 'nan' placeholder, then all spaces are removed.
for mechanism_column in ('Positive Mechanism List', 'Negative Mechanism List'):
    df[mechanism_column] = [
        cell.replace('?', 'nan').replace(' ', '') for cell in df[mechanism_column]
    ]

# Remaining list-like columns: spaces are removed, nothing else changes.
for compact_column in ('Positive Connection Type List',
                       'Negative Connection Type List',
                       'Element HGNC Symbol',
                       'Element IDs'):
    df[compact_column] = [cell.replace(' ', '') for cell in df[compact_column]]
    

# Report every row whose regulator list and connection-type list do not have
# the same number of comma-separated entries (entries = commas + 1).
# NOTE: an empty cell has no comma either, so an empty list and a
# single-entry list are indistinguishable to this check.
for row in range(len(df)):
    record = df.loc[row]
    if record['Positive Regulator List'].count(',') != record['Positive Connection Type List'].count(','):
        print(f'row: {row} positive not right')
    if record['Negative Regulator List'].count(',') != record['Negative Connection Type List'].count(','):
        print(f'row: {row} negative not right')

In [None]:
# Save your model
# Cells that are exactly 'nan' are blanked, then the workbook that was loaded
# at the start of the notebook is overwritten.
# index=False keeps the row index out of the sheet: the workbook is read back
# without index_col, so a written index would return as an extra 'Unnamed: 0'
# column and pile up with every run.
df.replace('nan', '').to_excel('input/models/ModelA_biorecipe.xlsx', index=False)