In [1]:
# Load pandas, used to read the metric dataframe file
import pandas as pd

In [2]:
# Read the metric records from the CSV file into a pandas DataFrame.
# NOTE(review): getSankeyData in this file reads a global named `metric_df`,
# while this cell binds the frame to `df` -- confirm which name is intended.
df = pd.read_csv('metric_dataframe.csv')

## Functions to access the data as Python data structures

In [3]:
def getItemsInDict(text):
    """
    This function takes a string holding semicolon-separated items, each written as
    ``index) label``, and extracts the information into a dictionary that maps
    each index to its label.

     sample: '"VI) EDA; VII) Lines (LOC)"' -> {'VI': 'EDA', 'VII': 'Lines (LOC)'}

    Double quotes are removed before parsing, and empty or whitespace-only items
    (for example the one left behind by a trailing ';') are ignored. When the same
    index appears more than once, the last label wins.

    params:

        text: string

    returns:

        dictionary with extracted information (index -> label)

    raises:

        IndexError if a non-blank item contains no ')'
    """
    # remove additional quotes
    text_no_quotes = text.replace('"', '')

    pre_met = {}
    for item in text_no_quotes.split(';'):
        # a trailing '; ' leaves a blank item behind; there is nothing to parse in it
        if not item.strip():
            continue
        # split on the FIRST ')' only, so labels such as 'Lines (LOC)' stay intact
        parts = item.split(')', 1)
        label = parts[1].strip()
        pre_met[parts[0].strip()] = label
    return pre_met


def getRelationshipData(data,metrics_org,outcome_smaller):
    """
    This function processes relationship codes and relates the metric and outcome indices to the
    actual metrics and outcomes.

    The relationship string holds semicolon-separated items of the form
    ``metric_indices-outcome_indices:method[:type]`` where each index list is comma-separated,
    e.g. '1,2-1:corr'. Every metric index of an item is paired with every outcome index of
    that item. Double quotes are removed before parsing and blank items are ignored.

    params:

        data  : relationship string

        metrics_org: dictionary mapping metric index -> actual metric reported in the paper
        outcome_smaller: dictionary mapping outcome index -> actual outcome reported in the paper

    returns:

        list of (metric, outcome, method) tuples, in the order they appear in the string.
        Pairs whose metric or outcome index is missing from the given dictionaries are skipped.

    raises:

        IndexError if a non-blank item has no ':' or its first part has no '-'
    """
    text_no_quotes = data.replace('"', '')

    rel_tuples = []
    for rel in text_no_quotes.split(';'):
        # a trailing '; ' leaves a blank item behind; there is nothing to parse in it
        if not rel.strip():
            continue
        parts = rel.split(':')
        rel_method = parts[1]
        rel_parts = parts[0].split('-')
        metrics = [item.strip() for item in rel_parts[0].split(',')]
        outcomes = [item.strip() for item in rel_parts[1].split(',')]
        for metric in metrics:
            for outcome in outcomes:
                try:
                    rel_tuples.append((metrics_org[metric],outcome_smaller[outcome],rel_method))
                except KeyError:
                    # best effort: an index that was not reported for this paper is skipped
                    continue
    return rel_tuples

In [None]:
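## Prepare data for plotting the Sankey diagram
We are building, for each paper, a table of source → target links (each with a weight and a relationship type) from which the Sankey diagram is drawn.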

In [None]:
# prepare data from sa    
def addSourceTarget(source_id,target_id,link,value):
    """
    Record one occurrence of the link (source_id, target_id).

    If the pair is already present in ``link`` the counter at the same position in
    ``value`` is incremented; otherwise the pair is appended to ``link`` and a
    counter of 1 is appended to ``value``. Both lists are modified in place.

    params:

        source_id: identifier of the source node
        target_id: identifier of the target node
        link: list of (source_id, target_id) tuples seen so far
        value: list of counters, parallel to ``link``

    returns:

        the same ``link`` and ``value`` lists, as a tuple
    """
    pair = (source_id, target_id)
    try:
        position = link.index(pair)
    except ValueError:
        # first time this pair is seen
        link.append(pair)
        value.append(1)
    else:
        value[position] += 1

    return link, value
    
def getSankeyData(paper_id):
    """
    Build the table of links used to draw the sankey diagram for one paper.

    The paper's record is looked up in the global ``metric_df`` by its 'paper_id'
    column (the first matching row is used). Its 'metric', 'metric_smaller',
    'metric_larger', 'outcome_smaller' and 'outcome_larger' cells are parsed with
    getItemsInDict and its 'relationship' cell with getRelationshipData. Four kinds
    of rows are produced, in this order:

        'met_larger_smaller' : larger metric group  -> smaller metric group
        'met_smaller_org'    : smaller metric group -> metric reported in the paper
        'met_out'            : metric               -> outcome (from the relationship codes)
        'out_smaller_larger' : smaller outcome group -> larger outcome group

    params:

        paper_id: value of the 'paper_id' column identifying the paper

    returns:

        DataFrame with columns paper_id, year, source, target, weight, rel_type;
        every row has weight 1

    raises:

        KeyError if no row of ``metric_df`` has the given paper_id
    """
    columns = ['paper_id','year','source','target','weight','rel_type']

    # Select by the paper_id column itself rather than assuming the row label is
    # paper_id - 1, which only holds for an unfiltered frame with consecutive ids.
    matches = metric_df.loc[metric_df['paper_id'] == paper_id]
    if matches.empty:
        raise KeyError('no record found for paper_id {}'.format(paper_id))
    paper_record = matches.iloc[0]

    year = paper_record['year']
    metrics_org = getItemsInDict(paper_record['metric'])
    metrics_larger = getItemsInDict(paper_record['metric_larger'])
    metrics_smaller = getItemsInDict(paper_record['metric_smaller'])

    outcome_larger = getItemsInDict(paper_record['outcome_larger'])
    outcome_smaller = getItemsInDict(paper_record['outcome_smaller'])

    relationship = getRelationshipData(paper_record['relationship'],metrics_org,outcome_smaller)

    def make_record(source,target,rel_type):
        # one link row; every link counts once
        return {
            'paper_id':paper_id,
            'year':year,
            'source':source,
            'target':target,
            'weight':1,
            'rel_type':rel_type
        }

    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    # NOTE(review): the three metric dictionaries are paired by position, not by key;
    # this presumably relies on them listing the same metrics in the same order -- confirm.
    for a,b,c in zip(metrics_larger,metrics_smaller,metrics_org):
        records.append(make_record(metrics_larger[a],metrics_smaller[b],'met_larger_smaller'))
        records.append(make_record(metrics_smaller[b],metrics_org[c],'met_smaller_org'))

    for relation in relationship:
        records.append(make_record(relation[0],relation[1],'met_out'))

    # NOTE(review): outcome groups are paired by position as well -- same assumption as above.
    for a,b in zip(outcome_larger,outcome_smaller):
        records.append(make_record(outcome_smaller[b],outcome_larger[a],'out_smaller_larger'))

    return pd.DataFrame(records,columns=columns)