In [40]:
import json

import numpy as np
import pandas as pd

In [302]:
# Load the coded-paper table; the file is tab-separated.
df = pd.read_csv('data.tsv', delimiter='\t')

In [303]:
# Codes that can be attached to a paper: goals (G*), objectives (O*), tasks (T*).
needs_list = ['G1', 'G2', 'O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7',
              'T1', 'T2', 'T3', 'T4', 'T5']
# Pre-create one empty column per code so every code has a column even when no
# paper mentions it.  The columns are given object dtype because they will hold
# text snippets: a float64 NaN column would need an implicit upcast on the first
# string assignment, which pandas has deprecated.
for col in needs_list:
    df[col] = pd.Series(np.nan, index=df.index, dtype=object)

In [304]:
# Expand the semicolon-separated `codes` cell of every paper into per-code
# columns.  Each entry is either "LABEL" or "LABEL: snippet"; a bare label is
# stored with the placeholder snippet "general".
# Iterating with .items() yields the real index labels, which is what the
# label-based `df.loc[i, ...]` assignment below expects.
for i, row in df['codes'].items():
    if not isinstance(row, str):
        # Missing cell (NaN): this paper has no codes to expand.
        continue
    for need_string in row.split(';'):
        # Split on the FIRST colon only, so a snippet may itself contain ':'
        # (unpacking a plain split(":") would raise ValueError there).
        label, colon, snippet = need_string.partition(':')
        label = label.strip()
        if not label:
            # Empty entry (e.g. a trailing ';'): skip it rather than creating
            # a column with an empty name.
            continue
        snippet = snippet.strip() if colon else "general"

        # A numeric (e.g. all-NaN float) column cannot hold text without an
        # implicit upcast, which pandas has deprecated; convert it explicitly.
        if label in df.columns and pd.api.types.is_numeric_dtype(df[label]):
            df[label] = df[label].astype(object)
        df.loc[i, label] = snippet

In [305]:
# The raw `codes` column has been expanded into per-code columns; remove it.
df.drop(columns=['codes'], inplace=True)

In [306]:
# Column names belonging to each layer of the node-link structure.
# Stakeholder columns are every "<group>-<kind>" combination, group-major.
stakeholders = [
    '{}-{}'.format(group, kind)
    for group in ('ML', 'Domain', 'Milieu')
    for kind in ('Formal', 'Instrumental', 'Personal')
]
goals = ['G{}'.format(n) for n in (1, 2)]
objectives = ['O{}'.format(n) for n in range(1, 8)]
tasks = ['T{}'.format(n) for n in range(1, 6)]

In [307]:
# Papers with no stakeholder column filled in get a catch-all column set to
# 'general', so they still contribute stakeholder links when links are built.
# The catch-all column itself is excluded from the check and is appended to the
# list only once; otherwise re-running this cell would wipe the flags and
# duplicate the list entry.
all_nan_stakeholders = (
    df[[c for c in stakeholders if c != 'Stakeholders-Not-Specified']]
    .isnull()
    .all(axis=1)
)
# Mapping the boolean mask builds the column in one step ('general' / NaN)
# without assigning strings into a float NaN column.
df['Stakeholders-Not-Specified'] = all_nan_stakeholders.map({True: 'general', False: np.nan})
if 'Stakeholders-Not-Specified' not in stakeholders:
    stakeholders.append('Stakeholders-Not-Specified')

In [308]:
# Papers with no goal column filled in get a catch-all column set to 'general',
# so they still contribute goal links when links are built.
# The catch-all column itself is excluded from the check and is appended to the
# list only once; otherwise re-running this cell would wipe the flags and
# duplicate the list entry.
all_nan_goals = (
    df[[c for c in goals if c != 'Goal-Not-Specified']]
    .isnull()
    .all(axis=1)
)
# Mapping the boolean mask builds the column in one step ('general' / NaN)
# without assigning strings into a float NaN column.
df['Goal-Not-Specified'] = all_nan_goals.map({True: 'general', False: np.nan})
if 'Goal-Not-Specified' not in goals:
    goals.append('Goal-Not-Specified')

In [309]:
# Papers with no objective column filled in get a catch-all column set to
# 'general', so they still contribute objective links when links are built.
# The catch-all column itself is excluded from the check and is appended to the
# list only once; otherwise re-running this cell would wipe the flags and
# duplicate the list entry.
all_nan_objectives = (
    df[[c for c in objectives if c != 'Obj-Not-Specified']]
    .isnull()
    .all(axis=1)
)
# Mapping the boolean mask builds the column in one step ('general' / NaN)
# without assigning strings into a float NaN column.
df['Obj-Not-Specified'] = all_nan_objectives.map({True: 'general', False: np.nan})
if 'Obj-Not-Specified' not in objectives:
    objectives.append('Obj-Not-Specified')

In [310]:
# Build one link record per (source, target) pair that co-occurs in a paper,
# for each pair of adjacent layers:
#   level 0: stakeholders -> goals
#   level 1: goals        -> objectives
#   level 2: objectives   -> tasks
layer_groups = [stakeholders, goals, objectives, tasks]
link_data = []
for i in range(len(df)):
    filled_columns = df.loc[i].dropna().keys()
    # Columns filled in for this paper, bucketed by layer (row column order kept).
    per_layer = [
        [name for name in filled_columns if name in group]
        for group in layer_groups
    ]
    # Pair every layer with the next one; the pair's position is the link level.
    for level, (sources, targets) in enumerate(zip(per_layer, per_layer[1:])):
        link_data.extend(
            {'source': src, 'target': dst, 'level': level, 'paper': i}
            for src in sources
            for dst in targets
        )

In [311]:
# Collapse the per-paper links into one group per (source, target) pair.
gb = pd.DataFrame(link_data).groupby(by=['source', 'target'])
# Number of per-paper link records in each pair.
count = gb.size()
# Mean of the 'level' values within each pair (a float).
level = gb['level'].mean()
# Papers in which each pair occurs.
paper_list = gb['paper'].apply(list)

In [312]:
# Index with one (source, target) entry per group, in the order the groupby reports them.
idx = pd.MultiIndex.from_tuples(list(gb.groups), names=['source', 'target'])

In [313]:
# One row per (source, target) pair, combining the three per-pair aggregates.
agg_df = pd.DataFrame(
    {'paper_list': paper_list, 'level': level, 'count': count},
    index=idx,
)

In [314]:
# Move (source, target) out of the index into ordinary columns, then serialise
# the frame as a JSON array with one object per row.
agg_df_json = (
    agg_df
    .reset_index()
    .to_json(orient='records')
)

In [315]:
# Parse the JSON string back into plain Python objects (a list of link records).
# NOTE(review): no later cell in this view reads `agg_link_data`; the exported
# structure is built from the un-aggregated `link_data` -- confirm which one
# the consumer of node_link_data.json expects.
agg_link_data = json.loads(agg_df_json)

In [336]:
def _make_node(frame, column, category):
    """Build the node record for one column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one row per paper.
    column : str
        Column the node represents; it becomes the node's ``name``.
    category : str
        Layer label stored under ``category``.

    Returns
    -------
    dict
        ``name``, ``category``, ``paper_list`` (index labels of the rows where
        the column is not null) and ``snippets`` (index label -> cell value for
        those rows, leaving out the placeholder value ``'general'``).
    """
    # Filter once and reuse the result for both the paper list and the snippets.
    filled = frame.loc[frame[column].notnull(), column]
    return {
        'name': column,
        'category': category,
        'paper_list': list(filled.index),
        'snippets': {key: value for key, value in filled.to_dict().items() if value != 'general'},
    }


# One node per column, layer by layer, in the same order as the layer lists.
nodes = [
    _make_node(df, column, category)
    for category, columns in (
        ('knowledge', stakeholders),
        ('goals', goals),
        ('objectives', objectives),
        ('tasks', tasks),
    )
    for column in columns
]

In [338]:
# Display the assembled node records.
nodes

[{'name': 'ML-Formal',
  'category': 'knowledge',
  'paper_list': [3, 4, 5, 12, 13, 14, 16, 17, 20, 33, 37, 38, 41, 45, 53, 54],
  'snippets': {3: 'AI experts are machine learning scientists and engineers who design machine learning algorithms',
   4: 'Experts: This group of users are experienced in deep learning, and they wish to have a brief idea of the research field in deep learning visualization.',
   5: 'model builders',
   12: 'Model developers',
   13: ' ML Engineers',
   14: 'model developers',
   16: 'The very experts who understand decision-making models',
   17: 'DNN engineers ',
   20: 'Creators',
   33: 'machine learning researchers',
   37: 'Model-developers and builders',
   38: 'ML experts, which are people capable of building, training and testing machine learning models with different datasets from different domains.',
   41: 'Developers and AI researchers: investigators in AI, software developers, or data analysts who create the AI system.  KN: * Lay users: "with no

In [337]:
# Bundle nodes and links into a single structure for export.
# NOTE(review): 'links' holds the raw per-paper `link_data`, not the aggregated
# per-pair records computed earlier in the notebook -- confirm this is intended.
node_link_data = dict(nodes=nodes, links=link_data)

In [339]:
# Write the node/link structure to disk as JSON.
with open('node_link_data.json', mode='w') as out_file:
    json.dump(node_link_data, out_file)

In [340]:
# Export the bibliographic fields of every paper; reset_index() turns the row
# index into an ordinary column so it is written out with each record.
# The frame is serialised with pandas and parsed back so that `bibData` is a
# list of plain Python records that json.dump can write.
bibData = json.loads(
    df.loc[:, ['author', 'year', 'title']]
    .reset_index()
    .to_json(orient='records')
)
with open('bib.json', mode='w') as out_file:
    json.dump(bibData, out_file)