In [1]:
import os
from collections import defaultdict
from pprint import pformat

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
def label_dist(df, single=True):
    """Count how often each label occurs in the ``type`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``type`` column. Cells that are not strings (for example
        NaN for a missing value) are skipped.
    single : bool, default True
        If True, every cell is split on ``'/'`` and each piece is counted on
        its own, so ``'A/B'`` adds one to ``'A'`` and one to ``'B'``.
        If False, the whole cell is counted as one combined label.

    Returns
    -------
    collections.defaultdict
        Maps label -> number of occurrences, keyed in order of first
        appearance. Being a ``defaultdict(int)``, looking up an unseen label
        returns 0 (and inserts it).
    """
    dist = defaultdict(int)

    # Walk the column itself: iterrows() would build a full Series for every
    # row just to read this one field.
    for label in df['type']:
        if not isinstance(label, str):
            continue
        pieces = label.split('/') if single else (label,)
        for piece in pieces:
            dist[piece] += 1
    return dist

In [3]:
def no_prereq(df):
    """Return the rows of ``df`` whose ``prereq`` cell is null."""
    prereq_missing = df['prereq'].isna()
    return df[prereq_missing]

In [4]:
def no_subseq(df):
    """Return the rows of ``df`` whose ``subseq`` cell is null."""
    subseq_missing = df['subseq'].isna()
    return df[subseq_missing]

In [5]:
def dataframe_to_edges(df, key, cols, delim=''):
    """Yield ``(source, target)`` pairs read from delimited columns of ``df``.

    For every row, the value in column ``key`` is the source. Each column in
    ``cols`` whose cell is a string is split on ``delim`` and every resulting
    piece is yielded as a target of that source. Non-string cells (such as
    NaN for a missing value) produce nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to read.
    key : str
        Column holding the source value of each row.
    cols : iterable of str
        Columns holding delimited target values.
    delim : str, default ''
        Separator between targets inside a cell. An empty delimiter means
        "do not split": the whole cell is yielded as a single target.

    Yields
    ------
    tuple
        ``(source, target)`` in row order, then in ``cols`` order, then in
        the order the targets appear in the cell.
    """
    for _, row in df.iterrows():
        src = row[key]
        for col in cols:
            cell = row[col]
            if not isinstance(cell, str):
                continue
            # str.split('') raises "ValueError: empty separator", so the
            # default delimiter has to bypass the split entirely.
            targets = cell.split(delim) if delim else [cell]
            for target in targets:
                yield (src, target)

In [6]:
def dataframe_to_graph(df):
    """Build an undirected ``networkx.Graph`` from the moves table ``df``.

    Edges link each row's ``name`` to every entry of its ``', '``-separated
    ``prereq`` and ``subseq`` cells. Rows where both of those cells are null
    produce no edge of their own, so their names are added as nodes
    explicitly.
    """
    graph = nx.Graph(
        dataframe_to_edges(df, 'name', ['prereq', 'subseq'], ', ')
    )

    # Rows with neither a prereq nor a subseq entry.
    unlinked = no_subseq(no_prereq(df))
    for _, move in unlinked.iterrows():
        graph.add_node(move['name'])

    return graph

In [7]:
# Location of the moves table. The default is the path this notebook was
# originally run against; set PARKOUR_MOVES_CSV to use a copy elsewhere.
src = os.environ.get(
    'PARKOUR_MOVES_CSV',
    '/media/ch3njus/Seagate4TB/research/parkourtheory/data/database/latest/moves.csv',
)
df = pd.read_csv(src, dtype={'id': int})

# ml: each combined 'type' cell counted as one label.
# sl: each '/'-separated piece of a 'type' cell counted separately.
ml = label_dist(df, single=False)
sl = label_dist(df, single=True)
print(f'multi-label: {len(ml)}\tsingle-label: {len(sl)}')

In [8]:
# Build the graph of moves from the table held in `df`.
G = dataframe_to_graph(df)

In [9]:
# Number of nodes in G.
G.number_of_nodes()

1089

In [10]:
# Number of edges in G.
G.number_of_edges()

1120