# This script builds the topic tree for every channel by joining, level by level, each topic's parent id to the id of its parent topic.

# Load Libraries

In [None]:
from tqdm import tqdm
import pandas as pd
import os
# Make the Kaggle input directory the working directory so that dataset
# files can be opened by relative path.
os.chdir('/kaggle/input/')

# Read Data

In [None]:
# Load the topics table; the path is relative to the current working
# directory. Later cells read its channel, level, id, title and parent columns.
topics = pd.read_csv('learning-equality-curriculum-recommendations/topics.csv')
# Report the table's (rows, columns) shape as a quick sanity check.
print(f'topics: {topics.shape}')

# Create Topic Trees

In [None]:
# Build `df`: a copy of `topics` with an extra `topic_tree` column holding the
# path from the channel root to each topic, e.g. "root > unit > lesson".
#
# Each channel is processed on its own, one level at a time:
#   * level-0 topics start with their own title as the path;
#   * topics on level N + 1 are joined to their parent on level N
#     (child.parent == parent.id) and get "<parent path> > <own title>".
# Topics whose chain of parents never reaches a level-0 topic keep a missing
# (NaN) topic_tree instead of a path that starts with the string "nan".
channel_frames = []

for channel in tqdm(topics['channel'].unique()):
    channel_df = topics[topics['channel'] == channel].reset_index(drop=True)

    # Seed the column up front so it always exists, even for a channel that
    # has no level-0 rows: roots get their title, everything else is NaN.
    is_root = channel_df['level'] == 0
    channel_df = channel_df.assign(
        topic_tree=channel_df['title'].astype(str).where(is_root)
    )

    for level in sorted(channel_df['level'].unique()):
        # Parents: topics on this level whose path is already known.
        parents = (
            channel_df.loc[channel_df['level'] == level, ['id', 'topic_tree']]
            .rename(columns={'id': 'parent_id', 'topic_tree': 'parent_topic_tree'})
            .dropna(subset=['parent_topic_tree'])
        )
        # Children: topics one level deeper.
        children = (
            channel_df.loc[channel_df['level'] == level + 1, ['id', 'title', 'parent']]
            .rename(columns={'id': 'child_id', 'title': 'child_title', 'parent': 'child_parent'})
        )

        linked = parents.merge(children, left_on='parent_id', right_on='child_parent')
        if linked.empty:
            # Deepest level, or no child points at a parent on this level.
            continue

        # A child's path is its parent's path plus its own title.
        linked['topic_tree'] = (
            linked['parent_topic_tree'].astype(str)
            + ' > '
            + linked['child_title'].astype(str)
        )

        # Join the new paths back and fill only the rows that are still empty.
        channel_df = channel_df.merge(
            linked[['child_id', 'topic_tree']],
            left_on='id',
            right_on='child_id',
            how='left',
            suffixes=('', '_new'),
        ).drop(columns='child_id')
        channel_df['topic_tree'] = channel_df['topic_tree'].combine_first(
            channel_df['topic_tree_new']
        )
        channel_df = channel_df.drop(columns='topic_tree_new')

    channel_frames.append(channel_df)

# Concatenate once at the end; concatenating inside the loop re-copies the
# accumulated frame for every channel. Each channel keeps its own 0..n-1 index.
df = pd.concat(channel_frames) if channel_frames else pd.DataFrame()

# Visualize CBSE Tree

In [None]:
from collections import defaultdict
from pprint import pprint

# cbse
# Collect the topic paths of the CBSE channel ('ef2088') and fold them into a
# nested dictionary: every path segment becomes a key whose value is the
# dictionary of its children.
list_trees = list(df[df['channel'] == 'ef2088']['topic_tree'].values)

trees = list_trees
tree_dict = defaultdict(dict)

for path in trees:
    # Topics without a resolved path hold NaN (a float), which has no
    # .split(); skip them instead of failing.
    if not isinstance(path, str):
        continue
    current = tree_dict
    for node in path.split(' > '):
        # Descend into the child dictionary, creating it on first visit.
        current = current.setdefault(node, {})

# Uncomment to print the nested dictionary:
#pprint(tree_dict)

In [None]:
from graphviz import Digraph
from IPython.display import display, Image

def _collect_topic_edges(topic_id, topics: pd.DataFrame) -> list:
    """Return the (parent_id, child_id) edges of the subtree under *topic_id*.

    Edges are listed in depth-first pre-order, with siblings in the row order
    of *topics*. Assumes the parent links form a tree (no cycles).
    """
    # Build the parent -> children lookup once instead of rescanning the
    # whole frame for every visited node.
    children = {}
    for child_id, parent_id in zip(topics["id"], topics["parent"]):
        if pd.notna(parent_id):
            children.setdefault(parent_id, []).append(child_id)

    edges = []

    def walk(parent_id):
        for child_id in children.get(parent_id, ()):
            edges.append((parent_id, child_id))
            walk(child_id)

    walk(topic_id)
    return edges


def show_topic_tree(topic_id: str, topics: pd.DataFrame,
                    output_path: str = "/kaggle/working/tree"):
    """Render the subtree rooted at *topic_id* as a PNG and display it.

    Args:
        topic_id: Value of the ``id`` column identifying the root topic.
        topics: Table with ``id``, ``title`` and ``parent`` columns. It is
            only read; no columns are added to the caller's frame.
        output_path: Path (without extension) passed to Graphviz ``render``;
            the image is written next to it with a ``.png`` suffix.

    Raises:
        IndexError: If *topic_id* does not occur in ``topics["id"]``.
    """
    ids = topics["id"]
    if not (ids == topic_id).any():
        raise IndexError(f"topic id {topic_id!r} not found in topics")

    # Text shown inside each node: "<id>-<title>".
    labels = dict(zip(ids, ids.astype(str) + "-" + topics["title"].astype(str)))
    edges = _collect_topic_edges(topic_id, topics)

    # render
    G = Digraph(format="png")
    G.attr("node", shape="circle")

    # Nodes are keyed by topic id and carry the readable text as a label:
    # Graphviz edge endpoints are parsed as node[:port[:compass]], so a title
    # containing ':' must not be part of the node name.
    # Declaring the root explicitly also makes a leaf topic render as a
    # single node instead of an empty image.
    G.node(str(topic_id), label=labels[topic_id])
    for parent_id, child_id in edges:
        G.node(str(child_id), label=labels[child_id])
        G.edge(str(parent_id), str(child_id))

    # render() returns the path of the image it wrote.
    rendered = G.render(output_path)
    display(Image(filename=rendered))

In [None]:
# Draw the subtree rooted at topic "t_f468edbf3b1b" from the full topics table.
show_topic_tree(topic_id="t_f468edbf3b1b",topics=topics)