In [1]:
#Load data and test data
import os
import json


# Root folder of the Udemy dataset and the category sub-folders to load from it.
udemy_folder = "udemy_data"
udemydata_folders = [
    'dev',
    'acad',
    "it",
    'mkt',
    'op'
]

# Collect the path of every file found inside each category folder.
data_paths = list()
for subf in udemydata_folders:
    folder_path = os.path.join(udemy_folder, subf)
    for f in os.listdir(folder_path):
        data_paths.append(os.path.join(folder_path, f))

# Parse every course file as JSON. The `with` block closes each file handle
# right after it is parsed instead of leaving thousands of them open until
# the garbage collector gets to them.
udemy_data = list()
for d_path in data_paths:
    with open(d_path) as course_file:
        udemy_data.append(json.load(course_file))

print("Done")
# Sanity check: show the "t" field (the title) of one loaded course.
print(udemy_data[1000]["t"])

Done
Swift 2.0 and Sprite Kit Basics for Game Developers


In [2]:
# Titles that are meaningless or too general to be treated as topics
# (e.g. "introduction", "conclusion").
# Such generic titles would also couple subjects that have nothing in common.
# All entries are lower case because titles are lower-cased before the lookup.
# Each title is listed exactly once.
excluded_topics = [
    'introduction',
    'conclusion',
    'title to be edited here',
    'summary',
    'getting started',
    'course introduction',
    'introducción',
    'section introduction',
    'intro',
    'prerequisites',
    'introdução',
    'section summary',
    'bonus',
    'learning objectives',
    'course overview',
    'overview',
    'welcome',
    'course summary',
    'section recap',
    'introduction to the course',
    'project intro',
    'bonus lecture',
    'section intro',
    'comments',
    'exercise',
    'bonus material',
    '01. introduction',
    'section overview',
    'challenge',
    'resources',
    'outro',
    'course conclusion',
    'welcome!',
    'section review',
    'bonus section',
    'important - download these first - working files',
    'check your understanding',
    'quiz',
    'thank you',
    'thank you!',
    'introduction and objectives',
    'einleitung',
    'section conclusion',
    'congratulations!',
    'start here',
    'final thoughts',
    'final words',
    'next steps',
    'conclusions',
    'knowledge check',
    'chapter 2 & 3 quiz',
    'chapter 1 quiz',
    'chapter 1 & 2 quiz',
]

## Create the topics counter and the topic prerequisites counter
We must create two collections: **topics_counter** and **topics_prereqs**.

**topics_counter** will be a counter that accumulates the frequency of every distinct topic that occurs in the dataset.

**topics_prereqs** will be a dict that, for a given topic, stores the frequency of each of its prerequisites (which are also topics).

In [3]:
#Get topics prereqs and function to get prereqs
from collections import Counter, defaultdict

#A topic's prerequisites are all the topics that are cited before it
#As we add more and more learning references, we sum up the preceding contents for each topic
#So the most requested topics tend to have a higher sum (score)

#We may take different approaches for different subsection levels,
#like placing as prereq every subsection of a level above the current target
#or only the subsections of the same level

# topics_counter: title -> number of times the title occurs in the dataset.
# topics_prereqs: title -> Counter of the titles that appeared before it
#                 (at the same nesting level) in the same course.
topics_counter = Counter()
topics_prereqs = defaultdict(Counter)

# Build the lookup set once: the exclusion check runs for every title in the
# dataset, and a set answers it in O(1) instead of scanning the whole list.
excluded_lookup = set(excluded_topics)

for course in udemy_data: #Iterate through every course

    # Titles already seen in the current course, per nesting level.
    # Note that the level-2 list is NOT reset per section: it accumulates
    # over the whole course.
    course_subsec1_contents = list()
    course_subsec2_contents = list()

    for subsec1_content in course["c"]: #Iterate through the level-1 contents
        subsec1_title = subsec1_content["t"].lower()

        # An excluded level-1 title also skips everything nested under it,
        # because the nested loop below is never reached.
        if subsec1_title in excluded_lookup:
            continue

        # Everything seen before the current title counts as its prerequisite
        topics_prereqs[subsec1_title].update(course_subsec1_contents)
        course_subsec1_contents.append(subsec1_title)

        topics_counter[subsec1_title] += 1

        # Same procedure for the level-2 contents nested under this one
        for subsec2_content in subsec1_content["c"]:
            subsec2_title = subsec2_content["t"].lower()

            if subsec2_title in excluded_lookup:
                continue

            # Only level-2 titles are used as prerequisites of level-2 titles
            #topics_prereqs[subsec2_title].update(course_subsec1_contents)
            topics_prereqs[subsec2_title].update(course_subsec2_contents)
            course_subsec2_contents.append(subsec2_title)

            topics_counter[subsec2_title] += 1

print(len(topics_counter))
print(len(topics_prereqs))

494851
494851


In [4]:
#Function to return the calculated pre requisites of some topic
def get_topic_prereqs(topic):
    """Return the scored prerequisite candidates of ``topic``.

    The score of a candidate is the number of times it was recorded before
    ``topic`` minus the number of times ``topic`` was recorded before the
    candidate, so titles that merely co-occur in both orders cancel out.

    Caveat: rarely used titles tend to stay at a score of 1, since they occur
    only a few times and are rarely preceded by the target.

    Args:
        topic: lower-cased topic title.

    Returns:
        A list of ``(prereq, score)`` tuples sorted by descending score
        (``Counter.most_common`` order). The list is empty when the topic is
        unknown or has no recorded prerequisites.
    """
    # Use .get() on the outer mapping too: indexing a defaultdict with an
    # unknown key would silently insert an empty Counter for it, so simply
    # querying a misspelled topic would grow topics_prereqs.
    candidates = topics_prereqs.get(topic)
    if not candidates:
        return []

    prereqs_diff = Counter()
    for prereq, prereq_count in candidates.items():
        # How often the target itself was recorded before this candidate
        # (0 when that never happened).
        reverse_counts = topics_prereqs.get(prereq)
        topic_count = reverse_counts.get(topic, 0) if reverse_counts else 0

        prereqs_diff[prereq] = prereq_count - topic_count

    return prereqs_diff.most_common()

#get_topic_prereqs("javascript")

In [5]:
#Search function to find topics
def search_topics(term):
    """Find the topics whose title contains ``term`` (case-insensitive).

    Returns a list of ``(topic, count)`` tuples taken from ``topics_counter``,
    ordered from the most to the least frequent topic.
    """
    needle = term.lower()

    matches = []
    for name, occurrences in topics_counter.items():
        if needle in name:
            matches.append((name, occurrences))

    # Most used topics first
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches

#search_topics("java")

In [6]:
#Function to find the parent section of some content
def find_content_parent_section(target_search):
    """Return the parents of every content item titled ``target_search``.

    The comparison is case-insensitive: titles are lower-cased before being
    compared, and so is the search target (previously a mixed-case target
    could never match).

    Args:
        target_search: title to look for.

    Returns:
        A list with one entry per match: the course title (original casing)
        when the match is a level-1 item, or the lower-cased level-1 title
        when the match is a level-2 item nested under it.
    """
    target = target_search.lower()
    search_result = list()

    for course in udemy_data:
        course_title = course["t"]
        for c in course["c"]:
            c_title = c["t"].lower()

            # Level-1 match: the parent is the course itself
            if c_title == target:
                search_result.append(course_title)

            # Level-2 match: the parent is the enclosing level-1 item
            for c2 in c["c"]:
                if c2["t"].lower() == target:
                    search_result.append(c_title)

    return search_result

#find_content_parent_section("javascript")

In [7]:
#Prereqs of a topic whose score is above a certain cut value
def get_cutted_prereq(topic, cut_value=0):
    """Return the prerequisites of ``topic`` scoring strictly above ``cut_value``.

    The order given by ``get_topic_prereqs`` is kept; only the names are
    returned, the scores are dropped.
    """
    kept = []
    for prereq, score in get_topic_prereqs(topic):
        if score > cut_value:
            kept.append(prereq)
    return kept

#Collect the entire prereq flow of a topic recursively
def get_prereq_flow_members(topic,cut_value, prereq_dict):
    """Fill ``prereq_dict`` with the prerequisite flow reachable from ``topic``.

    Every visited topic is mapped to its list of cut prerequisites, using the
    same ``cut_value`` at every depth. Topics already present in
    ``prereq_dict`` are not expanded again, which also stops cycles.

    The dict passed in is mutated in place and returned.
    """
    direct_prereqs = get_cutted_prereq(topic, cut_value)
    prereq_dict[topic] = direct_prereqs

    for prereq in direct_prereqs:
        if prereq in prereq_dict:
            continue
        # The recursive call mutates the very same dict
        get_prereq_flow_members(prereq, cut_value, prereq_dict)

    return prereq_dict

#Function to get the entire prereq flow recursively,
#limiting the members to the cut prereqs of the target topic.
#Child topics use their own cut value, a reasonable value being 1
def get_filtered_prereq_flow_members(topic, cut_value, filter_set=None, prereq_dict=None, child_cut_value=1):
    """Collect the prerequisite flow of ``topic``, restricted to its own prereqs.

    Args:
        topic: topic to expand.
        cut_value: minimum score (exclusive) a prerequisite of ``topic`` needs.
        filter_set: titles allowed to appear in the flow. When omitted it is
            built from the cut prerequisites of ``topic``, so the whole flow
            is limited to the members of the main chain.
        prereq_dict: dict to fill; a new one is created when omitted.
        child_cut_value: cut value applied to every topic below the first
            level. The cut value is relative to each topic, so the caller's
            ``cut_value`` is deliberately not propagated. Defaults to 1.

    Returns:
        ``prereq_dict``, mapping every visited topic to the list of its
        allowed prerequisites, in ranking order.
    """
    if prereq_dict is None:
        prereq_dict = dict()

    topic_cutted_prereqs = get_cutted_prereq(topic, cut_value)

    #If no filter set has been specified, create one with the result
    if filter_set is None:
        filter_set = set(topic_cutted_prereqs)

    #Filter the result with the filter set. A list comprehension keeps the
    #ranking order of the prereqs; converting a set intersection to a list
    #yields an arbitrary order, which made the final flow vary between runs.
    filtered_topic_prereqs = [pr for pr in topic_cutted_prereqs if pr in filter_set]

    #Place data to dict
    prereq_dict[topic] = filtered_topic_prereqs

    for pr in filtered_topic_prereqs:
        #Already expanded topics are skipped, which also stops cycles
        if pr not in prereq_dict:
            get_filtered_prereq_flow_members(pr, child_cut_value, filter_set, prereq_dict, child_cut_value)

    return prereq_dict

#get_filtered_prereq_flow_members("binary trees", 1)

In [8]:
#Create function to build the spanning tree of the prereq flows, which is the optimal "learning path"
#By optimal we mean learning only what is necessary, at the time it is necessary
import networkx as nx

#Deprecated because it is no longer used
def DEPRECATED_get_topic_prereq_flow_graph(topic, cut_value=0):
    """Build a directed graph with one edge per (topic, prerequisite) pair.

    The pairs come from ``get_filtered_prereq_flow_members``; each edge points
    from a topic to one of its prerequisites.
    """
    members = get_filtered_prereq_flow_members(topic, cut_value)

    #TODO: check whether the process up to here already avoids creating cycles
    graph = nx.DiGraph()
    graph.add_edges_from(
        (topic_name, prereq)
        for topic_name, prereqs in members.items()
        for prereq in prereqs
    )
    return graph
    
    
def DEPRECATED_get_optimal_topic_prereq_flow_graph(topic, cut_value=0):
    """Deprecated: the flow is now shown as level columns instead of arrows.

    Builds the prerequisite flow graph of ``topic`` and hands it to
    ``nx.Edmonds``.

    Design notes kept from the original implementation:
      * every branch is treated as a different path to learn something, but
        a branch found inside another branch may be removed to get a cleaner
        visualization;
      * the data origin is expected to naturally avoid parallel needs
        (TODO: check this);
      * TODO: find a way to detect two parallel needs and two paths to the
        same thing; needs are parallel because everything that comes before
        a topic is added as a need.

    NOTE(review): ``nx.Edmonds(graph)`` appears to construct a solver object
    rather than a graph; presumably ``.find_optimum()`` or
    ``nx.maximum_branching`` was intended. Confirm before reviving this.
    """
    # The builder was renamed with the DEPRECATED_ prefix; the old name no
    # longer exists, so calling it raised NameError on every invocation.
    flow_graph = DEPRECATED_get_topic_prereq_flow_graph(topic, cut_value)
    optimal_flow_graph = nx.Edmonds(flow_graph)
    return optimal_flow_graph


### Functions to return each node's level for arrangement
Node levels should be low for the more basic stuff and high for the more complex stuff

In [9]:
def get_levels_by_pagerank(graph, round_decimals=2):
    """Use the rounded PageRank of each node as its level.

    Returns a dict mapping every node to its ``nx.pagerank`` value rounded to
    ``round_decimals`` decimals.
    """
    return {
        node: round(rank, round_decimals)
        for node, rank in nx.pagerank(graph).items()
    }

def get_levels_by_depth_first(graph, source):
    """Assign levels along the edges of a depth-first traversal.

    ``source`` gets level 0 and every node reached through a DFS edge is put
    one level below (minus one) the node it was reached from.
    Returns a ``defaultdict(int)`` mapping node -> level.
    """
    levels = defaultdict(int)
    levels[source] = 0

    for parent, child in nx.dfs_edges(graph, source):
        below_parent = levels[parent] - 1
        # Keep the lowest level seen for the child (an unseen child starts at 0)
        if below_parent < levels[child]:
            levels[child] = below_parent

    return levels


#This may be improved by walking the paths by hand and returning early
#once the path length has already passed the previous one
def get_levels_by_paths_length(graph, source_node, cutoff=None):
    """Level of a node = minus the length of its longest simple path from the source.

    The length is the number of nodes in the path. ``cutoff`` is forwarded to
    ``nx.all_simple_paths``. Returns a ``defaultdict(int)`` in which
    ``source_node`` starts at level 0.
    """
    levels = defaultdict(int)
    levels[source_node] = 0

    for node in graph.nodes():
        for path in nx.all_simple_paths(graph, source_node, node, cutoff):
            depth = -len(path)
            # Keep the most negative value, i.e. the longest path
            if depth < levels[node]:
                levels[node] = depth

    return levels
    
#Get node levels by going through a depth first traversal that also
#takes already visited nodes into account
def get_levels_by_inclusive_depth_first(graph, source):
    """Compute a level for every node reachable from ``source``.

    A node without outgoing edges gets level 0; any other node gets one more
    than the highest level among its targets. Edges pointing to a node that
    is currently being expanded (a cycle) are ignored. Levels are memoized,
    so in a cyclic graph the result can depend on the traversal order.

    Args:
        graph: mapping-like graph where ``graph[node].keys()`` yields the
            targets of ``node`` (e.g. ``nx.DiGraph``).
        source: node to start from.

    Returns:
        dict mapping node -> level, in the order the levels were resolved.

    Raises:
        AssertionError: if the traversal exceeds the interpreter recursion
            limit (the depth reached is printed first).
    """
    node_levels = dict()
    # Nodes on the current recursion path. A set gives O(1) membership and
    # removal; a node can be on the path only once, so no information is lost.
    nodes_in_progress = set()
    recursion_depth = [0]

    def get_node_level(node):
        if node in node_levels:
            return node_levels[node]

        recursion_depth[0] += 1
        nodes_in_progress.add(node)

        node_level = 0
        for target in graph[node].keys():
            # Skip edges closing a cycle with the current path
            if target in nodes_in_progress:
                continue

            target_level = get_node_level(target)
            node_level = max(target_level+1, node_level)

        node_levels[node] = node_level

        nodes_in_progress.discard(node)
        recursion_depth[0] -= 1
        return node_level

    try:
        get_node_level(source)
    except RecursionError as error:
        print("ResursionError:\nRecursion depth: ", recursion_depth[0])
        # Raise explicitly: a bare `assert False` is stripped when Python
        # runs with -O, which would silently return partial levels.
        raise AssertionError("recursion limit exceeded while computing node levels") from error

    return node_levels

    #def transverse_and_set_level(source_node):
        #node_level = 0
        #for target_node in graph[source_node].keys():
            #target_level 
    
    #def set_node_level(node, level):
        #global recurse_depth
        #global max_depth
        #node_levels[node] = max(node_levels[node], level)
        #for target in flow_graph[node].keys():
            #recurse_depth += 1
            #max_depth = max(max_depth, recurse_depth)
            #set_node_level(target, level+1)
            #recurse_depth -=1
    
    
    
def get_levels_by_starting_nodes_propagation():
    """Placeholder for a level strategy that is not implemented; returns None."""
    return None
    

In [10]:
from tabulate import tabulate

def print_data(headers, data_list):
    """Print ``data_list`` as a table with the given ``headers`` (via tabulate).

    Returns the result of ``print``, i.e. ``None``.
    """
    rendered_table = tabulate(data_list, headers=headers)
    return print(rendered_table)

#The cut value is relative to each topic, so we can't propagate it down the prerequisite chain
#Instead we limit the prereq chain to only the members of the main chain
#and place their levels according to the prerequisites of the filtered prerequisites


def print_optimal_topic_prereq_flow(topic, cut_value=0):
    """Print the prerequisite flow of ``topic`` grouped by level.

    Levels are printed in ascending order, each one preceded by a dashed
    separator line, one topic per line. With the level strategy used here,
    nodes without further prerequisites come first and the target comes last.

    Args:
        topic: target topic (lower-cased title).
        cut_value: minimum score (exclusive) for a prerequisite of ``topic``.

    Returns:
        The ``nx.DiGraph`` of the flow, with edges from a topic to each of
        its prerequisites.
    """
    prereq_flow_members = get_filtered_prereq_flow_members(topic, cut_value)

    #1. Construct graph
    flow_graph = nx.DiGraph()
    flow_graph.add_node(topic) #Ensure main node is present in the graph
    for source, targets in prereq_flow_members.items():
        for target in targets:
            flow_graph.add_edge(source, target)

    #2. Get node levels
    #Alternatives: get_levels_by_depth_first, get_levels_by_pagerank,
    #get_levels_by_paths_length
    node_levels = get_levels_by_inclusive_depth_first(flow_graph, topic)

    #3. Group the nodes by level
    nodes_by_level = defaultdict(list)
    for node, level in node_levels.items():
        nodes_by_level[level].append(node)

    #4. Print one block per level, lowest level first.
    #The loop variable must not be called `topic`: that would shadow the parameter.
    for _, level_nodes in sorted(nodes_by_level.items(), key=lambda item: int(item[0])):
        print("--------------------")
        for node in level_nodes:
            print(node)

    #The tabulate rendering that used to follow this return was unreachable
    #and has been removed.
    return flow_graph
    

In [11]:
# Show up to 50 of the most frequent topics whose title contains "xml".
#topics_counter.most_common(50)
search_topics("xml")[:50]

[('xml', 13),
 ('introduction to xml', 7),
 ('xml sitemaps', 6),
 ('parsing xml', 5),
 ('xml intro', 4),
 ('xml serialization', 4),
 ('xml basics', 4),
 ('testng.xml code', 3),
 ('javascript xmlhttprequest & web apis', 3),
 ('working with xml', 3),
 ('why xml?', 3),
 ('build.xml code', 3),
 ('xml attributes', 3),
 ('including and excluding the testcases from execution with testng xml file',
  3),
 ('xml sitemap', 3),
 ('json vs xml', 3),
 ('build phases & pom.xml', 2),
 ('becoming familiar with the xml layouts', 2),
 ('google xml sitemaps', 2),
 ('pom xml file code', 2),
 ('maven pom.xml file explanation', 2),
 ('linear layout designing of questions.xml of android interview app : part-18',
  2),
 ('adding live web content with xml and json', 2),
 ('maven configuration and pom.xml', 2),
 ('xml y onclick', 2),
 ('xml schema hands on', 2),
 ('adding graphics to " questions.xml " for multiple screen size & density',
  2),
 ('understanding build.xml file', 2),
 ('criando xml resource color 

## keep on it!!
## maybe use an RNN to detect the data we should display as prereq
## how will we differentiate two things that have the same name?

In [22]:
# Print the level-by-level prerequisite flow of "javascript", keeping only the
# prerequisites of the target whose score is above 10; the returned graph is
# kept for inspection.
flow_graph = print_optimal_topic_prereq_flow('javascript',10)
#print(flow_graph.edges())
#flow_graph = print_optimal_topic_prereq_flow("jquery", 7)
#flow_graph.edges()
# TODO: keep working on this
# TODO: maybe create an interface to search and check
# TODO: get more data to test

--------------------
html
--------------------
css
--------------------
javascript


# 