In [1]:
# Probabilistic Fuzzy Set Inference

Related:
https://www.mathworks.com/help/fuzzy/fuzzy-inference-process.html

## Load course data

In [2]:
from learndata import LearnContents
# Instantiate the learning-contents collection; the rest of the notebook
# iterates over it (`for lc in ld`) and indexes into it (`ld[i]`).
ld = LearnContents()

## Fill the prerequisites list of each content, where applicable
This does not apply to contents that have no siblings (i.e. no other sub-contents under the same parent content).

In [3]:
# A topic's prerequisites are every topic that is listed before it in the same course.
#
# Different strategies are possible for the different subsection levels, e.g.
# using every subsection of any level above the current target as a prerequisite,
# or only the subsections of the same level. Here each level is tracked on its own.

# Iterate through every learning content.
for lc in ld:
    # Work only with top-level contents (those with no parent).
    # `is not None` is the identity test recommended for singletons and cannot
    # be affected by a custom __eq__ on the parent object.
    if lc.parent is not None:
        continue

    # Contents already seen in the current course at subsection levels 1 and 2.
    course_subsec1_contents = []
    course_subsec2_contents = []

    for subsec1_content in lc:
        # Excluded topics could be filtered out here later, but the preferred
        # approach is to let the probabilistic model handle them as well.
        # Snapshot (copy) of everything seen so far, then register this content.
        subsec1_content.prereqs = list(course_subsec1_contents)
        course_subsec1_contents.append(subsec1_content)

        # Same idea for level 2. The level-2 accumulator is NOT reset for each
        # level-1 section, so prereqs include level-2 contents that appeared in
        # earlier level-1 sections of the same course.
        for subsec2_content in subsec1_content:
            subsec2_content.prereqs = list(course_subsec2_contents)
            course_subsec2_contents.append(subsec2_content)

In [4]:
#ld[995].prereqs

## Create similarity functions

In [6]:
from similarity_functions import text_match, TokenSimilarity, StemTokenSimilarity

In [4]:
from nltk.stem import LancasterStemmer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

# Stemmer used for every stem lookup (LancasterStemmer is the alternative
# that was tried and left disabled).
stemmer = PorterStemmer()

# Tokens dropped during tokenization: punctuation characters only.
# English stop words (stopwords.words("english")) are currently not added.
stop_words = [char for char in punctuation]

# Memo of word -> stem, filled on demand by get_stem.
stem_dict = {}

def get_stem(word):
    """Return the stem of `word`, computing it once and caching it in `stem_dict`."""
    if word in stem_dict:
        return stem_dict[word]

    # First time this word is seen: stem it and remember the result.
    stem = stemmer.stem(word)
    stem_dict[word] = stem
    return stem

def get_tokens(sentence):
    """Return the set of lower-cased tokens of `sentence` that are not in `stop_words`."""
    lowered = sentence.lower()
    return {tok for tok in word_tokenize(lowered) if tok not in stop_words}

def get_stem_tokens(sentence):
    """Return the set of stems of the tokens of `sentence`.

    Tokens come from get_tokens (lower-cased, stop words removed) and each one
    is stemmed through the cached get_stem lookup.
    """
    return {get_stem(token) for token in get_tokens(sentence)}

def stem_tokens_similarity(lc1, lc2):
    """
    Jaccard similarity between the stemmed tokens of two contents.

    Each argument is converted with str() and tokenized/stemmed through
    get_stem_tokens, so plain strings and content objects are both accepted.
    The comparison could later be extended to children or other fields; for
    now only the string form of each argument is used.

    Returns: float between 0 and 1 (|intersection| / |union| of the stem
    sets). When neither argument yields any token the union is empty and
    0.0 is returned instead of dividing by zero.
    """
    # Use the *stemmed* tokens, as the function name and contract promise.
    # get_stem_tokens already lower-cases, so no extra .lower() is needed.
    # TODO: reuse pre-computed stemmed terms instead of re-tokenizing on every call.
    lc1_stems = get_stem_tokens(str(lc1))
    lc2_stems = get_stem_tokens(str(lc2))

    union = lc1_stems | lc2_stems
    if not union:
        # Both sides are empty (e.g. punctuation-only titles): nothing to match.
        return 0.0

    # Jaccard similarity: |intersection| over |union|
    return len(lc1_stems & lc2_stems) / len(union)

def case_insensitive_word_match_similarity(lc1, lc2):
    """Return 1.0 if the string forms of both arguments are equal ignoring case, else 0.0."""
    return 1.0 if str(lc1).lower() == str(lc2).lower() else 0.0


# Sanity checks: same word in different case matches, a different word does not.
assert case_insensitive_word_match_similarity("lucas", "Lucas") == 1.0
assert case_insensitive_word_match_similarity("lucass", "Lucas") == 0.0

import time

# Pre-compute the plain and stemmed title tokens of every content so they can
# be read back as attributes (lc.tokens / lc.stem_tokens).
# perf_counter() is used for the timing because it is monotonic and meant for
# measuring intervals, whereas time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for lc in ld:
    lc.tokens = get_tokens(lc.title)
    lc.stem_tokens = set(map(get_stem, lc.tokens))

# Elapsed seconds for the whole pass.
print(time.perf_counter() - start)

239.8987214565277


In [7]:
# Spot-check the pre-computed stem tokens of a single content (index picked by hand).
ld[850000].stem_tokens

{'affili', 'introduct', 'is', 'market', 'what'}

In [8]:
def query(term, similarity_func=case_insensitive_word_match_similarity, cut_score=1.0):
    """Return the contents of `ld` whose similarity to `term` reaches `cut_score`.

    Parameters:
        term: the query term, passed as first argument to `similarity_func`.
        similarity_func: callable `(term, content) -> score`.
        cut_score: minimum score (inclusive) for a content to be returned.

    Returns: list of `(content, score)` tuples, in the iteration order of `ld`
    (not sorted by score).
    """
    # Linear scan over every content. The unreachable code that used to follow
    # the return statement sketched a faster variant that pre-filtered
    # candidates with ld.search(token); it never ran and has been removed.
    results = []
    for lc in ld:
        similarity_score = similarity_func(term, lc)
        if similarity_score >= cut_score:
            results.append((lc, similarity_score))

    return results

In [9]:
# Exact (case-insensitive) title match with the default similarity function and cut score.
query_results = query("recurrent neural networks")

In [10]:
# Display the matches as (content, score) pairs.
query_results

[(Recurrent Neural Networks, 1.0),
 (Recurrent neural networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent neural networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0)]

### Probabilistic Set Inference

In [12]:
from collections import Counter

def get_term_prereqs(term, similarity_func, cut_score):
    """Infer the prerequisites of `term` from the contents that match it.

    Parameters:
        term: the query term.
        similarity_func: callable `(term, content) -> score`, used both for the
            query and for the loopback check.
        cut_score: minimum score passed on to `query`.

    Returns: list of `(lower-cased prerequisite name, occurrence count)` tuples
    ordered from most to least common (Counter.most_common()).
    """
    # 1. Contents matching the term under the given similarity function.
    matches = query(term, similarity_func, cut_score)

    # 2. Every prerequisite of every match, paired with the score of that match.
    #    Matches without a `prereqs` attribute contribute nothing.
    candidates = [
        (prereq, match_score)
        for matched, match_score in matches
        if hasattr(matched, 'prereqs')
        for prereq in matched.prereqs
    ]

    # 3. Expand with contents similar to the prerequisites, based on a cut value (TO BE DONE).

    # 4. Loopback: discount a prerequisite when the term itself shows up among
    #    that prerequisite's own prerequisites.
    surviving = []
    for prereq, score in candidates:
        if hasattr(prereq, 'prereqs'):
            # Highest similarity between the term and any prereq of this prereq.
            strongest_loopback = 0
            for earlier in prereq.prereqs:
                strongest_loopback = max(strongest_loopback, similarity_func(term, earlier))

            # Score after loopback; only a positive remainder means this
            # content is more a prerequisite of the term than the other way round.
            score -= strongest_loopback
            if not score > 0:
                continue

        # Prerequisites without their own `prereqs` are kept with their score untouched.
        surviving.append((prereq, score))

    # 5. Merge results for presentation. Possible strategies:
    #      - matching terms
    #      - checking similarities between them
    #      - clustering on features such as title terms
    #    For now: merge simply by matching lower-cased names and count occurrences.
    name_counts = Counter(str(prereq).lower() for prereq, _ in surviving)

    return name_counts.most_common()

In [15]:
pr_list = get_term_prereqs("html", case_insensitive_word_match_similarity, 1)
print(len(pr_list))
# One (name, count) pair per line.
print(*pr_list, sep="\n")

# TODO:
#  - properly implement the stem-token similarity
#  - implement step 3 to pull in contents similar to the prerequisites
#  - think of a better approach for the loopback max term/prereq similarity

377
('summary', 22)
('introduction', 14)
('preparation', 3)
('getting started', 3)
('forms', 3)
('links to download programs', 2)
('overview', 2)
('colors', 2)
('about slices', 2)
('preparando-se para o desenvolvimento', 2)
('links', 2)
('powerpoint image filetypes', 2)
('vocabulary', 2)
('udemy psd to html intro 1', 2)
('final website ', 2)
('lists', 2)
('course introduction', 2)
('homepage psd', 2)
('session 3 slicing and dicing ', 2)
('choosing the right image format checklist ', 2)
('conceptos básicos de desarrollo web', 2)
('images', 2)
('about no slices', 2)
('giriş', 2)
('introdução', 2)
('submit', 1)
('logout', 1)
('cherrypy toolbox', 1)
('un mundo post-pc', 1)
('what is full stack development?', 1)
('exercise', 1)
('パソコンの仕組み', 1)
('development process', 1)
('web development learning style', 1)
('hide show a div', 1)
('key up-down / change', 1)
('course intro and outline', 1)
('websockets over other methods', 1)
('xml', 1)
('simplicity', 1)
('overview - tokenization', 1)
('ux t