# Probabilistic Fuzzy Set Inference

Related:
https://www.mathworks.com/help/fuzzy/fuzzy-inference-process.html

## Load course data

In [1]:
from learndata import LearnContents, LearnContent
# Collection of learn contents that the cells below iterate over.
# NOTE(review): presumably the course data is loaded on construction - confirm in learndata.
ld = LearnContents()

## Fill the prerequisites list of each content that applies
This does not apply to a content that has no siblings (i.e. no other subcontents under the same parent content).

In [2]:
# A topic's prerequisites are every topic that is listed before it in its course.

# Different approaches could be taken for different subsection levels, such as
# using every subsection of the levels above the target as a prerequisite, or
# only the subsections of the same level. This cell uses the same-level rule.

# Iterate through every learn content
for lc in ld:
    # Ensure every content has a prereq list
    if not hasattr(lc, 'prereqs'):
        lc.prereqs = []

    # Work only with top contents (those with no parent); identity test, not ==
    if lc.parent is not None:
        continue

    # Contents of this course already visited at subsection depths 1 and 2
    course_subsec1_contents = []
    course_subsec2_contents = []

    for subsec1_content in lc:
        # Excluded topics may be left out later, although applying the
        # probabilistic models to them as well is the better option.
        # Prereqs are stored as (content, factor) pairs. The factor is fixed
        # at 1.0 for now; in the future it should be calculated.
        subsec1_content.prereqs = [(cont, 1.0) for cont in course_subsec1_contents]
        course_subsec1_contents.append(subsec1_content)

        # Same rule at depth 2. The depth-2 list is only reset per top content,
        # so it also holds depth-2 contents of earlier depth-1 sections.
        for subsec2_content in subsec1_content:
            subsec2_content.prereqs = [(cont, 1.0) for cont in course_subsec2_contents]
            course_subsec2_contents.append(subsec2_content)

In [3]:
#ld[995].prereqs

## Load similarity functions
Functions to compute similarity between two learn contents.<br>
This may compare their children, their titles, or anything else, based on any kind of inference.

In [4]:
from similarity_functions import text_match, TokenSimilarity, StemTokenSimilarity

In [5]:
def similarity_func(obj1, obj2):
    """Score how similar two learn contents are by comparing their titles.

    Reads the ``title`` attribute of both objects and returns whatever
    ``text_match`` yields for that pair of titles.
    """
    first_title = obj1.title
    second_title = obj2.title
    return text_match(first_title, second_title)

## Probabilistic Set Inference Functions

In [6]:
def generate_query_object(term):
    """Wrap the query term in a LearnContent so it can be compared with other contents."""
    return LearnContent(term)

In [7]:
def get_n_similar(obj, iterable, similarity_func=similarity_func, cut_value=1.0, n=-1):
    """Return the objects of ``iterable`` most similar to ``obj``.

    Every candidate is scored with ``similarity_func(obj, candidate)`` and kept
    only when the score reaches ``cut_value``. The result is a list of
    ``(candidate, score)`` pairs ordered from highest to lowest score; equal
    scores keep the order in which they were found.

    ``n`` caps the number of pairs returned. ``n == 0`` returns an empty list
    without scoring anything, and a negative ``n`` means "no limit".
    """
    if n == 0:
        return []

    scored = ((candidate, similarity_func(obj, candidate)) for candidate in iterable)
    matches = [pair for pair in scored if pair[1] >= cut_value]

    # list.sort is stable, so ties keep their discovery order
    matches.sort(key=lambda pair: pair[1], reverse=True)

    return matches if n < 0 else matches[:n]

In [13]:
def join_prereqs(content_list):
    """Flatten the prereqs of every matched content into one scored list.

    ``content_list`` holds ``(content, match_score)`` pairs and each content
    exposes ``prereqs`` as ``(prereq, factor)`` pairs. Every prereq is emitted
    with ``match_score * factor``, i.e. the prereq factor scaled by how well
    its owner matched the query. Duplicates are kept.
    """
    return [
        (prereq, match_score * factor)
        for content, match_score in content_list
        for prereq, factor in content.prereqs
    ]

In [49]:
def get_loopback_value(query_obj, loopback_obj, similarity_func, n_similars):
    """Measure how strongly the query looks like a prereq of ``loopback_obj``.

    Each ``(prereq, factor)`` pair of ``loopback_obj.prereqs`` is scored as
    ``similarity_func(query_obj, prereq) * factor``. The largest of those
    scores is returned, and never less than 0 (0 when there are no prereqs).

    ``n_similars`` is accepted but not used.
    """
    # Work on a snapshot of the prereqs, weighting each similarity by its factor
    weighted_scores = [
        similarity_func(query_obj, prereq) * factor
        for prereq, factor in tuple(loopback_obj.prereqs)
    ]

    # The leading 0 is the floor of the result
    return max([0] + weighted_scores)

In [58]:
def pr_loopback(query_obj, prereqs, similarity_func, n_similars):
    """Refine scored prereqs by subtracting each one's loopback value.

    For every ``(prereq, score)`` pair, the value computed by
    ``get_loopback_value`` for the query against that prereq is subtracted
    from the score. Only pairs whose remaining score is strictly positive are
    returned, meaning the candidate is more a prereq of the query than the
    other way round. Input order is preserved.
    """
    adjusted = (
        (
            candidate,
            score - get_loopback_value(query_obj, candidate, similarity_func, n_similars),
        )
        for candidate, score in prereqs
    )
    return [(candidate, remaining) for candidate, remaining in adjusted if remaining > 0]

In [59]:
def query_prereq_loopback(query_obj, prereqs, similarity_func=similarity_func, similar_sample=0):
    """Apply the loopback algorithm to the prereqs of a query to refine them.

    For every candidate prereq, the strongest weighted similarity between the
    query and any of the candidate's own prereqs is subtracted from the
    candidate's score. A candidate survives only while its score stays
    positive, i.e. it is more a prereq of the query than the inverse.

    Args:
        query_obj: Object representing the query, passed to ``similarity_func``.
        prereqs: Iterable of ``(content, score)`` pairs. Each content exposes
            ``prereqs`` as ``(content, factor)`` pairs.
        similarity_func: Callable taking two objects and returning a number.
        similar_sample: Currently unused; kept for the planned sampling of
            contents similar to each candidate.

    Returns:
        List of ``(content, adjusted_score)`` pairs with a positive adjusted
        score, in input order.
    """
    loopback_prereqs = []

    for pr, pr_score in prereqs:
        # TODO: sample contents similar to pr, e.g. with
        # get_n_similar(pr, ld, similarity_func, 1.0, similar_sample),
        # to ensure enough samples of pr's prereqs.

        max_term_pr_similarity = 0

        # Entries are (content, factor) pairs: compare against the content
        # and weight the similarity by its prereq factor.
        for pr_pr, pr_pr_factor in pr.prereqs:
            term_pr_similarity = similarity_func(query_obj, pr_pr) * pr_pr_factor
            max_term_pr_similarity = max(max_term_pr_similarity, term_pr_similarity)

        # Score after loopback: original score minus the strongest inverse evidence
        adjusted_score = pr_score - max_term_pr_similarity

        if adjusted_score > 0:
            loopback_prereqs.append((pr, adjusted_score))

    return loopback_prereqs

In [60]:
from collections import Counter

def merge_func(prereqs):
    """Merge scored prereqs for presentation by counting matching names.

    Each prereq is reduced to ``str(prereq).lower()`` and occurrences are
    counted; the scores are ignored. Returns ``(name, count)`` pairs from
    most to least frequent.

    Other merge strategies that could be used instead: checking similarities
    between the prereqs, or clustering on features such as title terms.
    """
    tally = Counter()
    for prereq, _score in prereqs:
        tally[str(prereq).lower()] += 1

    return tally.most_common()

In [61]:
# End-to-end prereq inference for the term "javascript".
query_obj = generate_query_object("javascript")
# Defaults apply: module-level similarity_func, cut_value=1.0, n=-1 (no limit)
content_list = get_n_similar(query_obj, ld)
# Pool the prereqs of every match, each scaled by its owner's match score
joined_prereqs = join_prereqs(content_list)
# Drop candidates whose score does not survive the loopback subtraction;
# the trailing 0 is n_similars, which get_loopback_value does not use
lp_prereqs = pr_loopback(query_obj, joined_prereqs, similarity_func, 0)
# Count the survivors by lower-cased name, most frequent first
merged = merge_func(lp_prereqs)
# Bare expression so the notebook displays the result
merged

[('summary', 22),
 ('css', 21),
 ('html', 16),
 ('lists', 7),
 ('introduction', 7),
 ('colors', 7),
 ('headings', 6),
 ('what is html?', 6),
 ('comments', 5),
 ('tables', 5),
 ('app cache', 4),
 ('entities', 4),
 ('html 5', 4),
 ('forms', 4),
 ('basic example', 4),
 ('css styles', 4),
 ('\u200bhistory of html', 4),
 ('encoding - meta tag', 4),
 ('head elements', 4),
 ('indentation', 4),
 ('case sensitive', 4),
 ('paragraph', 3),
 ('links', 3),
 ('block and inline elements', 3),
 ('elements and attributes', 3),
 ('html and css', 3),
 ('css3', 3),
 ('base element', 3),
 ('what is html5?', 3),
 ('course introduction', 3),
 ('forms and inputs', 3),
 ('bootstrap', 3),
 ('css 3', 3),
 ('choosing an editor html: brackets', 3),
 ('html5 vs flash', 3),
 ('html5', 3),
 ('choosing an editor html: notepad++', 3),
 ('sql and pl/sql', 2),
 ('apex architecture', 2),
 ('creating your first website!', 2),
 ("let's learn javascript arrays and objects!", 2),
 ('presentación', 2),
 ('globalization, locali

In [12]:
#import time
#start = time.time()
#Pre create stem_tokens for every article
#for lc in ld:
    #lc.tokens = get_tokens(lc.title)
    #lc.stem_tokens = set(map(get_stem, lc.tokens))
    
    
#print(time.time() - start)

In [13]:
#ld[850000].stem_tokens

In [14]:
def query(term, similarity_func=similarity_func, cut_score=1.0):
    """Query the contents of ``ld`` that are similar enough to a term.

    Args:
        term: The query term. It is passed to ``similarity_func`` unchanged,
            so it must be of a type that function accepts.
        similarity_func: Callable ``(term, content)`` returning a number.
        cut_score: Minimum similarity for a content to be returned.

    Returns:
        List of ``(content, similarity_score)`` pairs in the iteration order
        of ``ld``. The list is not sorted by score.
    """
    # NOTE: an index-based variant (tokenize the term, collect candidates via
    # ld.search, sort by score) used to follow the return statement. It could
    # never execute and was removed as dead code.
    results = []

    for lc in ld:
        similarity_score = similarity_func(term, lc)
        if similarity_score >= cut_score:
            results.append((lc, similarity_score))

    return results

In [15]:
# Uses the defaults of query: module-level similarity_func and cut_score=1.0
query_results = query("recurrent neural networks")

In [16]:
query_results

[(Recurrent Neural Networks, 1.0),
 (Recurrent neural networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent neural networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0),
 (Recurrent Neural Networks, 1.0)]

### Probabilistic Set Inference

In [17]:
from collections import Counter

def get_term_prereqs(term, similarity_func, cut_score):
    """Infer the prerequisites of a term and rank them by occurrence.

    Steps:
      1. Query the contents that match the term.
      2. Pool the prereqs of every match, scaled by the match score.
      3. (To be done) extend the pool with contents similar to the prereqs.
      4. Apply loopback: subtract from each prereq's score the strongest
         weighted similarity between the term and the prereq's own prereqs.
      5. Merge the survivors by lower-cased name.

    Prereq entries are expected as ``(content, factor)`` pairs; a bare content
    is also accepted and treated as having factor 1.0.

    Args:
        term: The query term, passed to ``query`` and ``similarity_func``.
        similarity_func: Callable ``(term, content)`` returning a number.
        cut_score: Minimum similarity for a content to match the term.

    Returns:
        List of ``(name, count)`` pairs, most frequent first.
    """
    # 1. Query contents that match the specified term with the similarity function.
    query_results = query(term, similarity_func, cut_score)

    # 2. Group all prereqs with their scores
    raw_prereqs = []
    for cont, score in query_results:
        for entry in getattr(cont, 'prereqs', ()):
            pr, factor = _split_prereq_entry(entry)
            raw_prereqs.append((pr, score * factor))

    # 3. Get n similar contents to the prereqs based on a cut value (TO BE DONE)

    # 4. Apply loopback to all prereqs
    non_loopback_prereqs = []
    for pr, pr_score in raw_prereqs:
        # Without a prereq list there is nothing to loop back on: keep as is
        if not hasattr(pr, 'prereqs'):
            non_loopback_prereqs.append((pr, pr_score))
            continue

        max_term_pr_similarity = 0

        # Iterate through the prereqs of the prereq
        for entry in pr.prereqs:
            pr_pr, pr_pr_factor = _split_prereq_entry(entry)
            term_pr_similarity = similarity_func(term, pr_pr) * pr_pr_factor
            max_term_pr_similarity = max(max_term_pr_similarity, term_pr_similarity)

        # Score after loopback: original score minus the strongest inverse evidence
        adjusted_score = pr_score - max_term_pr_similarity

        # A positive score means pr is more a prereq of the term than the inverse
        if adjusted_score > 0:
            non_loopback_prereqs.append((pr, adjusted_score))

    # 5. Merge results for presentation by simply matching names.
    # Alternatives: checking similarities between the prereqs, or clustering
    # on features such as title terms.
    merged_prereqs = Counter(str(pr).lower() for pr, _ in non_loopback_prereqs)

    return merged_prereqs.most_common()


def _split_prereq_entry(entry):
    """Return ``(content, factor)`` for a prereq entry; bare contents get factor 1.0."""
    if isinstance(entry, tuple) and len(entry) == 2:
        return entry
    return entry, 1.0

In [20]:
# Infer the prereqs of "javascript" with the module-level similarity_func and a cut score of 1
pr_list = get_term_prereqs("javascript", similarity_func, 1)
# Number of distinct merged prereq names
print(len(pr_list))
# One (name, count) pair per line
print("\n".join(map(str,pr_list)))

# TODO: properly implement stem token similarity
# TODO: implement step 3 to get similar prereqs
# TODO: think of a better approach for the loopback max term_pr_similarity

650
('summary', 22)
('css', 21)
('html', 16)
('lists', 7)
('colors', 7)
('introduction', 7)
('what is html?', 6)
('headings', 6)
('comments', 5)
('tables', 5)
('indentation', 4)
('case sensitive', 4)
('entities', 4)
('css styles', 4)
('head elements', 4)
('\u200bhistory of html', 4)
('html 5', 4)
('app cache', 4)
('forms', 4)
('basic example', 4)
('encoding - meta tag', 4)
('bootstrap', 3)
('base element', 3)
('css3', 3)
('what is html5?', 3)
('course introduction', 3)
('paragraph', 3)
('html5', 3)
('links', 3)
('choosing an editor html: notepad++', 3)
('block and inline elements', 3)
('html and css', 3)
('elements and attributes', 3)
('choosing an editor html: brackets', 3)
('forms and inputs', 3)
('css 3', 3)
('html5 vs flash', 3)
('rad tool', 2)
('the power of frameworks', 2)
('getting started', 2)
('typography', 2)
('more about css', 2)
('add images', 2)
('finishing css', 2)
('java', 2)
('data centric', 2)
('<div>iding up our website', 2)
('presentación', 2)
('globalization, locali