In [None]:
import html

import numpy as np
from scipy.spatial.distance import cosine

In [None]:
%run utils.ipynb
%run nlp_functions.ipynb

In [None]:
def highlight_words(project_doc, goal_docs, use_word_relevance=True, use_goal_similarity=True, 
                    percentile_threshold=50, n_goals_to_consider=3):
    """
    Select the tokens of a document that should be highlighted.

    Only tokens whose lower-cased text appears among the words of
    ``project_doc._.word_relevances`` are scored. The score of a token is the product of:

    * its relevance taken from ``project_doc._.word_relevances`` (or 1 when
      ``use_word_relevance`` is False);
    * its highest similarity (``1 - cosine distance`` between ``token.vector`` and
      ``gdoc._.custom_vector``) over the goal documents whose ``_.goal_labels`` is among
      the first ``n_goals_to_consider`` entries of ``project_doc._.predicted_goal_scores``
      (or 1 when ``use_goal_similarity`` is False).

    Parameters
    ----------
    project_doc : iterable of tokens exposing ``_.predicted_goal_scores`` (pairs of
        goal and score) and ``_.word_relevances`` (pairs of word and relevance).
    goal_docs : iterable of goal documents exposing ``_.goal_labels`` and ``_.custom_vector``.
    use_word_relevance, use_goal_similarity : which factors enter the score; at least
        one of them must be True.
    percentile_threshold : percentile (0-100) of the computed scores used as cut-off.
    n_goals_to_consider : number of leading predicted goals a token may be assigned to.

    Returns
    -------
    list of ``(token_text, assigned_goal, score)`` tuples, in token order, whose score is
    at least the requested percentile of all computed scores. ``assigned_goal`` is -1
    when goal similarity is disabled or when no goal could be assigned. The list is
    empty when no token of the document is a relevant word.
    """
    assert use_word_relevance or use_goal_similarity

    goals_to_consider = [g for g, s in project_doc._.predicted_goal_scores][:n_goals_to_consider]

    # Word -> relevance lookup. setdefault keeps the FIRST entry for a repeated word,
    # which is what the former list.index() based lookup returned.
    relevance_by_word = {}
    for word, relevance in project_doc._.word_relevances:
        relevance_by_word.setdefault(word, relevance)

    # The set of comparable goal documents does not depend on the token: filter once.
    if use_goal_similarity:
        candidate_goal_docs = [gdoc for gdoc in goal_docs if gdoc._.goal_labels in goals_to_consider]
    else:
        candidate_goal_docs = []

    word_scores = []
    for token in project_doc:
        lowered = token.text.lower()
        if lowered not in relevance_by_word:
            continue

        word_relevance = relevance_by_word[lowered] if use_word_relevance else 1

        assigned_goal = -1
        if use_goal_similarity:
            # -999 is a sentinel below any real similarity. It survives (and enters the
            # score) when there is no candidate goal or no comparison succeeds, e.g. a
            # NaN similarity never compares greater than the sentinel.
            max_similarity = -999
            for gdoc in candidate_goal_docs:
                goal_sim = 1 - cosine(token.vector, gdoc._.custom_vector)
                if goal_sim > max_similarity:
                    max_similarity = goal_sim
                    assigned_goal = gdoc._.goal_labels
        else:
            max_similarity = 1

        word_scores.append((token.text, assigned_goal, word_relevance * max_similarity))

    # np.percentile is not meaningful on an empty sequence: nothing to highlight.
    if not word_scores:
        return []

    score_threshold = np.percentile([s for w, g, s in word_scores], percentile_threshold)
    return [(w, g, s) for (w, g, s) in word_scores if s >= score_threshold]

In [None]:
"""
Selects a list of sentences from a spaCy document that should represent the summary of the document for the final user. Each sentence is assigned an SDG and a score. The returned list respects the order of the sentences in the original document.

The score assigned to a sentence is given by the product of its relevance score (as computed by "compute_sentence_relevance") and the cosine similarity with the closest SDG. The idea is to take into account both the semantic importance of the sentence for the global meaning of the document and its relevance for the SDG.

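The parameters "similarity", "perc_relevant_words" and "scale_by_tfidf" are used for the construction of the sentence vector and for the computation of the similarity with goals. Note that they are not arguments of "highlight_sentences": the function passes perc_relevant_words=1.0 and scale_by_tfidf=True to the helpers it calls.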
"""

def highlight_sentences(project_doc, goal_docs, use_sentence_relevance=True, use_goal_similarity=True,
                        n_sentences=5, n_highlighted_words=5, n_goals_to_consider=3):
    """
    Select the highest-scoring sentences of a document, each with an assigned goal.

    The score of a sentence is the product of:

    * ``compute_sentence_relevance(sentence, project_doc, perc_relevant_words=1.0)``
      (or 1 when ``use_sentence_relevance`` is False);
    * the highest ``sentence._.custom_similarity(gdoc)`` over the goal documents whose
      ``_.goal_labels`` is among the first ``n_goals_to_consider`` entries of
      ``project_doc._.predicted_goal_scores`` (or 1 when ``use_goal_similarity`` is
      False). ``refine_sentence_vector`` is called on the sentence before the
      similarities are computed.

    Parameters
    ----------
    project_doc : document exposing ``sents``, slicing by token index and
        ``_.predicted_goal_scores`` (pairs of goal and score).
    goal_docs : iterable of goal documents exposing ``_.goal_labels``.
    use_sentence_relevance, use_goal_similarity : which factors enter the score; at
        least one of them must be True.
    n_sentences : maximum number of sentences to return; values below 0 are treated as 0.
    n_highlighted_words : not used by this function; kept for interface compatibility.
    n_goals_to_consider : number of leading predicted goals a sentence may be assigned to.

    Returns
    -------
    list of ``(span, assigned_goal, score)`` tuples, in document order, holding the
    ``n_sentences`` best-scoring sentences (all of them when the document has fewer).
    Ties are resolved in favour of the sentence that comes first in the document.
    ``assigned_goal`` is -1 when goal similarity is disabled or no goal could be assigned.
    """
    assert use_sentence_relevance or use_goal_similarity

    goals_to_consider = [g for g, s in project_doc._.predicted_goal_scores][:n_goals_to_consider]

    # The set of comparable goal documents does not depend on the sentence: filter once.
    if use_goal_similarity:
        candidate_goal_docs = [gdoc for gdoc in goal_docs if gdoc._.goal_labels in goals_to_consider]
    else:
        candidate_goal_docs = []

    sentence_scores = []
    for sentence in project_doc.sents:
        if use_sentence_relevance:
            sentence_relevance = compute_sentence_relevance(sentence, project_doc, perc_relevant_words=1.0)
        else:
            sentence_relevance = 1

        # Always bound, so the tuple below can be built when goal similarity is disabled.
        assigned_goal = -1
        if use_goal_similarity:
            refine_sentence_vector(sentence, project_doc, perc_relevant_words=1.0, scale_by_tfidf=True)
            # -999 is a sentinel below any real similarity; it survives (and enters the
            # score) when there is no candidate goal or no comparison succeeds.
            max_similarity = -999
            for gdoc in candidate_goal_docs:
                goal_sim = sentence._.custom_similarity(gdoc)
                if goal_sim > max_similarity:
                    max_similarity = goal_sim
                    assigned_goal = gdoc._.goal_labels
        else:
            max_similarity = 1

        score = sentence_relevance * max_similarity
        sentence_scores.append((sentence.start, sentence.end, assigned_goal, score))

    # Rank sentence positions by decreasing score (sorted() is stable, so equal scores
    # keep document order) and keep the best n. Selecting by rank rather than by a
    # "score > threshold" test keeps the lowest-scoring sentence when the document has
    # no more than n_sentences sentences, and copes with an empty document.
    n_keep = max(n_sentences, 0)
    ranked_positions = sorted(range(len(sentence_scores)),
                              key=lambda i: sentence_scores[i][-1],
                              reverse=True)
    kept_positions = set(ranked_positions[:n_keep])

    selected_sentences = [(project_doc[start:end], g, s)
                          for i, (start, end, g, s) in enumerate(sentence_scores)
                          if i in kept_positions]

    return selected_sentences

In [None]:
def visualize_output(project_docs, goal_docs, n_highlighted_senteces=3, percentile_highlighted_words=50, use_colors=True,
                     output_path='output.html'):
    """
    Write an HTML report of the given project documents.

    For every document the report contains its title, its true goals, the first five
    entries of its predicted goal ranking, and its text sentence by sentence. Sentences
    returned by ``highlight_sentences`` are rendered in bold; tokens returned by
    ``highlight_words`` are rendered in the colour of their assigned goal (or wrapped in
    ``<mark>`` when ``use_colors`` is False). Both selections use goal similarity only,
    restricted to the first 3 predicted goals.

    Parameters
    ----------
    project_docs : iterable of documents exposing ``sents``, ``_.project_title``,
        ``_.goal_labels`` and ``_.predicted_goal_scores``.
    goal_docs : goal documents, forwarded to ``highlight_sentences`` / ``highlight_words``.
    n_highlighted_senteces : forwarded as ``n_sentences`` to ``highlight_sentences``.
    percentile_highlighted_words : forwarded as ``percentile_threshold`` to ``highlight_words``.
    use_colors : colour highlighted tokens by goal instead of using ``<mark>``.
    output_path : file the report is written to (overwritten if it exists).

    Returns
    -------
    None. The report is written to ``output_path``.
    """
    with open(output_path, 'w') as html_writer:
        for pdoc in project_docs:
            # Title and token text come from the documents: escape them so characters
            # such as '<' or '&' cannot break (or inject) markup.
            html_writer.write('<h2>'+html.escape(pdoc._.project_title)+'</h2>')
            html_writer.write('<h3>True goals:</h3>')
            for sdg in pdoc._.goal_labels:
                html_writer.write('<p style="color:'+goal_color_mapping[sdg]+'">' + goal_name_mapping[sdg]+'</p>')
            html_writer.write('<h3>Predicted Ranking (Top 5):</h3>')
            for sdg,score in pdoc._.predicted_goal_scores[:5]:
                html_writer.write('<p style="color:'+goal_color_mapping[sdg]+'">' 
                                  + goal_name_mapping[sdg]+' ('+str(round(score,2))+')</p>')

            html_writer.write('<h3>Abstract:</h3>')
            highlighted_sentences = highlight_sentences(pdoc, goal_docs, 
                                                        n_goals_to_consider=3, 
                                                        use_goal_similarity=True,
                                                        use_sentence_relevance=False,
                                                        n_sentences=n_highlighted_senteces)

            highlighted_words = highlight_words(pdoc, goal_docs,
                                                n_goals_to_consider=3,
                                                use_goal_similarity=True,
                                                use_word_relevance=False,
                                                percentile_threshold=percentile_highlighted_words)

            # Built once per document instead of once (or twice) per sentence.
            highlighted_spans = [hs[0] for hs in highlighted_sentences]

            # highlight_words returns its tuples in token order, so they are consumed with
            # a cursor: a token is highlighted only when it is the NEXT expected word.
            # This avoids colouring a token with the goal of a different word.
            next_word = 0
            for sent in pdoc.sents:
                sentence_is_highlighted = sent in highlighted_spans
                if sentence_is_highlighted:
                    html_writer.write('<b>')
                for token in sent:
                    token_html = html.escape(token.text)
                    if next_word < len(highlighted_words) and token.text == highlighted_words[next_word][0]:
                        assigned_goal = highlighted_words[next_word][1]
                        next_word += 1
                        if use_colors:
                            html_writer.write('<span style="color:'+goal_color_mapping[assigned_goal]+'">'+token_html+' </span>')
                        else:
                            html_writer.write('<mark>'+token_html+'</mark> ')
                    else:
                        html_writer.write('<span style="color:black">'+token_html+' </span>')
                if sentence_is_highlighted:
                    html_writer.write('</b>')
                html_writer.write('<br><br>')   
            html_writer.write('<br><br>')