In [47]:
import unidecode, re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import spacy, en_trf_bertbaseuncased_lg
from gensim.parsing import preprocessing as pproc

In [48]:
# Catalan stop words, one per line, transliterated to ASCII so that they compare
# equal to the unidecode-normalised tokens produced by process_text.
# The encoding is pinned so accented words are read the same way on every platform
# (NOTE(review): assumes the file is UTF-8 -- confirm). Blank lines are skipped.
with open('data/stopwords-ca.txt', 'r', encoding='utf-8') as f:
    stopwords_ca = [unidecode.unidecode(word.strip()) for word in f if word.strip()]

In [None]:
# Execute the companion notebook utils.ipynb from this kernel.
# NOTE(review): presumably this is where set_spacy_extensions, get_goal_texts and
# clean_labels_mapping (used further down but not defined in the visible cells)
# come from -- confirm. This cell must run before any cell that calls them.
%run utils.ipynb

In [49]:
def process_text(string, remove_stopwords=True, stemming=False, language='en'):
    """
    Apply standard pre-processing techniques to a text and return the normalized string.

    Steps, in order: transliterate to ASCII, lower-case, collapse dotted
    abbreviations ("u.s.a." -> "usa"), replace punctuation with spaces,
    optionally drop stop words, optionally stem (English only), strip.

    Parameters:
        string: the raw text.
        remove_stopwords: if True, remove stop words. For 'en' gensim's list is
            used; for any other language the module-level `stopwords_ca` list is
            used and single-character tokens are dropped as well.
        stemming: if True and language is 'en', apply gensim's stemmer.
            For 'ca' a warning is printed and no stemming is done.
        language: 'en' or 'ca'.

    Returns:
        The normalized string.
    """
    if language == 'ca' and stemming:
        print('Warning: cannot perform stemming on catalan.')

    string = unidecode.unidecode(string).lower()

    # Collapse dotted abbreviations into a single token. The leading \b keeps the
    # pattern from matching the last letter of an ordinary word followed by a
    # period ("data.next"), which would otherwise glue the two words together.
    string = re.sub(r'\b(?:[a-z]\.)+', lambda match: match.group(0).replace('.', ''), string)
    string = pproc.strip_punctuation2(string)

    if remove_stopwords:
        if language == 'en':
            string = pproc.remove_stopwords(string)
        else:
            # set membership instead of scanning the whole list for every token
            ca_stopwords = set(stopwords_ca)
            string = ' '.join(t for t in string.split() if len(t) > 1 and t not in ca_stopwords)

    if stemming and language == 'en':
        string = pproc.stem_text(string)

    return string.strip()

In [50]:
def compute_word_relevances(documents, max_df=0.8, min_df=1, language='en'):
    """
    Given a collection of spacy documents, estimate the relevance of each word
    inside each document by means of TF-IDF.

    The texts are normalized with `process_text` before vectorization. `max_df`
    and `min_df` are passed unchanged to `TfidfVectorizer` (words that appear in
    too many or too few documents are filtered out).

    The word relevances are stored in the extended attribute of each doc called
    "word_relevances", as a list of tuples (word, relevance) sorted by
    decreasing relevance. Relevances are rounded to two decimals; words that
    contain a digit or have a zero TF-IDF score in the document are omitted.

    Parameters:
        documents: re-iterable collection of objects exposing `.text` and the
            `._.word_relevances` extension (it is traversed twice).
        max_df, min_df: document-frequency thresholds for TfidfVectorizer.
        language: forwarded to `process_text`.

    Returns:
        None; the result is written onto each document.
    """
    texts = [process_text(doc.text, language=language) for doc in documents]
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement when available but keep working on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # The digit test depends only on the vocabulary, so do it once instead of
    # once per document.
    has_digit = [any(char.isdigit() for char in name) for name in feature_names]

    for tfidf_row, doc in zip(tfidf_matrix, documents):
        scores = tfidf_row.toarray().flatten()
        ranked_indices = np.argsort(scores, axis=None)[::-1]  # highest score first
        doc._.word_relevances = [(feature_names[index], round(scores[index], 2))
                                 for index in ranked_indices
                                 if scores[index] > 0 and not has_digit[index]]

In [51]:
def compute_sentence_relevance(sentence, doc, perc_relevant_words=1.0):
    """
    Estimate the relevance of a sentence for a document that contains it.

    The relevance is the average of the relevance scores of the document's
    relevant words that occur in the sentence. The scores are read from the
    extended attribute "word_relevances" of `doc`, so this function must be
    called only after `compute_word_relevances`.

    The sentence is normalized with `process_text` (stop words kept) and matched
    token by token, so that matching is case-insensitive and a word is not
    counted merely because it is a substring of another word.

    Parameters:
        sentence: object exposing `.text`.
        doc: object exposing `._.word_relevances`, a list of (word, score).
        perc_relevant_words: fraction of the document's words (ranked by
            relevance) that are considered.

    Returns:
        The mean score of the matched words, or 0 if none match.
    """
    top_n = int(perc_relevant_words * len(doc._.word_relevances))

    # Same normalization that produced the keys of word_relevances.
    sentence_words = set(process_text(sentence.text, remove_stopwords=False).split())

    matched_scores = [score for word, score in doc._.word_relevances[:top_n]
                      if word in sentence_words]
    if not matched_scores:
        return 0
    return sum(matched_scores) / len(matched_scores)

In [52]:
def refine_doc_vector(doc, perc_relevant_words=1.0, scale_by_tfidf=False, language='en'):
    """
    Build a custom vector representation for a given spacy document.

    Stop words, punctuation, spaces, brackets and tokens containing digits are
    always skipped. Of the remaining tokens, only those whose lower-cased text is
    among the top `perc_relevant_words` fraction of the document's
    "word_relevances" contribute. The result is the average of the contributing
    token vectors; if `scale_by_tfidf` is True each vector is weighted by the
    word's relevance (weighted average).

    The vector is stored in the extended attribute of the doc called
    "custom_vector". Its size follows the token vectors, so models with any
    embedding width are supported. If no token contributes, a zero vector of the
    same width is stored (768 when the doc has no tokens at all).

    Parameters:
        doc: iterable of tokens exposing `._.word_relevances`; must already have
            been processed by `compute_word_relevances`.
        perc_relevant_words: fraction of ranked words to keep.
        scale_by_tfidf: weight token vectors by relevance instead of equally.
        language: currently unused by this function.

    Returns:
        None; the result is written to `doc._.custom_vector`.
    """
    fallback_dim = 768  # width used when the doc is empty (the former hard-coded size)

    top_n = int(perc_relevant_words * len(doc._.word_relevances))
    # word -> relevance; setdefault keeps the first (highest-ranked) entry
    relevance_by_word = {}
    for word, score in doc._.word_relevances[:top_n]:
        relevance_by_word.setdefault(word, score)

    weighted_sum = None
    normalizer = 0
    for token in doc:
        if (token.is_stop or token.is_punct or token.is_digit or token.is_space or token.is_bracket
                or any(char.isdigit() for char in token.text)):
            continue

        # NOTE(review): word_relevances keys come from process_text, which also
        # transliterates to ASCII; accented tokens will therefore not match here.
        word = token.text.lower()
        if word not in relevance_by_word:
            continue

        factor = relevance_by_word[word] if scale_by_tfidf else 1
        contribution = token.vector * factor
        if weighted_sum is None:
            # size the accumulator from the data instead of assuming 768
            weighted_sum = np.zeros(contribution.shape[0])
        weighted_sum += contribution
        normalizer += factor

    if weighted_sum is None:
        dim = len(doc[0].vector) if len(doc) > 0 else fallback_dim
        weighted_sum = np.zeros(dim)
    elif normalizer > 0:
        weighted_sum /= normalizer

    doc._.custom_vector = weighted_sum

In [53]:
def refine_sentence_vector(sentence, doc, perc_relevant_words=1.0, scale_by_tfidf=False, language='en'):
    """
    Apply `refine_doc_vector` to a sentence, treating it exactly as if it were a
    spacy document.

    Before delegating, the word relevances of the containing document are
    transferred to the sentence (the very same list object is assigned, not a
    copy), so the sentence is ranked against its document's vocabulary.

    Parameters:
        sentence: the sentence to build a vector for; receives
            `._.word_relevances` from `doc`.
        doc: the document that owns the word relevances.
        perc_relevant_words, scale_by_tfidf, language: forwarded unchanged to
            `refine_doc_vector`.

    Returns:
        Whatever `refine_doc_vector` returns for the sentence.
    """
    inherited_relevances = doc._.word_relevances
    sentence._.word_relevances = inherited_relevances

    forwarded_options = dict(
        perc_relevant_words=perc_relevant_words,
        scale_by_tfidf=scale_by_tfidf,
        language=language,
    )
    return refine_doc_vector(sentence, **forwarded_options)

In [54]:
def compute_goal_scores(project_doc, goal_docs, similarity='custom'):
    """
    Given a document, compute its relevance for each of the sustainable
    development goals.

    With similarity='default' the score is `project_doc.similarity(goal)`; with
    similarity='custom' it is `project_doc._.custom_similarity(goal)`. The
    original author describes both as cosine similarities between the vectors
    (default or custom); the metrics themselves are defined outside this function.

    The returned list follows the order of `goal_docs`; it is not sorted by
    relevance.

    ATTENTION (original author's note, translated from Catalan):
    SDG 8 (decent work and economic growth), SDG 9 (industry, innovation and
    infrastructure) and SDG 17 (partnerships for the goals) have been excluded,
    because they would have no effect on data filtering, given that these three
    goals are inherent to all the collaborative R&D&I projects that the Platform
    brings together.

    Parameters:
        project_doc: the project document to score.
        goal_docs: iterable of goal documents.
        similarity: 'default' or 'custom'.

    Returns:
        A list with one score per goal document.

    Raises:
        AssertionError: if `similarity` is neither 'default' nor 'custom'.
    """
    assert similarity in ('default', 'custom')

    use_default = similarity == 'default'
    # The similarity callable is looked up per goal, so nothing is touched on
    # project_doc when goal_docs is empty.
    return [
        project_doc.similarity(goal) if use_default else project_doc._.custom_similarity(goal)
        for goal in goal_docs
    ]

In [55]:
def generate_project_and_goal_docs(projects_df, goals_df, use_goal_descriptions=True, use_goal_facts=True, 
                                   use_goal_targets=True, max_document_frequency=0.3, min_document_frequency=2, 
                                   perc_relevant_words=1.0, scale_by_tfidf=True, language='en'):
    """
    Build spacy docs for every project and every goal, rank their words by
    TF-IDF and compute their custom vectors.

    Side effect: the 'sdgName' column of `projects_df` is replaced in place by a
    list of labels per row (comma-separated strings are split, existing
    lists/tuples are kept, anything else becomes an empty list). The conversion
    is idempotent, so calling this function again on the same frame keeps the
    labels.

    Parameters:
        projects_df: frame with columns 'projectId', 'projectTitle',
            'projectAbstract' and 'sdgName'.
        goals_df: frame with a 'goal_label' column; also passed to
            `get_goal_texts`.
        use_goal_descriptions, use_goal_facts, use_goal_targets: forwarded to
            `get_goal_texts` as `description`, `facts` and `targets`.
        max_document_frequency, min_document_frequency: forwarded to
            `compute_word_relevances` as `max_df` / `min_df`.
        perc_relevant_words, scale_by_tfidf: forwarded to `refine_doc_vector`.
        language: 'en' (BERT-based spacy model) or 'ca' ('ca_fasttext_wiki_lg').

    Returns:
        (labeled_project_docs, unlabeled_project_docs, goal_docs), where project
        docs are split by whether their `._.goal_labels` list is non-empty.

    Raises:
        AssertionError: if `language` is not 'en' or 'ca'.
    """
    assert language in ('en','ca')

    if language == 'ca':
        nlp = spacy.load('ca_fasttext_wiki_lg')
    else:
        # load the spacy model with BERT embeddings
        nlp = en_trf_bertbaseuncased_lg.load()

    # NOTE(review): presumably registers the `._.` extension attributes used
    # below; defined outside the visible cells.
    set_spacy_extensions()

    # read the project data.
    # Rows that already hold a list (e.g. from a previous call) are preserved
    # instead of being reset to [].
    projects_df['sdgName'] = [
        g.split(',') if isinstance(g, str) else (list(g) if isinstance(g, (list, tuple)) else [])
        for g in projects_df['sdgName']
    ]
    project_attributes = [{'projectId':row['projectId'], 'projectTitle':row['projectTitle'], 'sdgName':row['sdgName']} 
                          for _,row in projects_df.iterrows()]

    print('\nCreating the project docs...')
    project_docs = []
    n_projects = len(projects_df['projectAbstract'])
    annotated = nlp.pipe(zip(projects_df['projectAbstract'], project_attributes), as_tuples=True)
    for h, (doc, attr) in enumerate(annotated):
        if h % 100 == 0:
            print(100 * (h / n_projects), '%')
        doc._.project_id = attr['projectId']
        doc._.project_title = attr['projectTitle']
        # NOTE(review): raises KeyError for a label missing from
        # clean_labels_mapping, whereas goal labels below are filtered -- confirm
        # this asymmetry is intended.
        doc._.goal_labels = [clean_labels_mapping[l] for l in attr['sdgName']]
        project_docs.append(doc)

    # read the sdg data.
    goal_texts = get_goal_texts(goals_df, description=use_goal_descriptions, facts=use_goal_facts, targets=use_goal_targets)
    goal_docs = []
    for doc, goal_label in nlp.pipe(zip(goal_texts, goals_df['goal_label']), as_tuples=True):
        # goals whose label is not in the mapping are left out
        if str(goal_label) in clean_labels_mapping:
            doc._.goal_labels = clean_labels_mapping[str(goal_label)] 
            goal_docs.append(doc)

    all_docs = goal_docs + project_docs

    print('\nComputing word relevances...')
    compute_word_relevances(all_docs, max_df=max_document_frequency, min_df=min_document_frequency,
                            language=language)
    print('Refining vectors...')
    for doc in all_docs:
        refine_doc_vector(doc, perc_relevant_words=perc_relevant_words, scale_by_tfidf=scale_by_tfidf, 
                          language=language)
    print('Done')

    labeled_project_docs = [pdoc for pdoc in project_docs if len(pdoc._.goal_labels) > 0]
    unlabeled_project_docs = [pdoc for pdoc in project_docs if len(pdoc._.goal_labels) == 0]

    return labeled_project_docs, unlabeled_project_docs, goal_docs