#### In this exercise we compute the similarity between the definitions given by us (unito students) for 4 terms. The purpose of the exercise is to prove that defining a term is more complicated than one might think, especially if the term is abstract and/or generic.
#### The terms used for the experiment are:
- Emotion
- Person
- Revenge
- Brick

In [60]:
from collections import defaultdict, Counter
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from typing import AnyStr, Set, List, Dict
from numpy import dot
from numpy.linalg import norm

### Pre-processing

In [47]:
# Pre-processing of a sentence
def bag_of_words(sentence: AnyStr) -> Set[AnyStr]:
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))


# Remove stopwords from a word list
def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    return [value.lower() for value in words if value.lower() not in stop_words]


# Get tokens from sentence
def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    words = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        words.append(lmtzr.lemmatize(tag[0]).lower())
    return words


# Remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    return re.sub('\s\s+', ' ', re.sub(r'[^\w\s]', '', sentence))

### Data Load

In [52]:
# Load stop words from file
def import_stop_words() -> Set[AnyStr]:
    with open('data/stop_words_FULL.txt') as f:
        stop_words = {line for line in f.read().splitlines()}
    return stop_words


# Load definitions and build a map
def import_definitions_map() -> Dict[AnyStr, List[Set[AnyStr]]]:
    definitions_map = defaultdict(list)
    with open('data/definitions.csv') as f:
        for line in f.readlines()[1:]:
            splits = line.split("~|~")
            word = splits[0]
            definitions = splits[1:]
            for definition in definitions:
                if bow_words := bag_of_words(definition):
                    definitions_map[word].append(bow_words)
    return dict(definitions_map)
            

stop_words = import_stop_words()
definitions_map = import_definitions_map()

for word, definitions in definitions_map.items():
    print(f"{word}: {definitions}\n")

Emotion: [{'concept', 'feel', 'human', 'situation', 'range'}, {'feel'}, {'animal', 'feel'}, {'bad', 'good', 'feel'}, {'sensation', 'arising', 'feeling', 'human', 'form'}, {'mind', 'living', 'percieve'}, {'human', 'feeling', 'express', 'animal'}, {'word', 'entity', 'feel', 'sentiment', 'living', 'express', 'throw', 'body'}, {'moment', 'feel'}, {'ming', 'well', 'experienced', 'happening', 'event', 'feeling', 'sentient'}, {'reation', 'action', 'sentimental'}, {'feel'}, {'love', 'sadness', 'happyness', 'feeling', 'human'}, {'general', 'concept', 'feeling'}, {'deriving', 'feeling', 'human', 'life'}, {'human', 'sensation'}, {'feeling', 'animal'}, {'psychic', 'emotion'}, {'human', 'feel'}, {'sensation', 'positive', 'affect', 'negative', 'mood', 'human'}, {'feeling', 'living'}, {'feeling'}, {'internal', 'external', 'person', 'caused', 'mind'}, {'reaction', 'mental'}, {'strong', 'feeling'}, {'felt', 'living', 'feelingstate'}, {'agent', 'mental'}, {'mood', 'mental'}, {'feeling', 'dued'}, {'physi

In [49]:
def cosine_similarity(def1, def2) -> float:
    # To avoid set of a single string, we do this check
    def1 = def1 if type(def1) is set else {def1}
    def2 = def2 if type(def2) is set else {def2}

    # Union of all the words for two definitions
    defs_union = list(def1 | def2)
    def_vec1 = []
    def_vec2 = []

    # Binary vectors computation that we'll use to compute cosine score
    for word in defs_union:
        def_vec1.append(1) if word in def1 else def_vec1.append(0)
        def_vec2.append(1) if word in def2 else def_vec2.append(0)
    
    # Cosine similarity computation (overlap)
    cosine_score = dot(def_vec1, def_vec2) / (norm(def_vec1) * norm(def_vec2))
    return cosine_score

In [81]:
# Compute cosine scores for all the possible combinations between definitions
def compute_scores(definitions_map: Dict[AnyStr, List[Set[AnyStr]]]) -> float:
    scores = dict()
    for concept, definitions in definitions_map.items():
        avg_similarity = 0
        combinations_count = 0

        for def1 in definitions:
            for def2 in definitions:
                if def1 != def2:
                    avg_similarity += cosine_similarity(def1, def2)
                    combinations_count += 1
        
        # Compute average similarity
        scores[concept] = avg_similarity / combinations_count
    return scores


# Compute mean score for the 3 most frequent words
def most_frequent_words_score(definitions_map: Dict[AnyStr, List[Set[AnyStr]]], words_num: int = 3) -> float:
    scores = dict()
    for concept, definitions in definitions_map.items():
        concept_counter = Counter()
        
        for definition in definitions:
            concept_counter.update(definition)
        
        frequent_words_counter = {entry[0] : 0 for entry in concept_counter.most_common(words_num)}
        print(f"The most frequent words for {concept} are: {list(frequent_words_counter.keys())}")

        for definition in definitions:
            for word in frequent_words_counter:
                if word in definition:
                    frequent_words_counter[word] += 1
        
        mean = 0
        for word, count in frequent_words_counter.items():
            mean += count/len(definitions)

        scores[concept] = mean/words_num
    return scores
    

In [87]:
# Compute cosine scores for all the definitions of the words
scores = compute_scores(definitions_map)
print(f"Average definition cosine similarity: \n{scores}\n")

# Compute mean scores for only the most 3 frequent words in the definitions
mfw_score = most_frequent_words_score(definitions_map, 3)
print(f"Average definition mean score for the most 3 frequent words: \n{mfw_score}")

Average definition cosine similarity: 
{'Emotion': 0.11487493413331017, 'Person': 0.3086137065020268, 'Revenge': 0.07504048944139909, 'Brick': 0.2985398157677549}

The most frequent words for Emotion are: ['feeling', 'feel', 'human']
The most frequent words for Person are: ['human', 'person', 'living']
The most frequent words for Revenge are: ['anger', 'feeling', 'action']
The most frequent words for Brick are: ['material', 'object', 'construction']
Average definition mean score for the most 3 frequent words: 
{'Emotion': 0.3111111111111111, 'Person': 0.3655913978494623, 'Revenge': 0.2333333333333333, 'Brick': 0.5161290322580645}


### Conclusions

1. Through the cosine similarity, we see that the similarity between the definitions is higher for "Person" that is our most generic concepts
2. Through the most frequent words similarity, instead we got that the similarity is highest for the definition of "Brick", followed by "Person" that we got before

As we can see, both the scores method return low scores, therefore we can conclude that (as we expected) giving definitions to a concept is really a hard task!