In [9]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from itertools import combinations
from scipy.spatial.distance import euclidean
from sklearn.decomposition import TruncatedSVD
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import spacy
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import requests
import gzip
import xmltodict

from typing import List, Set, Dict

In [2]:
# Ensure that necessary nltk resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 8] nodename nor servname provided, or not
[nltk_data]     known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


## Retrieve data
----

In [3]:
def get_primary_name(names):
    if isinstance(names, list):
        for name in names:
            if name.get('@type') == 'primary':
                return name.get('@value')
    elif isinstance(names, dict) and names.get('@type') == 'primary':
        return names.get('@value')
    return None

def retrieve_comments(index):
    URL = "https://boardgamegeek.com/xmlapi2/thing?id=" + str(index) + "&type=boardgame&comments=1"
    response = requests.get(URL)
    data = xmltodict.parse(response.content)
    game_name = get_primary_name(data["items"]["item"]['name'])

    comments = []
    if 'items' in data and 'item' in data['items']:
        item = data['items']['item']
        if 'comments' in item and 'comment' in item['comments']:
            for comment in item['comments']['comment']:
                comments.append({game_name: comment['@value']})

    return pd.DataFrame(comments)

In [45]:
retrieve_comments(20)

Unnamed: 0,Full Metal Planète
0,"Box has shelfwear, contents in great condition..."
1,"Excellent mechanics, but the gameplay may be a..."
2,Exciting and tense game so far out of print yo...
3,I made a homemade copy. Still working on finis...
4,2 exemplaires
...,...
95,+ expansion!
96,An almost purely tactical game with very littl...
97,This is certainly one of the best game I playe...
98,A very good strategy game


----

## Event concept extraction algorithm
----

The algorithm for capturing event concepts matches object concepts with normalized verb chunks. This is achieved by utilizing a parse graph that maps all the multi-word expressions contained in the knowledge bases.
1.	Match Object and Verb Phrases:
	- The algorithm searches for matches between the object concepts and the normalized verb phrases.
2.	Utilize a Parse Graph:
	- A directed, unweighted parse graph is used to quickly detect multi-word concepts without performing an exhaustive search through all possible word combinations that can form a commonsense concept.
3.	Remove Redundant Terms:
	- Single-word concepts, such as “house,” that already appear in the clause as part of a multi-word concept, like “beautiful house,” are considered pleonastic (providing redundant information) and are discarded.
4.	Extract Event Concepts:
	- The algorithm extracts event concepts such as “go market,” “buy some fruits,” “buy fruits,” and “buy vegetables.”
	- These event concepts represent Script-Based Object Concepts (SBoCs) and can be fed into a commonsense reasoning algorithm for further processing.

---- 

In [39]:
def event_concept_extraction(sentence: str) -> List[str]:
    """
    Extracts event concepts from a given sentence.

    Args:
        sentence (str): sentence to process

    Returns:
        List[str]: extracted event concepts
    """
    concepts: Set[str] = set()  # initialize an empty set to store unique concepts
    doc = nlp(sentence)  # process the sentence with spaCy

    for sent in doc.sents:
        # identify all verbs in the sentence
        verbs = [token for token in sent if token.pos_ == 'VERB']
        # identify all noun phrases in the sentence
        noun_phrases = list(sent.noun_chunks)

        for verb in verbs:
            # stem the verb
            stemmed_verb = verb.lemma_
            # find noun phrases associated with the verb
            associated_nouns = [np for np in noun_phrases if np.root.head == verb]

            for np in associated_nouns:
                # extract adjectives in the noun phrase
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                if len(np) > 1:
                    # if the noun phrase contains more than one word, form a concept with the verb and noun phrase
                    concept = f"{stemmed_verb} {' '.join([token.text for token in np])}"
                    if adjectives:
                        concept += f" {' '.join(adjectives)}"
                    concepts.add(concept)
                else:
                    # handle single-word noun phrases
                    single_word_concept = np.text
                    if not any(single_word_concept in concept for concept in concepts):
                        concept = f"{stemmed_verb} {single_word_concept}"
                        if adjectives:
                            concept += f" {' '.join(adjectives)}"
                        concepts.add(concept)

        for np in noun_phrases:
            # handle noun phrases associated with auxiliary verbs
            if np.root.head.pos_ == 'AUX':
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                concept = f"be {' '.join([token.text for token in np])}"
                if adjectives:
                    concept += f" {' '.join(adjectives)}"
                concepts.add(concept)

    return list(concepts)

def extract_concepts_from_text(text: str) -> List[str]:
    """
    Extracts event concepts from a given text by splitting it into sentences first.

    Args:
        text (str): text to process

    Returns:
        List[str]: extracted event concepts
    """
    all_concepts = set()
    doc = nlp(text)
    for sent in doc.sents:
        concepts = event_concept_extraction(sent.text)
        all_concepts.update(concepts)
    return list(all_concepts)

In [46]:
sentence = str(retrieve_comments(50).iloc[0, 0])
print("Original comment:\n")
print(f"""\"{sentence}\"""")
concepts = extract_concepts_from_text(sentence)
print("\nExtracted concepts: ", concepts)

Original comment:

"Like it.  Lightweight.  Scoring requires some calculation."

Extracted concepts:  ['require Scoring', 'require some calculation']


In [47]:
# prepare embedding
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            vector = np.array(parts[1:], dtype=float)
            embeddings[word] = vector
    return embeddings

def get_combined_embedding(words, embeddings):
    valid_embeddings = [embeddings.get(word) for word in words if word in embeddings]
    if not valid_embeddings:
        return None
    return np.mean(valid_embeddings, axis=0)

embeddings = load_embeddings('./data/numberbatch-en.txt')

In [48]:
concept_embeddings = []
valid_concepts = []
for concept in concepts:
    words = concept.split()
    embedding = get_combined_embedding(words, embeddings)
    if embedding is not None:
        concept_embeddings.append(embedding)
        valid_concepts.append(concept)

In [49]:
num_clusters = 5 if len(concepts) > 5 else len(concepts)

kmedoids = KMedoids(n_clusters=num_clusters, random_state=0).fit(concept_embeddings)
labels = kmedoids.labels_

In [50]:
clustered_concepts = {}
for label, concept in zip(labels, valid_concepts):
    if label not in clustered_concepts:
        clustered_concepts[label] = []
    clustered_concepts[label].append((concept, concept_embeddings[valid_concepts.index(concept)]))

In [51]:
reduced_concepts = []
for label, items in clustered_concepts.items():
    medoid_index = kmedoids.medoid_indices_[label]
    reduced_concepts.append(valid_concepts[medoid_index])

In [52]:
print("Reduced Concepts:")
for concept in reduced_concepts:
    print(concept)

Reduced Concepts:
require Scoring
require some calculation


## Simple matching for classification
--- 

In [53]:
# simplified classes
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

In [54]:
def classify_concepts(concepts: List[str], class_keywords: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """
    Classifies extracted concepts into predefined categories based on keywords.

    Args:
        concepts (List[str]): The list of extracted concepts.
        class_keywords (Dict[str, List[str]]): A dictionary where keys are class names and values are lists of keywords.

    Returns:
        Dict[str, Set[str]]: A dictionary where keys are class names and values are sets of concepts that match the keywords.
    """
    classified_concepts: Dict[str, Set[str]] = {category: set() for category in class_keywords}

    for concept in concepts:
        for category, keywords in class_keywords.items():
            for keyword in keywords:
                if keyword in concept:
                    classified_concepts[category].add(concept)

    return classified_concepts

In [55]:
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

classified_concepts = classify_concepts(concepts, class_keywords)
print("\nClassified concepts: ", classified_concepts)


Classified concepts:  {'luck or alea': set(), 'bookkeeping': set(), 'downtime': set(), 'interaction': set(), 'bash the leader': set(), 'complicated': set(), 'complex': set()}


---
## Use of similarity meassure
--- 

In [56]:
def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_similarity(concept, class_keywords, embeddings, similarity_threshold=0.5, top_n_classes=3):
    concept_words = concept.split()
    concept_embedding = get_combined_embedding(concept_words, embeddings)
    if concept_embedding is None:
        return None, None
    
    similarities = []
    for class_name, keywords in class_keywords.items():
        class_embedding = get_combined_embedding(keywords, embeddings)
        if class_embedding is None:
            continue
        similarity = cosine_similarity(concept_embedding, class_embedding)
        if similarity is not None:
            similarities.append((class_name, similarity))
    
    # Filter classes based on similarity threshold
    similarities = [item for item in similarities if item[1] >= similarity_threshold]
    # Sort by similarity and get top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_classes = similarities[:top_n_classes]
    
    if not top_classes:
        return None, None
            
    return top_classes[0]  # Return the best class and its similarity

# compute the similarity
assignments = {}
for concept in concepts:
    best_class, similarity = compute_similarity(concept, class_keywords, embeddings, similarity_threshold=0.1, top_n_classes=1)
    assignments[concept] = (best_class, similarity)

In [57]:
# ouput of the resutls
print("Assignments:")
for concept, best_class in assignments.items():
    if best_class != (None, None):
        class_name, similarity = best_class
        print(f"Concept '{concept}' is assigned to class '{class_name}' with similarity {similarity:.2f}")
    else:
        print(f"Concept '{concept}' could not be assigned to any class")

Assignments:
Concept 'require Scoring' is assigned to class 'complicated' with similarity 0.32
Concept 'require some calculation' is assigned to class 'complicated' with similarity 0.35


---