In [2]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from itertools import combinations
from sklearn.decomposition import TruncatedSVD
from sklearn_extra.cluster import KMedoids
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import spacy
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import requests
import gzip
import xmltodict

from typing import List, Set, Dict

In [3]:
# Ensure that necessary nltk resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lukas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/lukas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lukas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Retrieve data
----

In [4]:
def get_primary_name(names):
    if isinstance(names, list):
        for name in names:
            if name.get('@type') == 'primary':
                return name.get('@value')
    elif isinstance(names, dict) and names.get('@type') == 'primary':
        return names.get('@value')
    return None

def retrieve_comments(index):
    URL = "https://boardgamegeek.com/xmlapi2/thing?id=" + str(index) + "&type=boardgame&comments=1"
    response = requests.get(URL)
    data = xmltodict.parse(response.content)
    game_name = get_primary_name(data["items"]["item"]['name'])

    comments = []
    if 'items' in data and 'item' in data['items']:
        item = data['items']['item']
        if 'comments' in item and 'comment' in item['comments']:
            for comment in item['comments']['comment']:
                comments.append({game_name: comment['@value']})

    return pd.DataFrame(comments)

----

## Event concept extraction algorithm
----

The algorithm for capturing event concepts matches object concepts with normalized verb chunks. This is achieved by utilizing a parse graph that maps all the multi-word expressions contained in the knowledge bases.
1.	Match Object and Verb Phrases:
	- The algorithm searches for matches between the object concepts and the normalized verb phrases.
2.	Utilize a Parse Graph:
	- A directed, unweighted parse graph is used to quickly detect multi-word concepts without performing an exhaustive search through all possible word combinations that can form a commonsense concept.
3.	Remove Redundant Terms:
	- Single-word concepts, such as “house,” that already appear in the clause as part of a multi-word concept, like “beautiful house,” are considered pleonastic (providing redundant information) and are discarded.
4.	Extract Event Concepts:
	- The algorithm extracts event concepts such as “go market,” “buy some fruits,” “buy fruits,” and “buy vegetables.”
	- These event concepts represent Script-Based Object Concepts (SBoCs) and can be fed into a commonsense reasoning algorithm for further processing.

---- 

In [5]:
def event_concept_extraction(sentence: str) -> List[str]:
    """
    Extracts event concepts from a given sentence.

    This function processes a natural language sentence to extract event concepts by linking verbs with associated
    noun phrases. It stems the verbs and constructs concepts by combining them with the nouns and adjectives.

    Args:
        sentence (str): sentence to process

    Returns:
        List[str]: extracted event concepts
    """
    concepts: Set[str] = set()  # initialize an empty set to store unique concepts
    doc = nlp(sentence)  # process the sentence with spaCy

    for sent in doc.sents:
        # identify all verbs in the sentence
        verbs = [token for token in sent if token.pos_ == 'VERB']
        # identify all noun phrases in the sentence
        noun_phrases = list(sent.noun_chunks)

        for verb in verbs:
            # stem the verb
            stemmed_verb = verb.lemma_
            # find noun phrases associated with the verb
            associated_nouns = [np for np in noun_phrases if np.root.head == verb]

            for np in associated_nouns:
                # extract adjectives in the noun phrase
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                if len(np) > 1:
                    # if the noun phrase contains more than one word, form a concept with the verb and noun phrase
                    concept = f"{stemmed_verb} {' '.join([token.text for token in np])}"
                    if adjectives:
                        concept += f" {' '.join(adjectives)}"
                    concepts.add(concept)
                else:
                    # handle single-word noun phrases
                    single_word_concept = np.text
                    if not any(single_word_concept in concept for concept in concepts):
                        concept = f"{stemmed_verb} {single_word_concept}"
                        if adjectives:
                            concept += f" {' '.join(adjectives)}"
                        concepts.add(concept)

        for np in noun_phrases:
            # handle noun phrases associated with auxiliary verbs
            if np.root.head.pos_ == 'AUX':
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                concept = f"be {' '.join([token.text for token in np])}"
                if adjectives:
                    concept += f" {' '.join(adjectives)}"
                concepts.add(concept)

    return list(concepts)

In [6]:
sentence = str(retrieve_comments(3).iloc[0, 0])
print("Original comment:\n")
print(f"""\"{sentence}\"""")
concepts = event_concept_extraction(sentence)
print("\nExtracted concepts: ", concepts)

Original comment:

"What an elegant game.   Light rules, deep gameplay, language independent, attractive presentation. Do they still make games like this?  Some say it's abstract but to me it looks thematic. Castes, tiles, board and screen all make sense. Abstract is what Ingenious looks like; Samurai is not that abstract.   Rules are clear and concise. Knizia usually puts me off with intricate end game conditions but here the scoring is pretty logical. It's simple as that: you aim to either get a majority in two castes, or get a majority in one caste while staying competitive in the other two castes. In other terms: you need two big scores (AB) or one big score and two small ones (Abc).  There is a slight difference between editions: in the original if no player has majority in a caste, everyone loses. In the zman edition the player with most pieces of all castes wins.  First player advantage is overrated and is of no significance if players draw starting tiles randomly."

Extracted c

## Simple matching for classification
--- 

In [None]:
# simplified classes
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

In [7]:
def classify_concepts(concepts: List[str], class_keywords: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """
    Classifies extracted concepts into predefined categories based on keywords.

    Args:
        concepts (List[str]): The list of extracted concepts.
        class_keywords (Dict[str, List[str]]): A dictionary where keys are class names and values are lists of keywords.

    Returns:
        Dict[str, Set[str]]: A dictionary where keys are class names and values are sets of concepts that match the keywords.
    """
    classified_concepts: Dict[str, Set[str]] = {category: set() for category in class_keywords}

    for concept in concepts:
        for category, keywords in class_keywords.items():
            for keyword in keywords:
                if keyword in concept:
                    classified_concepts[category].add(concept)

    return classified_concepts

In [8]:
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

classified_concepts = classify_concepts(concepts, class_keywords)
print("\nClassified concepts: ", classified_concepts)


Classified concepts:  {'luck or alea': set(), 'bookkeeping': set(), 'downtime': set(), 'interaction': set(), 'bash the leader': set(), 'complicated': set(), 'complex': set()}


---
## Use of similarity meassure
--- 

In [22]:
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            vector = np.array(parts[1:], dtype=float)
            embeddings[word] = vector
    return embeddings

def get_combined_embedding(words, embeddings):
    valid_embeddings = [embeddings.get(word) for word in words if word in embeddings]
    if not valid_embeddings:
        return None
    return np.mean(valid_embeddings, axis=0)

def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_similarity(concept, class_keywords, embeddings, similarity_threshold=0.5, top_n_classes=3):
    concept_words = concept.split()
    concept_embedding = get_combined_embedding(concept_words, embeddings)
    if concept_embedding is None:
        return None, None
    
    similarities = []
    for class_name, keywords in class_keywords.items():
        class_embedding = get_combined_embedding(keywords, embeddings)
        if class_embedding is None:
            continue
        similarity = cosine_similarity(concept_embedding, class_embedding)
        if similarity is not None:
            similarities.append((class_name, similarity))
    
    # Filter classes based on similarity threshold
    similarities = [item for item in similarities if item[1] >= similarity_threshold]
    # Sort by similarity and get top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_classes = similarities[:top_n_classes]
    
    if not top_classes:
        return None, None
            
    return top_classes[0]  # Return the best class and its similarity

# load the embeddings
embeddings = load_embeddings('./data/numberbatch-en.txt')

# compute the similarity
assignments = {}
for concept in concepts:
    best_class, similarity = compute_similarity(concept, class_keywords, embeddings, similarity_threshold=0.1, top_n_classes=1)
    assignments[concept] = (best_class, similarity)

In [23]:
# ouput of the resutls
print("Assignments:")
for concept, best_class in assignments.items():
    if best_class != (None, None):
        class_name, similarity = best_class
        print(f"Concept '{concept}' is assigned to class '{class_name}' with similarity {similarity:.2f}")
    else:
        print(f"Concept '{concept}' could not be assigned to any class")

Assignments:
Concept 'look it' could not be assigned to any class
Concept 'aim you' is assigned to class 'luck or alea' with similarity 0.15
Concept 'overrate First player advantage First' is assigned to class 'luck or alea' with similarity 0.22
Concept 'be Rules' could not be assigned to any class
Concept 'be it' could not be assigned to any class
Concept 'be It' could not be assigned to any class
Concept 'be the scoring' is assigned to class 'bookkeeping' with similarity 0.17
Concept 'make they' is assigned to class 'bash the leader' with similarity 0.15
Concept 'put Knizia' is assigned to class 'bash the leader' with similarity 0.14
Concept 'make games' is assigned to class 'complicated' with similarity 0.18
Concept 'start tiles' could not be assigned to any class
Concept 'make Castes' is assigned to class 'bash the leader' with similarity 0.19
Concept 'say Some' could not be assigned to any class
Concept 'be a slight difference slight' is assigned to class 'complicated' with simila

---