In [368]:
import gzip
import requests
from itertools import combinations
from typing import Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
import spacy
import xmltodict
from numpy.linalg import norm
from scipy.spatial.distance import euclidean
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn_extra.cluster import KMedoids

import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Ensure that necessary nltk resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

## Retrieve data
----

In [370]:
def get_primary_name(names: Union[List[Dict[str, str]], Dict[str, str]]) -> Optional[str]:
    """
    Retrieves the primary name from the given names and formats it to lowercase with underscores.

    Args:
        names (Union[List[Dict[str, str]], Dict[str, str]]): The list or dict containing name information.

    Returns:
        Optional[str]: The formatted primary name, or None if not found.
    """
    if isinstance(names, list):
        for name in names:
            if name.get('@type') == 'primary':
                primary_name = name.get('@value')
                return primary_name.lower().replace(' ', '_')
    elif isinstance(names, dict) and names.get('@type') == 'primary':
        primary_name = names.get('@value')
        return primary_name.lower().replace(' ', '_')
    return None

def retrieve_comments(index: int) -> pd.DataFrame:
    """
    Retrieves comments for the board game with the given index from the BoardGameGeek API and formats the game name

    Args:
        index (int): The index of the board game.

    Returns:
        pd.DataFrame: comments with the formatted game name as keys
    """
    URL = f"https://boardgamegeek.com/xmlapi2/thing?id={index}&type=boardgame&comments=1"
    response = requests.get(URL)
    data = xmltodict.parse(response.content)
    
    game_name = get_primary_name(data["items"]["item"]['name'])

    comments = []
    if 'items' in data and 'item' in data['items']:
        item = data['items']['item']
        if 'comments' in item and 'comment' in item['comments']:
            for comment in item['comments']['comment']:
                comments.append({game_name: comment['@value']})

    return pd.DataFrame(comments)

In [371]:
retrieve_comments(52).head(15)

Unnamed: 0,mighty_empires
0,Played it a long time ago. I have now only a f...
1,"I loved this game as a teenager, but the compl..."
2,"Nice pieces and tiles. Basic and easy rules, a..."
3,A game from my childhood. We had a massive map...
4,Use the pieces for War of the Ring!
5,Loved this game! Wish they would reprint..in i...
6,"Several extra Metal Miniatures, extra White Dw..."
7,This game just seems to have missed the mark i...
8,As a stand alone I really enjoyed playing this...
9,It's a good campaign generator for Warhammer b...


----

## Event concept extraction algorithm
----

The algorithm for capturing event concepts matches object concepts with normalized verb chunks. This is achieved by utilizing a parse graph that maps all the multi-word expressions contained in the knowledge bases.
1.	Match Object and Verb Phrases:
	- The algorithm searches for matches between the object concepts and the normalized verb phrases.
2.	Utilize a Parse Graph:
	- A directed, unweighted parse graph is used to quickly detect multi-word concepts without performing an exhaustive search through all possible word combinations that can form a commonsense concept.
3.	Remove Redundant Terms:
	- Single-word concepts, such as “house,” that already appear in the clause as part of a multi-word concept, like “beautiful house,” are considered pleonastic (providing redundant information) and are discarded.
4.	Extract Event Concepts:
	- The algorithm extracts event concepts such as “go market,” “buy some fruits,” “buy fruits,” and “buy vegetables.”
	- These event concepts represent Script-Based Object Concepts (SBoCs) and can be fed into a commonsense reasoning algorithm for further processing.

---- 

In [372]:
def event_concept_extraction(sentence: str) -> List[str]:
    """
    Extracts event concepts from a given sentence.

    Args:
        sentence (str): sentence to process

    Returns:
        List[str]: extracted event concepts
    """
    concepts: Set[str] = set()  # initialize an empty set to store unique concepts
    doc = nlp(sentence)  # process the sentence with spaCy

    for sent in doc.sents:
        # identify all verbs in the sentence
        verbs = [token for token in sent if token.pos_ == 'VERB']
        # identify all noun phrases in the sentence
        noun_phrases = list(sent.noun_chunks)

        for verb in verbs:
            # stem the verb
            stemmed_verb = verb.lemma_
            # find noun phrases associated with the verb
            associated_nouns = [np for np in noun_phrases if np.root.head == verb]

            for np in associated_nouns:
                # extract adjectives in the noun phrase
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                if len(np) > 1:
                    # if the noun phrase contains more than one word, form a concept with the verb and noun phrase
                    concept = f"{stemmed_verb} {' '.join([token.text for token in np])}"
                    if adjectives:
                        concept += f" {' '.join(adjectives)}"
                    concepts.add(concept)
                else:
                    # handle single-word noun phrases
                    single_word_concept = np.text
                    if not any(single_word_concept in concept for concept in concepts):
                        concept = f"{stemmed_verb} {single_word_concept}"
                        if adjectives:
                            concept += f" {' '.join(adjectives)}"
                        concepts.add(concept)

        for np in noun_phrases:
            # handle noun phrases associated with auxiliary verbs
            if np.root.head.pos_ == 'AUX':
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                concept = f"be {' '.join([token.text for token in np])}"
                if adjectives:
                    concept += f" {' '.join(adjectives)}"
                concepts.add(concept)

    return list(concepts)

def extract_concepts_from_text(text: str) -> List[str]:
    """
    Extracts event concepts from a given text by splitting it into sentences first

    Args:
        text (str): text to process

    Returns:
        List[str]: extracted event concepts
    """
    all_concepts = set()
    doc = nlp(text)
    for sent in doc.sents:
        concepts = event_concept_extraction(sent.text)
        all_concepts.update(concepts)
    return list(all_concepts)

def extract_concepts_from_series(series: pd.Series) -> pd.Series:
    """
    Extracts event concepts from a pandas Series

    Args:
        series (pd.Series): Series to process

    Returns:
        pd.Series: extracted event concepts
    """
    all_concepts_series = series.apply(lambda text: extract_concepts_from_text(str(text)))
    return all_concepts_series

def extract_concepts_from_dataframe(df: pd.DataFrame, text_columns: List[str]) -> pd.DataFrame:
    """
    Extracts event concepts from specified columns of a pandas DataFrame

    Args:
        df (pd.DataFrame): DataFrame to process
        text_columns (List[str]): column names containing text data to process

    Returns:
        pd.DataFrame: extracted event concepts in corresponding new columns
    """
    result_df = df.copy()
    for column in text_columns:
        result_df[f"{column}_concepts"] = extract_concepts_from_series(df[column])
    return result_df

### Prepare comments for a board game

In [373]:
comments = retrieve_comments(52)
concepts = extract_concepts_from_dataframe(comments, [comments.columns[0]])
# concepts.to_csv(f"./{concepts.columns[0]}.csv")
concepts.head(15)

Unnamed: 0,mighty_empires,mighty_empires_concepts
0,Played it a long time ago. I have now only a f...,"[leave only a few spare parts few spare, play ..."
1,"I loved this game as a teenager, but the compl...","[mean the complexity, doubt I, love this game,..."
2,"Nice pieces and tiles. Basic and easy rules, a...",[run ancient or medieval campaigns ancient med...
3,A game from my childhood. We had a massive map...,"[have We, work the game, have a massive map ma..."
4,Use the pieces for War of the Ring!,[use the pieces]
5,Loved this game! Wish they would reprint..in i...,[reprint they]
6,"Several extra Metal Miniatures, extra White Dw...",[]
7,This game just seems to have missed the mark i...,"[have a massive head start massive, miss the m..."
8,As a stand alone I really enjoyed playing this...,"[play this, enjoy I, make numerous house ruled..."
9,It's a good campaign generator for Warhammer b...,"[be a good campaign generator good, be nothing..."


### Load embedding for clustering

In [374]:
def load_embeddings(file_path: str) -> Dict[str, np.ndarray]:
    """
    Loads word embeddings from a file

    Args:
        file_path (str): Path to the embeddings file

    Returns:
        Dict[str, np.ndarray]: word embeddings
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            vector = np.array(parts[1:], dtype=float)
            embeddings[word] = vector
    return embeddings

def get_combined_embedding(words: List[str], embeddings: Dict[str, np.ndarray]) -> Optional[np.ndarray]:
    """
    Computes the combined embedding for a list of words

    Args:
        words (List[str]): List of words.
        embeddings (Dict[str, np.ndarray]): word embeddings.

    Returns:
        Optional[np.ndarray]: Combined embedding vector, or None if no valid embeddings found
    """
    valid_embeddings = [embeddings.get(word) for word in words if word in embeddings]
    if not valid_embeddings:
        return None
    return np.mean(valid_embeddings, axis=0)


def process_concepts_from_dataframe(df: pd.DataFrame, column_name: str, embeddings: Dict[str, np.ndarray]) -> pd.DataFrame:
    """
    Processes concepts from a DataFrame column and computes their embeddings

    Args:
        df (pd.DataFrame): original sentences and concepts
        embeddings (Dict[str, np.ndarray]): word embeddings

    Returns:
        pd.DataFrame: original sentences, concepts, and their embeddings
    """
    processed_data = []

    for _, row in df.iterrows():
        sentence = row[column_name]
        concepts = row[column_name + '_concepts']
        concept_embedding = []
        num_concepts = 0
        for concept in concepts:
            words = concept.split()
            embedding = get_combined_embedding(words, embeddings)

            if embedding is not None:
                num_concepts += 1
                concept_embedding.append(embedding)

        processed_data.append({
            'sentence': sentence,
            'concept': concepts,
            'embedding': concept_embedding,
            'num_concepts': num_concepts,
        })

    result_df = pd.DataFrame(processed_data)
    return result_df


In [375]:
# Load embeddings
embeddings = load_embeddings('./data/numberbatch-en.txt')

# Process concepts and compute embeddings
concepts = process_concepts_from_dataframe(concepts, concepts.columns[0], embeddings)

In [376]:
def cluster_concepts(concepts: pd.DataFrame, num_clusters: int) -> pd.DataFrame:
    """
    Clusters each concept using KMedoids separately 

    Args:
        concepts (pd.DataFrame): DataFrame containing concepts and their embeddings.
        num_clusters (int): number of clusters

    Returns:
        pd.DataFrame: clustering results
    """
    clustered_concepts_list = []
    
    for index, row in concepts.iterrows():
        embedding = row['embedding']
        concepts = row['concept']
        num_concepts = row['num_concepts']

        reduced_concepts = []

        if num_concepts != 0:
            # Perform KMedoids clustering on this set
            kmedoids = KMedoids(n_clusters=min(num_clusters, num_concepts), random_state=0).fit(embedding)
            labels = kmedoids.labels_

            clustered_concepts = {}
            for label, concept in zip(labels, concepts):
                if label not in clustered_concepts:
                    clustered_concepts[label] = []
                clustered_concepts[label].append((concept, embedding[concepts.index(concept)]))

            for label, items in clustered_concepts.items():
                medoid_index = kmedoids.medoid_indices_[label]
                reduced_concepts.append(concepts[medoid_index])
            
        # Add clustering results to a new DataFrame
        clustered_concepts_list.append({
            'concept': concepts,
            'clustered_concepts': reduced_concepts,
        })
    
    return pd.DataFrame(clustered_concepts_list)

# Cluster the concepts and get the reduced concepts
reduced_concepts = cluster_concepts(concepts, 3)



## Definition of the pre-defined classes with keywords


--- 


In [377]:
# simplified classes
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea', 'out of control'],
    'bookkeeping': ['bookkeeping', 'manual recording', 'rulebook'],
    'downtime': ['downtime', 'waiting', 'unproductive'],
    'interaction': ['interaction', 'influence on other players'],
    'bash the leader': ['bash the leader', 'sacrifice themself', 'prevent victory'],
    'complicated': ['complicated', 'many rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable', 'difficult to master'],
}

---
## Use of similarity meassure
--- 

In [378]:
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> Optional[float]:
    if vec1 is None or vec2 is None:
        return None
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_similarity(concept: str, class_keywords: Dict[str, List[str]], embeddings: Dict[str, np.ndarray], 
                       similarity_threshold: float = 0.5, top_n_classes: int = 3) -> Optional[Tuple[str, float]]:
    
    concept_embedding = get_combined_embedding(concept.split(), embeddings)
    if concept_embedding is None:
        return None
    
    similarities = []
    for class_name, keywords in class_keywords.items():
        class_embedding = get_combined_embedding(keywords, embeddings)
        if class_embedding is None:
            continue
        similarity = cosine_similarity(concept_embedding, class_embedding)
        if similarity is not None:
            similarities.append((class_name, similarity))
    
    # Filter classes based on similarity threshold
    similarities = [item for item in similarities if item[1] >= similarity_threshold]
    # Sort by similarity and get top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_classes = similarities[:top_n_classes]
    
    if not top_classes:
        return None
            
    return top_classes[0]  # Return the best class and its similarity

def assign_concepts_to_classes(concepts: pd.DataFrame, class_keywords: Dict[str, List[str]], embeddings: Dict[str, np.ndarray], 
                               similarity_threshold: float = 0.2, top_n_classes: int = 1) -> pd.DataFrame:
    assignment_results = []
    for _, row in concepts.iterrows():
        concepts = row['clustered_concepts']
        assignments = {}
        for concept in concepts:
            best_class = compute_similarity(concept, class_keywords, embeddings, similarity_threshold, top_n_classes)
            assignments[concept] = best_class
        
        assignment_results.append({
            'classified_concepts': assignments 
        })
     
    assignment_df = pd.DataFrame(assignment_results)
    return assignment_df

In [379]:
# Assign concepts to classes
assigned_classes = assign_concepts_to_classes(reduced_concepts, class_keywords, embeddings)

# Output the results
assigned_classes

Unnamed: 0,classified_concepts
0,{'leave only a few spare parts few spare': ('d...
1,"{'love this game': None, 'enjoy it': None, 'se..."
2,{'run ancient or medieval campaigns ancient me...
3,"{'have a massive map massive': None, 'work the..."
4,{'use the pieces': None}
...,...
95,"{'buy two sets': None, 'enjoy this game': ('lu..."
96,"{'make which': None, 'take it': None, 'take ma..."
97,"{'play this': None, 'need new gaming friends n..."
98,"{'be it': None, 'be a reasonable system reason..."


---