In [309]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from itertools import combinations
from scipy.spatial.distance import euclidean
from sklearn.decomposition import TruncatedSVD
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import spacy
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import requests
import gzip
import xmltodict

from typing import Union, List, Dict, Optional, Tuple

In [310]:
# Ensure that necessary nltk resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lukas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/lukas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lukas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Retrieve data
----

In [311]:
def get_primary_name(names: Union[List[Dict[str, str]], Dict[str, str]]) -> Optional[str]:
    """
    Retrieves the primary name from the given names and formats it to lowercase with underscores.

    Args:
        names (Union[List[Dict[str, str]], Dict[str, str]]): The list or dict containing name information.

    Returns:
        Optional[str]: The formatted primary name, or None if not found.
    """
    if isinstance(names, list):
        for name in names:
            if name.get('@type') == 'primary':
                primary_name = name.get('@value')
                return primary_name.lower().replace(' ', '_')
    elif isinstance(names, dict) and names.get('@type') == 'primary':
        primary_name = names.get('@value')
        return primary_name.lower().replace(' ', '_')
    return None

def retrieve_comments(index: int) -> pd.DataFrame:
    """
    Retrieves comments for the board game with the given index from the BoardGameGeek API and formats the game name.

    Args:
        index (int): The index of the board game.

    Returns:
        pd.DataFrame: DataFrame containing the comments with the formatted game name as keys.
    """
    URL = f"https://boardgamegeek.com/xmlapi2/thing?id={index}&type=boardgame&comments=1"
    response = requests.get(URL)
    data = xmltodict.parse(response.content)
    
    game_name = get_primary_name(data["items"]["item"]['name'])

    comments = []
    if 'items' in data and 'item' in data['items']:
        item = data['items']['item']
        if 'comments' in item and 'comment' in item['comments']:
            for comment in item['comments']['comment']:
                comments.append({game_name: comment['@value']})

    return pd.DataFrame(comments)

In [312]:
retrieve_comments(20)

Unnamed: 0,full_metal_planète
0,"Box has shelfwear, contents in great condition..."
1,"Excellent mechanics, but the gameplay may be a..."
2,Exciting and tense game so far out of print yo...
3,I made a homemade copy. Still working on finis...
4,2 exemplaires
...,...
95,+ expansion!
96,An almost purely tactical game with very littl...
97,This is certainly one of the best game I playe...
98,A very good strategy game


----

## Event concept extraction algorithm
----

The algorithm for capturing event concepts matches object concepts with normalized verb chunks. This is achieved by utilizing a parse graph that maps all the multi-word expressions contained in the knowledge bases.
1.	Match Object and Verb Phrases:
	- The algorithm searches for matches between the object concepts and the normalized verb phrases.
2.	Utilize a Parse Graph:
	- A directed, unweighted parse graph is used to quickly detect multi-word concepts without performing an exhaustive search through all possible word combinations that can form a commonsense concept.
3.	Remove Redundant Terms:
	- Single-word concepts, such as “house,” that already appear in the clause as part of a multi-word concept, like “beautiful house,” are considered pleonastic (providing redundant information) and are discarded.
4.	Extract Event Concepts:
	- The algorithm extracts event concepts such as “go market,” “buy some fruits,” “buy fruits,” and “buy vegetables.”
	- These event concepts represent Script-Based Object Concepts (SBoCs) and can be fed into a commonsense reasoning algorithm for further processing.

---- 

In [313]:
def event_concept_extraction(sentence: str) -> List[str]:
    """
    Extracts event concepts from a given sentence.

    Args:
        sentence (str): sentence to process

    Returns:
        List[str]: extracted event concepts
    """
    concepts: Set[str] = set()  # initialize an empty set to store unique concepts
    doc = nlp(sentence)  # process the sentence with spaCy

    for sent in doc.sents:
        # identify all verbs in the sentence
        verbs = [token for token in sent if token.pos_ == 'VERB']
        # identify all noun phrases in the sentence
        noun_phrases = list(sent.noun_chunks)

        for verb in verbs:
            # stem the verb
            stemmed_verb = verb.lemma_
            # find noun phrases associated with the verb
            associated_nouns = [np for np in noun_phrases if np.root.head == verb]

            for np in associated_nouns:
                # extract adjectives in the noun phrase
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                if len(np) > 1:
                    # if the noun phrase contains more than one word, form a concept with the verb and noun phrase
                    concept = f"{stemmed_verb} {' '.join([token.text for token in np])}"
                    if adjectives:
                        concept += f" {' '.join(adjectives)}"
                    concepts.add(concept)
                else:
                    # handle single-word noun phrases
                    single_word_concept = np.text
                    if not any(single_word_concept in concept for concept in concepts):
                        concept = f"{stemmed_verb} {single_word_concept}"
                        if adjectives:
                            concept += f" {' '.join(adjectives)}"
                        concepts.add(concept)

        for np in noun_phrases:
            # handle noun phrases associated with auxiliary verbs
            if np.root.head.pos_ == 'AUX':
                adjectives = [token.text for token in np if token.pos_ == 'ADJ']
                concept = f"be {' '.join([token.text for token in np])}"
                if adjectives:
                    concept += f" {' '.join(adjectives)}"
                concepts.add(concept)

    return list(concepts)

def extract_concepts_from_text(text: str) -> List[str]:
    """
    Extracts event concepts from a given text by splitting it into sentences first.

    Args:
        text (str): text to process

    Returns:
        List[str]: extracted event concepts
    """
    all_concepts = set()
    doc = nlp(text)
    for sent in doc.sents:
        concepts = event_concept_extraction(sent.text)
        all_concepts.update(concepts)
    return list(all_concepts)

def extract_concepts_from_series(series: pd.Series) -> pd.Series:
    """
    Extracts event concepts from a pandas Series.

    Args:
        series (pd.Series): Series to process

    Returns:
        pd.Series: Series with extracted event concepts
    """
    all_concepts_series = series.apply(lambda text: extract_concepts_from_text(str(text)))
    return all_concepts_series

def extract_concepts_from_dataframe(df: pd.DataFrame, text_columns: List[str]) -> pd.DataFrame:
    """
    Extracts event concepts from specified columns of a pandas DataFrame.

    Args:
        df (pd.DataFrame): DataFrame to process
        text_columns (List[str]): List of column names containing text data to process

    Returns:
        pd.DataFrame: DataFrame with extracted event concepts in corresponding new columns
    """
    result_df = df.copy()
    for column in text_columns:
        result_df[f"{column}_concepts"] = extract_concepts_from_series(df[column])
    return result_df

### Prepare comments for a board game

In [314]:
comments = retrieve_comments(51)
concepts = extract_concepts_from_dataframe(comments, [comments.columns[0]])
concepts.to_csv(f"./{concepts.columns[0]}.csv")
concepts.head(15)

Unnamed: 0,ricochet_robots,ricochet_robots_concepts
0,Very interesting multi-player puzzle game. The...,"[be my feeling, get the game, change every tur..."
1,It's a brillant game that can be played by a t...,"[be It, play that, deserve it, be a brillant g..."
2,mind consuming game. A party game just because...,"[play you, play it, play A party game]"
3,It is a wonderful puzzle that is great fun in ...,"[be that, be great fun great, be It, be a wond..."
4,Brain burner! Easy play but very challenging.,[]
5,Makes your brain melt. Best played without 'th...,[]
6,reviewed on my blog.,[]
7,"Not for everyone, but I love it, particularly ...","[frustrate the others, have an even group even..."
8,"Ricochet Robot is a great brain burner, but it...","[be Ricochet Robot, guess I, love who, have go..."
9,played,[]


### Load embedding for clustering

In [315]:
def load_embeddings(file_path: str) -> Dict[str, np.ndarray]:
    """
    Loads word embeddings from a file.

    Args:
        file_path (str): Path to the embeddings file.

    Returns:
        Dict[str, np.ndarray]: Dictionary of word embeddings.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            vector = np.array(parts[1:], dtype=float)
            embeddings[word] = vector
    return embeddings

def get_combined_embedding(words: List[str], embeddings: Dict[str, np.ndarray]) -> Optional[np.ndarray]:
    """
    Computes the combined embedding for a list of words.

    Args:
        words (List[str]): List of words.
        embeddings (Dict[str, np.ndarray]): Dictionary of word embeddings.

    Returns:
        Optional[np.ndarray]: Combined embedding vector, or None if no valid embeddings found.
    """
    valid_embeddings = [embeddings.get(word) for word in words if word in embeddings]
    if not valid_embeddings:
        return None
    return np.mean(valid_embeddings, axis=0)


def process_concepts_from_dataframe(df: pd.DataFrame, column_name: str, embeddings: Dict[str, np.ndarray]) -> pd.DataFrame:
    """
    Processes concepts from a DataFrame column and computes their embeddings.

    Args:
        df (pd.DataFrame): DataFrame containing original sentences and concepts.
        embeddings (Dict[str, np.ndarray]): Dictionary of word embeddings.

    Returns:
        pd.DataFrame: DataFrame with original sentences, concepts, and their embeddings.
    """
    processed_data = []

    for _, row in df.iterrows():
        sentence = row[column_name]
        concepts = row[column_name + '_concepts']
        concept_embedding = []
        num_concepts = 0
        for concept in concepts:
            words = concept.split()
            embedding = get_combined_embedding(words, embeddings)

            if embedding is not None:
                num_concepts += 1
                concept_embedding.append(embedding)

        processed_data.append({
            'sentence': sentence,
            'concept': concepts,
            'embedding': concept_embedding,
            'num_concepts': num_concepts,
        })

    result_df = pd.DataFrame(processed_data)
    return result_df


In [316]:
# Load embeddings
embeddings = load_embeddings('./data/numberbatch-en.txt')

# Process concepts and compute embeddings
concepts = process_concepts_from_dataframe(concepts, concepts.columns[0], embeddings)

In [317]:
def cluster_concepts(concepts: pd.DataFrame, num_clusters: int) -> pd.DataFrame:
    """
    Clusters each concept using KMedoids separately and adds clustering results to the DataFrame.

    Args:
        concepts (pd.DataFrame): DataFrame containing concepts and their embeddings.
        num_clusters (int): number of clusters

    Returns:
        pd.DataFrame: DataFrame with clustering results.
    """
    clustered_concepts_list = []
    
    for index, row in concepts.iterrows():
        embedding = row['embedding']
        concepts = row['concept']
        num_concepts = row['num_concepts']

        reduced_concepts = []

        if num_concepts != 0:
            # Perform KMedoids clustering on this set
            kmedoids = KMedoids(n_clusters=min(num_clusters, num_concepts), random_state=0).fit(embedding)
            labels = kmedoids.labels_

            clustered_concepts = {}
            for label, concept in zip(labels, concepts):
                if label not in clustered_concepts:
                    clustered_concepts[label] = []
                clustered_concepts[label].append((concept, embedding[concepts.index(concept)]))

            for label, items in clustered_concepts.items():
                medoid_index = kmedoids.medoid_indices_[label]
                reduced_concepts.append(concepts[medoid_index])
            
        # Add clustering results to a new DataFrame
        clustered_concepts_list.append({
            'concept': concepts,
            'clustered_concepts': reduced_concepts,
        })
    
    return pd.DataFrame(clustered_concepts_list)

# Cluster the concepts and get the reduced concepts
reduced_concepts = cluster_concepts(concepts, 3)



## Definition of the pre-defined classes with keywords


--- 


In [318]:
# simplified classes
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

---
## Use of similarity meassure
--- 

In [319]:
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> Optional[float]:
    if vec1 is None or vec2 is None:
        return None
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def compute_similarity(concept: str, class_keywords: Dict[str, List[str]], embeddings: Dict[str, np.ndarray], 
                       similarity_threshold: float = 0.5, top_n_classes: int = 3) -> Optional[Tuple[str, float]]:
    
    concept_embedding = get_combined_embedding(concept.split(), embeddings)
    if concept_embedding is None:
        return None
    
    similarities = []
    for class_name, keywords in class_keywords.items():
        class_embedding = get_combined_embedding(keywords, embeddings)
        if class_embedding is None:
            continue
        similarity = cosine_similarity(concept_embedding, class_embedding)
        if similarity is not None:
            similarities.append((class_name, similarity))
    
    # Filter classes based on similarity threshold
    similarities = [item for item in similarities if item[1] >= similarity_threshold]
    # Sort by similarity and get top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_classes = similarities[:top_n_classes]
    
    if not top_classes:
        return None
            
    return top_classes[0]  # Return the best class and its similarity

def assign_concepts_to_classes(concepts: pd.DataFrame, class_keywords: Dict[str, List[str]], embeddings: Dict[str, np.ndarray], 
                               similarity_threshold: float = 0.1, top_n_classes: int = 1) -> pd.DataFrame:
    assignment_results = []
    for _, row in concepts.iterrows():
        concepts = row['clustered_concepts']
        assignments = {}
        for concept in concepts:
            best_class = compute_similarity(concept, class_keywords, embeddings, similarity_threshold, top_n_classes)
            assignments[concept] = best_class
        
        assignment_results.append({
            'classified_concepts': assignments 
        })
     
    assignment_df = pd.DataFrame(assignment_results)
    return assignment_df

In [320]:
# Assign concepts to classes
assigned_classes = assign_concepts_to_classes(reduced_concepts, class_keywords, embeddings)

# Output the results
assigned_classes.to_csv("test.csv")

---