In [47]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn_extra.cluster import KMedoids
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import spacy
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import requests
import xmltodict

from typing import List, Set, Dict

In [43]:
# Fetch every NLTK resource the notebook relies on (tokenizer, POS tagger,
# WordNet data and the stop-word lists), in a fixed order.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_nltk_resource)

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Small English spaCy pipeline shared by the cells below.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lukas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/lukas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lukas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Retrieve data
----

In [44]:
def get_primary_name(names):
    """Return the ``@value`` of the first entry whose ``@type`` is ``'primary'``.

    Args:
        names: either one mapping or a list of mappings carrying ``@type``
            and ``@value`` keys.

    Returns:
        The ``@value`` of the first primary entry, or ``None`` when there is
        no primary entry or ``names`` is neither a list nor a dict.
    """
    if isinstance(names, list):
        candidates = names
    elif isinstance(names, dict):
        # Treat a lone mapping as a one-element list so both shapes share a path.
        candidates = [names]
    else:
        return None

    for entry in candidates:
        if entry.get('@type') == 'primary':
            return entry.get('@value')
    return None

def retrieve_comments(index):
    """Download the comments of one BoardGameGeek item.

    Args:
        index: BoardGameGeek thing id; it is converted with ``str`` and
            placed in the request URL.

    Returns:
        pd.DataFrame: one row per comment, in a single column named after the
        item's primary name. The frame is empty when the response contains no
        item or no comments.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    URL = "https://boardgamegeek.com/xmlapi2/thing?id=" + str(index) + "&type=boardgame&comments=1"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    data = xmltodict.parse(response.content)

    # Check the structure before indexing into it, so an unknown id yields an
    # empty frame instead of a KeyError.
    item = (data.get('items') or {}).get('item')
    if not isinstance(item, dict):
        return pd.DataFrame()

    game_name = get_primary_name(item.get('name'))

    raw_comments = (item.get('comments') or {}).get('comment') or []
    # xmltodict returns a bare mapping (not a list) when an element occurs
    # only once; wrap it so a single comment is not iterated key by key.
    if isinstance(raw_comments, dict):
        raw_comments = [raw_comments]

    comments = [{game_name: comment['@value']} for comment in raw_comments]
    return pd.DataFrame(comments)

----

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to the adjective,
    verb, noun and adverb constants of ``wn``; every other tag gives ``None``.
    """
    # Attribute names are resolved on ``wn`` only for the prefix that matches.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            return getattr(wn, attribute)
    return None

def extract_nouns(phrase):
    """Return the lemmatized nouns of ``phrase`` in order of appearance.

    The phrase is tokenized and POS-tagged with NLTK; tokens whose tag maps
    to the WordNet noun class are lemmatized as nouns.
    """
    tagged_tokens = pos_tag(word_tokenize(phrase))
    noun_lemmatizer = WordNetLemmatizer()
    return [
        noun_lemmatizer.lemmatize(token, pos=wn.NOUN)
        for token, treebank_tag in tagged_tokens
        if get_wordnet_pos(treebank_tag) == wn.NOUN
    ]

def extract_verbs_and_nouns(sentence):
    """Split the lemmatized verbs and nouns of ``sentence`` into two lists.

    Returns:
        tuple: ``(verbs, nouns)``, each a list of lemmas in order of
        appearance. Tokens of any other word class are ignored.
    """
    lemmatizer = WordNetLemmatizer()
    verbs, nouns = [], []

    for token, treebank_tag in pos_tag(word_tokenize(sentence)):
        pos = get_wordnet_pos(treebank_tag)
        if pos not in (wn.VERB, wn.NOUN):
            continue
        bucket = verbs if pos == wn.VERB else nouns
        bucket.append(lemmatizer.lemmatize(token, pos=pos))

    return verbs, nouns

In [None]:
# Generic verbs that simplify_concepts() filters out of its result.
basic_actions = ['use', 'enjoy', 'play', 'do', 'know', 'suspect', 'drop']
# Shared Porter stemmer used by find_possible_forms_of_objects().
stemmer = PorterStemmer()

def find_possible_forms_of_objects(nouns):
    """Expand nouns into a set of candidate surface forms.

    For each noun the set receives its Porter stem, every lemma name of every
    WordNet noun synset of the noun, and each of those lemma names with an
    ``'s'`` appended as a simple plural.
    """
    forms = set()
    for noun in nouns:
        forms.add(stemmer.stem(noun))

        lemma_names = [
            lemma.name()
            for synset in wn.synsets(noun, pos=wn.NOUN)
            for lemma in synset.lemmas()
        ]
        forms.update(lemma_names)
        # Plural candidates are built by plain suffixing.
        forms.update(name + 's' for name in lemma_names)

    return forms

def property_matches(concept):
    """Collect WordNet terms related to ``concept``.

    For every noun synset of the concept the result contains the concept
    itself, the synset's lemma names, and the head word (the part before the
    first ``'.'``) of each hypernym and hyponym synset name. A concept without
    noun synsets yields an empty set.
    """
    properties = set()
    for synset in wn.synsets(concept, pos=wn.NOUN):
        # The concept is only recorded once WordNet knows it as a noun.
        properties.add(concept)
        # Synonyms.
        properties.update(lemma.name() for lemma in synset.lemmas())
        # More general concepts.
        properties.update(parent.name().split('.')[0] for parent in synset.hypernyms())
        # More specific concepts.
        properties.update(child.name().split('.')[0] for child in synset.hyponyms())

    return properties

def link_objects_to_verb(verb, objects):
    """Pair ``verb`` with each object, giving ``"<verb> <object>"`` strings.

    The result is a list that follows the iteration order of ``objects``.
    """
    return [f"{verb} {obj}" for obj in objects]

def extract_event_concepts(sentence):
    """Build ``"<verb> <object>"`` event concepts for a sentence.

    Every verb lemma of the sentence is combined with every candidate object
    form derived from the sentence's nouns.

    Args:
        sentence (str): text to analyse.

    Returns:
        list: event concept strings, grouped by verb in order of appearance;
        within a verb the objects are in sorted order. Empty when the sentence
        has no verbs.
    """
    verbs, nouns = extract_verbs_and_nouns(sentence)
    if not verbs:
        return []

    # The object forms depend only on the nouns, so expand them once instead
    # of once per verb. They come back as a set, whose iteration order for
    # strings can differ between interpreter runs; sorting keeps the output
    # (and the order-sensitive grouping done downstream) reproducible.
    objects = sorted(find_possible_forms_of_objects(nouns))

    event_concepts = []
    for verb in verbs:
        event_concepts.extend(link_objects_to_verb(verb, objects))

    return event_concepts

def are_noun_phrases_similar(phrase1, phrase2):
    """Decide whether two phrases refer to related noun concepts.

    The phrases are dissimilar when they share no noun lemma. Otherwise each
    phrase's nouns are expanded into object forms and then into WordNet
    property matches; the phrases are similar when those two match sets
    overlap.

    Returns:
        bool: ``True`` when the expanded match sets have a common element.
    """
    nouns1 = extract_nouns(phrase1)
    nouns2 = extract_nouns(phrase2)

    # Cheap rejection before any WordNet expansion.
    if set(nouns1).isdisjoint(nouns2):
        return False

    def expand(nouns):
        matches = set()
        for concept in find_possible_forms_of_objects(nouns):
            matches.update(property_matches(concept))
        return matches

    return not expand(nouns1).isdisjoint(expand(nouns2))


def combine_similar_events(event_concepts):
    """Greedily group events whose noun phrases are similar.

    Walking the list from the front, each event that has not been grouped yet
    opens a new group and absorbs every later ungrouped event that
    ``are_noun_phrases_similar`` judges similar to it.

    Returns:
        list: groups (lists of event strings); each input event appears in
        exactly one group.
    """
    groups = []
    consumed = set()
    total = len(event_concepts)

    for anchor_idx in range(total):
        if anchor_idx in consumed:
            continue
        anchor = event_concepts[anchor_idx]
        group = [anchor]

        for other_idx in range(anchor_idx + 1, total):
            if other_idx in consumed:
                continue
            candidate = event_concepts[other_idx]
            if are_noun_phrases_similar(anchor, candidate):
                group.append(candidate)
                consumed.add(other_idx)

        groups.append(group)
        consumed.add(anchor_idx)

    return groups


def simplify_concepts(combined_events):
    """Flatten grouped events into a list of distinct words.

    Each event is split on whitespace; words listed in ``basic_actions`` and
    the bare bracket tokens ``[`` and ``]`` are left out. Duplicates are
    removed, so the order of the returned list is that of a set.
    """
    bracket_tokens = ('[', ']')
    kept_words = {
        word
        for group in combined_events
        for event in group
        for word in event.split()
        if word not in basic_actions and word not in bracket_tokens
    }
    return list(kept_words)

In [None]:
# classes and keywords
# NOTE: this mapping is not used by the NLTK pipeline in this cell.
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

comment = retrieve_comments(1)

# Take the raw text of the first comment. Stringifying the whole row as a
# list would wrap the text in "['...']" and feed the brackets and quotes to
# the tokenizer.
sentence = str(comment.iloc[0, 0])
print("The original comment: ", sentence)

# Example usage
event_concepts = extract_event_concepts(sentence)
print("Extracted Event Concepts:", event_concepts)

# Combine similar events
combined_events = combine_similar_events(event_concepts)
print("Combined Events:", combined_events)

simplified_concepts = simplify_concepts(combined_events)
print("Simplified Events:", simplified_concepts)

## Event concept extraction algorithm
----

The algorithm for capturing event concepts matches object concepts with normalized verb chunks. This is achieved by utilizing a parse graph that maps all the multi-word expressions contained in the knowledge bases.
1.	Match Object and Verb Phrases:
	- The algorithm searches for matches between the object concepts and the normalized verb phrases.
2.	Utilize a Parse Graph:
	- A directed, unweighted parse graph is used to quickly detect multi-word concepts without performing an exhaustive search through all possible word combinations that can form a commonsense concept.
3.	Remove Redundant Terms:
	- Single-word concepts, such as “house,” that already appear in the clause as part of a multi-word concept, like “beautiful house,” are considered pleonastic (providing redundant information) and are discarded.
4.	Extract Event Concepts:
	- The algorithm extracts event concepts such as “go market,” “buy some fruits,” “buy fruits,” and “buy vegetables.”
	- These event concepts represent Script-Based Object Concepts (SBoCs) and can be fed into a commonsense reasoning algorithm for further processing.

---- 

In [45]:
def event_concept_extraction(sentence: str) -> List[str]:
    """
    Extracts event concepts from a given text.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Within
    each sentence, every verb is combined (as its lemma) with the noun chunks
    whose root is attached to that verb. Noun chunks whose root is attached
    to an auxiliary produce a ``"be <noun phrase>"`` concept.

    A single-word noun chunk is skipped when the same word (compared
    case-insensitively, as a whole token) already occurs inside a multi-word
    noun chunk of the same sentence, because it would only repeat information.

    Args:
        sentence (str): text to process; it may contain several sentences

    Returns:
        List[str]: unique event concepts in order of first appearance
    """
    # A dict serves as an insertion-ordered set, so the result order is
    # reproducible from run to run.
    concepts: Dict[str, None] = {}
    doc = nlp(sentence)  # process the text with spaCy

    for sent in doc.sents:
        # all noun phrases of this sentence
        noun_phrases = list(sent.noun_chunks)
        # words already covered by a multi-word noun phrase of this sentence;
        # comparing whole tokens avoids accidental matches inside longer
        # words and makes the check independent of processing order
        covered_words = {
            token.text.lower()
            for chunk in noun_phrases
            if len(chunk) > 1
            for token in chunk
        }

        for verb in (token for token in sent if token.pos_ == 'VERB'):
            verb_lemma = verb.lemma_

            for chunk in noun_phrases:
                # keep only the noun phrases attached to this verb
                if not chunk.root.head == verb:
                    continue

                if len(chunk) > 1:
                    phrase = ' '.join(token.text for token in chunk)
                elif chunk.text.lower() in covered_words:
                    # redundant single word, already part of a longer phrase
                    continue
                else:
                    phrase = chunk.text

                concepts[f"{verb_lemma} {phrase}"] = None

        for chunk in noun_phrases:
            # noun phrases attached to an auxiliary verb
            if chunk.root.head.pos_ == 'AUX':
                concepts[f"be {' '.join(token.text for token in chunk)}"] = None

    return list(concepts)

In [50]:
# Fetch the comments of BoardGameGeek item 12 and keep the text of the first one.
sentence = str(retrieve_comments(12).iloc[0, 0])
print("Original comment:\n")
# Show the comment wrapped in double quotes.
print(f"""\"{sentence}\"""")
# Run the spaCy-based extractor on that comment.
concepts = event_concept_extraction(sentence)
print("\nExtracted concepts: ", concepts)

Original comment:

"Too many scoring options  make this game unfit for casual play."

Extracted concepts:  ['make Too many scoring options']


In [51]:

def classify_concepts(concepts: List[str], class_keywords: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """
    Classifies extracted concepts into predefined categories based on keywords.

    A concept belongs to a category when at least one of the category's
    keywords occurs in it as a substring. The comparison ignores case, because
    concepts keep the capitalization of the source text while keywords are
    usually written in lower case.

    Args:
        concepts (List[str]): The list of extracted concepts.
        class_keywords (Dict[str, List[str]]): A dictionary where keys are class names and values are lists of keywords.

    Returns:
        Dict[str, Set[str]]: A dictionary where keys are class names and values are sets of concepts
        (in their original spelling) that match the keywords. Every class is present, possibly with an empty set.
    """
    classified_concepts: Dict[str, Set[str]] = {category: set() for category in class_keywords}

    # Lower-case the keywords once instead of once per concept.
    normalized_keywords = {
        category: [keyword.lower() for keyword in keywords]
        for category, keywords in class_keywords.items()
    }

    for concept in concepts:
        concept_lower = concept.lower()
        for category, keywords in normalized_keywords.items():
            if any(keyword in concept_lower for keyword in keywords):
                classified_concepts[category].add(concept)

    return classified_concepts

In [52]:
# Category name -> keywords; a concept is assigned to a category when it
# contains one of that category's keywords.
class_keywords = {
    'luck or alea': ['luck', 'chance', 'alea'],
    'bookkeeping': ['bookkeeping', 'recording', 'rulebook'],
    'downtime': ['downtime', 'waiting'],
    'interaction': ['interaction', 'influence'],
    'bash the leader': ['bash the leader', 'sacrifice'],
    'complicated': ['complicated', 'rules', 'exceptions'],
    'complex': ['complex', 'repercussions', 'unpredictable'],
}

# Classify the concepts extracted by the previous cell.
classified_concepts = classify_concepts(concepts, class_keywords)
print("\nClassified concepts: ", classified_concepts)


Classified concepts:  {'luck or alea': set(), 'bookkeeping': set(), 'downtime': set(), 'interaction': set(), 'bash the leader': set(), 'complicated': set(), 'complex': set()}
