In [5]:
import spacy
import numpy as np
from nltk.corpus import wordnet as wn

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

In [6]:
def preprocess_text(text):
    doc = nlp(text)
    return [token.text.lower() for token in doc if not token.is_stop and token.is_alpha]

# Function to find synonyms of a keyword using WordNet
def find_synonyms(keyword):
    synonyms = set()
    for synset in wn.synsets(keyword):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

In [7]:
# Function to calculate cosine similarity
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    if magnitude1 == 0 or magnitude2 == 0:
        return 0  # Handle zero division
    return dot_product / (magnitude1 * magnitude2)

# Function to calculate similarity between a token and a keyword (or its synonyms)
def calculate_similarity(token, keyword):
    # Use spaCy's word vectors for token and keyword
    token_vector = token.vector
    keyword_vector = nlp(keyword).vector
    return cosine_similarity(token_vector, keyword_vector)

In [8]:
def calculate_similarity_metric(text, keywords):
    preprocessed_text = preprocess_text(text)
    similarity_scores = []
    for token in preprocessed_text:
        token_similarity_scores = []
        for keyword in keywords:
            similarity_score = calculate_similarity(nlp(token), keyword)
            token_similarity_scores.append(similarity_score)
            synonyms = find_synonyms(keyword)
            for synonym in synonyms:
                similarity_score = calculate_similarity(nlp(token), synonym)
                token_similarity_scores.append(similarity_score)
        if token_similarity_scores:
            max_similarity_score = max(token_similarity_scores)
            similarity_scores.append(max_similarity_score)
    if similarity_scores:
        # Calculate the average similarity score
        similarity_metric = sum(similarity_scores) / len(similarity_scores)
        return similarity_metric
    else:
        return 0

In [13]:
# Example usage
text = """Interdisciplinary introduction to the basic concepts and approaches in Asian American Studies. 
Surveys the various dimensions of Asian American experiences including history, social organization, literature, arts, and politics."""
keywords = ["Asia", "Basic", "Art"]

similarity_metric = calculate_similarity_metric(text, keywords)
print("Similarity metric:", similarity_metric)

keywords2 = ["Asian", "Simple", "Artistic"]

similarity_metric = calculate_similarity_metric(text, keywords2)
print("Similarity metric:", similarity_metric)

Similarity metric: 0.7141436517238617
Similarity metric: 0.5619395181536675
