In [1]:
# load all test_articles in
import os

test_article_dir = "../test_articles"

articles_map: dict[str, str] = {}

for article_path in os.listdir(test_article_dir):
    full_article_path = os.path.join(test_article_dir, article_path)
    with open(full_article_path, "r", encoding="utf-8") as file:
        articles_map[article_path[:-3]] = file.read()

In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer
from functools import cache

model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.

    Parameters:
    vec1 (numpy.ndarray): First vector.
    vec2 (numpy.ndarray): Second vector.

    Returns:
    float: Cosine similarity between vec1 and vec2.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

@cache
def vectorize(sentence):
    embeddings = model.encode(sentence)
    return np.array(embeddings)


title_vectors: dict[str, np.array] = {}
for i, title in enumerate(articles_map.keys()):
    title_vectors[title] = vectorize(title)
    print(f"{i / len(articles_map) * 100:.2f}%", end="\r")
print("Finished.")

Finished.


In [21]:
def tokenize(sentence: str) -> list[str]:
    return sentence.replace(".", "").replace("!", "").replace("?", "").replace(",", "").split(" ")

article_backlinks: dict[str, list[str]] = {}
all_titles = list(articles_map.keys())
title_tokenizations = {t: tokenize(t) for t in all_titles}
single_word_titles = [t for t in all_titles if len(title_tokenizations[t]) == 1]

similarity_thresh = 0.8

def parse_article(title: str, content: str):
    words = tokenize(content)
    backlinks: list[tuple[str, str]] = []

    # find 1 word matches
    for word in words:
        word_vec = vectorize(word)
        for title in single_word_titles:
            title_vec = title_vectors[title]
            if cosine_similarity(word_vec, title_vec) >= similarity_thresh:
                backlinks.append((word, title))

    # find 2 word matches
    for word, next_word in zip(words, words[1:]):
        query = f"{word} {next_word}"
        query_vec = vectorize(word)
        for title in all_titles:
            title_vec = title_vectors[title]
            if cosine_similarity(query_vec, title_vec) >= similarity_thresh:
                backlinks.append((query, title))

    return backlinks

for i, title in enumerate(articles_map.keys()):
    content = articles_map[title]
    backlinks = parse_article(title, content)
    article_backlinks[title] = backlinks
    print(f"{i / len(articles_map) * 100:.2f}%", end="\r")
print("Finished.")

Finished.


In [None]:
class Node:

    def __init__(self, title, edges):
        self.title = title
        self.edges = edges

[('Dublin', 'Dublin'),
 ('Ireland', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Geography\nDublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Culture\nDublin', 'Dublin'),
 ('Ireland', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Ireland', 'Dublin'),
 ('Dublin', 'Dublin'),
 ('Dublin is', 'Dublin'),
 ('Ireland located', 'Dublin'),
 ('Dublin is', 'Dublin'),
 ('Dublin has', 'Dublin'),
 ('Geography\nDublin is', 'Dublin'),
 ('Dublin and', 'Dublin'),
 ('Culture\nDublin is', 'Dublin'),
 ('Joyce Samuel', 'James Joyce'),
 ('Ireland and', 'Dublin'),
 ('Dublin was', 'Dublin'),
 ('Dublin The', 'Dublin'),
 ('Dublin University', 'Dublin'),
 ('Dublin and', 'Dublin'),
 ('Dublin City', 'Dublin'),
 ('Dublin Airport', 'Dublin'),
 ('Ireland Additionally', 'Dublin'),
 ('Dublin is', 'Dublin')]