In [1]:
import heapq

def tokenize(document):
    """
    Break a document into its non-empty, whitespace-trimmed segments.

    The only delimiter is the '.' character, a deliberately simple stand-in
    for real sentence splitting. Segments that are empty after trimming are
    dropped, and the delimiter itself is not kept.
    """
    segments = []
    for raw_piece in document.split('.'):
        trimmed = raw_piece.strip()
        if trimmed:
            segments.append(trimmed)
    return segments




In [2]:
def a_star_alignment(docA, docB):
    """
    Align the sentences of two documents with A* search.

    Both documents are split with ``tokenize`` and the search walks the grid
    of index pairs ``(i, j)`` from ``(0, 0)`` to ``(len(A), len(B))``. A step
    either pairs ``A[i]`` with ``B[j]`` (cost 0 when the two are equal
    ignoring case, otherwise 1) or skips one sentence on either side
    (cost 1; the missing side is reported as ``"-"``).

    Parameters
    ----------
    docA, docB : str
        The documents to compare.

    Returns
    -------
    list of (str, str, int)
        One ``(sentence_from_A, sentence_from_B, cost)`` tuple per step of a
        minimum-cost alignment, in document order. The list is empty when
        neither document contains a sentence. When several alignments share
        the minimum cost, the one discovered first is returned.
    """
    A = tokenize(docA)
    B = tokenize(docB)
    n, m = len(A), len(B)

    # Case-fold once up front instead of on every comparison in the loop.
    A_lower = [s.lower() for s in A]
    B_lower = [s.lower() for s in B]

    start, goal = (0, 0), (n, m)

    def heuristic(i, j):
        # The difference between the remaining lengths is the minimum number
        # of skips (cost 1 each) still required, so this never overestimates.
        return abs((n - i) - (m - j))

    # Parent pointers replace the per-entry path copies: each state remembers
    # only the step that reached it most cheaply, and the path is rebuilt
    # once at the goal.
    best_g = {start: 0}
    came_from = {}  # state -> (previous state, alignment step taken)

    # Heap entries are (f, g, state): integers only, so ties never fall
    # through to comparing sentence text.
    pq = [(heuristic(0, 0), 0, start)]
    visited = set()

    while pq:
        _, g, state = heapq.heappop(pq)

        if state in visited:
            continue  # stale entry: this state was already expanded
        visited.add(state)

        # Goal reached: walk the parent pointers back to the start.
        if state == goal:
            path = []
            while state in came_from:
                state, step = came_from[state]
                path.append(step)
            path.reverse()
            return path

        i, j = state

        # Generate neighbors as (next state, alignment step).
        neighbors = []
        if i < n and j < m:
            # 1. Align A[i] with B[j]
            cost = 0 if A_lower[i] == B_lower[j] else 1
            neighbors.append(((i + 1, j + 1), (A[i], B[j], cost)))
        if i < n:
            # 2. Skip A[i]
            neighbors.append(((i + 1, j), (A[i], "-", 1)))
        if j < m:
            # 3. Skip B[j]
            neighbors.append(((i, j + 1), ("-", B[j], 1)))

        for next_state, step in neighbors:
            new_g = g + step[2]
            # Only queue a neighbor when this route is strictly cheaper than
            # any route already found to it.
            if next_state in visited or new_g >= best_g.get(next_state, float("inf")):
                continue
            best_g[next_state] = new_g
            came_from[next_state] = (state, step)
            heapq.heappush(pq, (new_g + heuristic(*next_state), new_g, next_state))

    # Not expected in practice: the goal can always be reached by skipping.
    return None


In [3]:
def plagiarism_score(alignment):
    """
    Return the share of zero-cost steps in an alignment, as a percentage.

    ``alignment`` is a sequence of 3-tuples whose last element is the step
    cost. An empty (or otherwise falsy) alignment scores 0.0.
    """
    if alignment:
        step_count = len(alignment)
        zero_cost_steps = 0
        for _left, _right, step_cost in alignment:
            if step_cost == 0:
                zero_cost_steps += 1
        return (zero_cost_steps / step_count) * 100
    return 0.0


In [4]:
# Sample inputs: the two documents differ only in the wording of the second sentence.
doc1 = "Artificial intelligence is a branch of computer science. It deals with machine learning. A* search is widely used."
doc2 = "Artificial intelligence is a branch of computer science. It studies machine learning. A* search is widely used."

# Sentence-level alignment: a list of (sentence_from_doc1, sentence_from_doc2, cost) tuples.
alignment = a_star_alignment(doc1, doc2)

print("Alignment Result (A vs B):")
# Each sentence is left-justified in a 60-character column; a cost of 0 is shown as "Match".
for a, b, cost in alignment:
    print(f"{a:<60} | {b:<60} | {'Match' if cost==0 else 'Mismatch'}")

Alignment Result (A vs B):
Artificial intelligence is a branch of computer science      | Artificial intelligence is a branch of computer science      | Match
It deals with machine learning                               | It studies machine learning                                  | Mismatch
A* search is widely used                                     | A* search is widely used                                     | Match


In [5]:
# Percentage of alignment steps whose cost is 0, i.e. sentence pairs equal ignoring case.
score = plagiarism_score(alignment)
print(f"\nPlagiarism Similarity Score: {score:.2f}%")



Plagiarism Similarity Score: 66.67%
