In [None]:
!pip install python-docx scikit-learn

In [None]:
# Ask the user for files through Colab's upload helper. The result is kept in
# `uploaded`; a later cell iterates `uploaded.items()` as (filename, file bytes) pairs.
from google.colab import files
uploaded = files.upload()

In [None]:
!pip install PyMuPDF

In [None]:
import fitz  # from PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import io

def extract_text_from_pdf(file_bytes):
    """Return the text of every page of a PDF as one whitespace-trimmed string.

    Args:
        file_bytes: Raw PDF content, passed to ``fitz.open`` as an in-memory
            stream with ``filetype="pdf"``.

    Returns:
        The page texts concatenated in page order, with leading and trailing
        whitespace stripped. Empty string when no page yields any text.
    """
    with fitz.open(stream=file_bytes, filetype="pdf") as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts).strip()

# Similarity score above which two documents are reported as possible copies.
# Assigned before the branch below so the name exists for later cells even when
# fewer than two documents could be compared.
threshold = 0.95

# Rebuilt from scratch on every run so re-executing this cell is idempotent.
file_texts = []
file_names = []

for filename, file_bytes in uploaded.items():
    try:
        text = extract_text_from_pdf(file_bytes)
    except Exception as exc:
        # One upload that cannot be opened or parsed should not abort the whole
        # batch; report it and carry on with the remaining files.
        print(f"\n--- {filename} ---\n⚠️ Skipped: could not extract text ({exc})")
        continue
    print(f"\n--- {filename} ---\n{text[:300]}")  # Preview
    # Only documents with extractable text take part in the comparison.
    if text:
        file_texts.append(text)
        file_names.append(filename)

if len(file_texts) < 2:
    print("\n❌ Not enough valid PDF documents with extractable text.")
else:
    # TF-IDF Vectorization + Similarity
    # NOTE: despite its name, `vectorizer` holds the matrix returned by
    # fit_transform, not the TfidfVectorizer instance itself.
    vectorizer = TfidfVectorizer(stop_words='english').fit_transform(file_texts)
    similarity_matrix = cosine_similarity(vectorizer)

    print("\n📋 Highly similar PDF document pairs (Possible copies):")
    found = False
    # Visit each unordered pair once (j > i), skipping self-comparisons.
    for i in range(len(file_names)):
        for j in range(i + 1, len(file_names)):
            score = similarity_matrix[i, j]
            if score > threshold:
                print(f"{file_names[i]} <--> {file_names[j]} : {score:.2f}")
                found = True
    if not found:
        print("No highly similar files found above threshold.")


In [None]:
# Create and save a text report of similar files, then offer it for download.

# The analysis cell assigns `threshold` only when at least two documents were
# compared; fall back to its value there so this cell still runs on its own.
try:
    report_threshold = threshold
except NameError:
    report_threshold = 0.95

# Header is derived from the threshold actually used, so the two cannot drift apart.
report_lines = [f"📋 Highly Similar PDF Document Pairs (Threshold = {report_threshold})\n"]

# Visit each unordered pair once (j > i). With fewer than two file names the
# inner range is empty, so `similarity_matrix` is never touched in that case.
pair_lines = [
    f"{file_names[i]} <--> {file_names[j]} : {similarity_matrix[i, j]:.2f}"
    for i in range(len(file_names))
    for j in range(i + 1, len(file_names))
    if similarity_matrix[i, j] > report_threshold
]
found = bool(pair_lines)
report_lines.extend(pair_lines)

if not found:
    report_lines.append("No highly similar files found above the threshold.")

# Save the report to a .txt file. The encoding is explicit because the header
# contains a non-ASCII character and the locale default is not guaranteed to be UTF-8.
with open("similarity_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines))

# Offer download
from google.colab import files
files.download("similarity_report.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>