In [1]:
! pip install -q sentence-transformers scikit-learn pypdf


In [11]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.document_loaders import PyPDFLoader
import os
import json


In [3]:
def extract_text_from_pdf(filepath):
    """Return the text of the PDF at ``filepath`` as one string.

    The file is read with ``PyPDFLoader``; the ``page_content`` of every
    document it returns is concatenated, separated by a single space.
    """
    page_texts = [page.page_content for page in PyPDFLoader(filepath).load()]
    return " ".join(page_texts)


In [6]:
# Whitepaper pair to compare: the known reference and the document under test.
reference_path, target_path = "docs/bitcoin (1).pdf", "docs/MyCoin.pdf"  # <-- swap the second path for your own file

# Pull the full text out of each PDF (reference first, then target).
reference_text = extract_text_from_pdf(filepath=reference_path)
target_text = extract_text_from_pdf(filepath=target_path)


In [9]:
# Paths
docs_folder = "projects"
references_folder = "references"

# Cosine similarity above this value marks an upload as a likely copy of a reference.
SIMILARITY_THRESHOLD = 0.85

# Load model once
model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): each document is embedded as a single string. Per its model card,
# all-MiniLM-L6-v2 truncates input beyond 256 word pieces, so the score presumably
# reflects only the opening of each PDF -- confirm this is acceptable.

# Collect PDFs case-insensitively and in sorted order (os.listdir order is arbitrary).
uploaded_docs = sorted(
    os.path.join(docs_folder, f) for f in os.listdir(docs_folder) if f.lower().endswith('.pdf')
)
reference_docs = sorted(
    os.path.join(references_folder, f) for f in os.listdir(references_folder) if f.lower().endswith('.pdf')
)

# Parse and embed every reference exactly once, instead of repeating the work for
# each uploaded document.
reference_embeds = {}
for ref_path in reference_docs:
    ref_text = extract_text_from_pdf(ref_path)
    if not ref_text.strip():
        # With no extracted text the embedding carries no information about the
        # document, so a score against it would be meaningless.
        print(f"WARNING: skipping reference {os.path.basename(ref_path)}: no extractable text.")
        continue
    reference_embeds[ref_path] = model.encode([ref_text])[0]

if not reference_embeds:
    print("WARNING: no usable reference documents found; nothing can be flagged.")

# Loop through each uploaded whitepaper
for upload_path in uploaded_docs:
    print(f"\nüßæ Checking: {os.path.basename(upload_path)}")

    uploaded_text = extract_text_from_pdf(upload_path)
    if not uploaded_text.strip():
        # Never report an unreadable upload as clean (or as a copy of another
        # unreadable file): it simply was not checked.
        print(f"WARNING: {os.path.basename(upload_path)} has no extractable text and cannot be checked.\n")
        continue
    uploaded_embed = model.encode([uploaded_text])[0]

    flagged = False

    for ref_path, ref_embed in reference_embeds.items():
        similarity = cosine_similarity([uploaded_embed], [ref_embed])[0][0]
        print(f"üîç Similarity with {os.path.basename(ref_path)}: {similarity:.4f}")

        if similarity > SIMILARITY_THRESHOLD:
            print(f"üö© High similarity detected with {os.path.basename(ref_path)} ‚Äî possible plagiarism!")
            flagged = True

    # One reference above the threshold is enough to reject the upload.
    if flagged:
        print(f"‚ùå {os.path.basename(upload_path)} is REJECTED due to high plagiarism risk.\n")
    else:
        print(f"‚úÖ {os.path.basename(upload_path)} is CLEAN ‚Äî ready for RAG ingestion.\n")



üßæ Checking: MyCoin.pdf
üîç Similarity with avalanche_whitepaper.pdf: 0.3754


incorrect startxref pointer(1)
parsing for Object Streams


üîç Similarity with bitcoin_whitepaper.pdf: 0.8752
üö© High similarity detected with bitcoin_whitepaper.pdf ‚Äî possible plagiarism!
üîç Similarity with cardano_whitepaper.pdf: 0.1215
üîç Similarity with ethereum_whitepaper.pdf: 0.4876


incorrect startxref pointer(3)
parsing for Object Streams


üîç Similarity with litecoin_whitepaper.pdf: 0.5029
üîç Similarity with polkadot_whitepaper.pdf: 0.1215
üîç Similarity with solana-whitepaper.pdf: 0.2716
‚ùå MyCoin.pdf is REJECTED due to high plagiarism risk.



In [12]:
# Paths
docs_folder = "docs"
references_folder = "references"
outputs_folder = "outputs"

# Cosine similarity above this value marks an upload as a likely copy of a reference.
SIMILARITY_THRESHOLD = 0.85

# Create outputs folder if missing
os.makedirs(outputs_folder, exist_ok=True)

# Load model once
model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): each document is embedded as a single string. Per its model card,
# all-MiniLM-L6-v2 truncates input beyond 256 word pieces, so the score presumably
# reflects only the opening of each PDF -- confirm this is acceptable.

# List uploaded and reference files, case-insensitively and in sorted order
# (os.listdir order is arbitrary).
uploaded_docs = sorted(
    os.path.join(docs_folder, f) for f in os.listdir(docs_folder) if f.lower().endswith('.pdf')
)
reference_docs = sorted(
    os.path.join(references_folder, f) for f in os.listdir(references_folder) if f.lower().endswith('.pdf')
)

# Parse and embed every reference exactly once, instead of repeating the work for
# each uploaded document.
reference_embeds = {}
for ref_path in reference_docs:
    ref_text = extract_text_from_pdf(ref_path)
    if not ref_text.strip():
        # With no extracted text the embedding carries no information about the
        # document, so a score against it would be meaningless.
        print(f"WARNING: skipping reference {os.path.basename(ref_path)}: no extractable text.")
        continue
    reference_embeds[ref_path] = model.encode([ref_text])[0]

if not reference_embeds:
    print("WARNING: no usable reference documents found; nothing can be flagged.")

# Check each uploaded whitepaper and write one JSON report per upload
for upload_path in uploaded_docs:
    upload_name = os.path.basename(upload_path)
    print(f"\nüßæ Checking: {upload_name}")

    report = {
        "project_file": upload_name,
        "comparisons": [],
        "overall_result": ""
    }

    uploaded_text = extract_text_from_pdf(upload_path)

    if not uploaded_text.strip():
        # Never report an unreadable upload as accepted (or as a copy of another
        # unreadable file): record that it was not checked.
        print(f"WARNING: {upload_name} has no extractable text and cannot be checked.\n")
        report["overall_result"] = "Not checked. No extractable text."
    else:
        uploaded_embed = model.encode([uploaded_text])[0]

        flagged = False

        for ref_path, ref_embed in reference_embeds.items():
            similarity = cosine_similarity([uploaded_embed], [ref_embed])[0][0]
            print(f"üîç Similarity with {os.path.basename(ref_path)}: {similarity:.4f}")

            report["comparisons"].append({
                "reference_file": os.path.basename(ref_path),
                # Convert to a built-in float so json.dump can serialise the score.
                "similarity_score": round(float(similarity), 4)
            })

            if similarity > SIMILARITY_THRESHOLD:
                flagged = True

        # One reference above the threshold is enough to reject the upload.
        if flagged:
            print(f"‚ùå {upload_name} is REJECTED due to high plagiarism risk.\n")
            report["overall_result"] = "Rejected due to high similarity."
        else:
            print(f"‚úÖ {upload_name} is CLEAN ‚Äî ready for RAG ingestion.\n")
            report["overall_result"] = "Accepted. No high similarity detected."

    # Save the report to outputs/
    output_path = os.path.join(outputs_folder, f"{os.path.splitext(upload_name)[0]}_plagiarism_report.json")
    with open(output_path, "w", encoding="utf-8") as report_file:
        json.dump(report, report_file, indent=4)

    print(f"üìÑ Report saved at: {output_path}")



üßæ Checking: bitcoin_whitepaper.pdf
üîç Similarity with avalanche_whitepaper.pdf: 0.4024


incorrect startxref pointer(1)
parsing for Object Streams


üîç Similarity with bitcoin_whitepaper.pdf: 1.0000
üîç Similarity with cardano_whitepaper.pdf: 0.1447


incorrect startxref pointer(3)


üîç Similarity with ethereum_whitepaper.pdf: 0.4696
üîç Similarity with litecoin_whitepaper.pdf: 0.4771


parsing for Object Streams


üîç Similarity with polkadot_whitepaper.pdf: 0.1447
üîç Similarity with solana-whitepaper.pdf: 0.2529
‚ùå bitcoin_whitepaper.pdf is REJECTED due to high plagiarism risk.

üìÑ Report saved at: outputs\bitcoin_whitepaper_plagiarism_report.json
