<a href="https://colab.research.google.com/github/parassetia889/docs-sematic-similarity/blob/main/similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers
!pip install -U sentence-transformers
!pip install PyPDF2
!pip install pandas
!pip install numpy
!pip install xlrd



In [None]:
import PyPDF2
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import xlrd

def extract_content(file_path, format):
    """Read a document from disk and return its text as one string.

    Args:
        file_path: Path of the file to read.
        format: File type without the leading dot: "pdf", "xls" or "txt".

    Returns:
        str: The extracted text. PDF pages are concatenated in page order.
        For "xls", every truthy cell of every sheet is appended row by row,
        each followed by a single space. "txt" files are decoded as UTF-8.

    Raises:
        ValueError: If ``format`` is not one of the supported types.
    """
    if format == "pdf":
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are read while the file is still open. `or ""` keeps the
            # join safe in case extraction yields None for a page.
            return "".join(page.extract_text() or "" for page in reader.pages)
    elif format == "xls":
        workbook = xlrd.open_workbook(file_path)
        parts = []
        for sheet_name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(sheet_name)
            for row_idx in range(sheet.nrows):
                for col_idx in range(sheet.ncols):
                    cell_value = sheet.cell_value(row_idx, col_idx)
                    # Falsy values are skipped: empty cells, but also a numeric 0.
                    if cell_value:
                        parts.append(str(cell_value) + " ")
        return "".join(parts)
    elif format == 'txt':
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    else:
        # One formatted message; passing two arguments to ValueError would
        # render the error as a tuple.
        raise ValueError(f"Unsupported file format: {format}")


def split_text_into_chunks(text, chunk_size=500):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per slice (default 500).

    Returns:
        list[str]: The slices in order; only the last one may be shorter
        than ``chunk_size``. An empty ``text`` gives an empty list.
    """
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


def encode_and_combine_chunks(chunks, model):
    """Embed every chunk with ``model`` and average the results into one vector.

    Args:
        chunks: Sequence of text chunks to embed; must not be empty.
        model: Object exposing ``encode(chunks)`` that returns one embedding
            per chunk (for example a SentenceTransformer).

    Returns:
        The mean of the chunk embeddings taken over axis 0.

    Raises:
        ValueError: If ``chunks`` is empty. Averaging zero embeddings would
            give NaN and only fail later, inside the similarity computation.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot embed a document with no text chunks")
    chunk_embeddings = model.encode(chunks)
    return np.mean(chunk_embeddings, axis=0)  # Average embeddings


def compare_documents(file1_path, file2_path, chunk_size=500):
    """Print the cosine similarity between the averaged embeddings of two files.

    Each file is read with ``extract_content``, split into ``chunk_size``
    character chunks and embedded with the
    "paraphrase-distilroberta-base-v2" model. The chunk embeddings of each
    document are averaged and the two averages compared by cosine similarity.

    Args:
        file1_path: Path of the first document.
        file2_path: Path of the second document.
        chunk_size: Characters per chunk passed to the splitter (default 500).

    Returns:
        None. The start time, a progress message, the elapsed time and the
        similarity percentage are printed.

    Raises:
        ValueError: Propagated from ``extract_content`` when a file's
            extension is not a supported format.
    """
    # Local import so this cell does not depend on another cell's import block.
    import os

    start_time = time.time()
    print(f"Start Time : {start_time}")
    model = SentenceTransformer("paraphrase-distilroberta-base-v2")  # Use paraphrase model for long text

    # Work out each file's format from its own extension. splitext only looks
    # at the final path component, so dotted directories and names such as
    # "report.v2.pdf" resolve correctly. The two files may differ in format.
    doc1_format = os.path.splitext(file1_path)[1].lstrip(".").lower()
    doc2_format = os.path.splitext(file2_path)[1].lstrip(".").lower()

    # Extract text from both documents
    doc1_content = extract_content(file1_path, doc1_format)
    doc2_content = extract_content(file2_path, doc2_format)

    print("content fetched")
    # Split text into chunks
    chunk1 = split_text_into_chunks(doc1_content, chunk_size)
    chunk2 = split_text_into_chunks(doc2_content, chunk_size)

    # Encode chunks and combine embeddings
    content1_embedding = encode_and_combine_chunks(chunk1, model)
    content2_embedding = encode_and_combine_chunks(chunk2, model)

    # Calculate cosine similarity; reshape turns each vector into a 1-row matrix
    similarity = cosine_similarity(content1_embedding.reshape(1, -1), content2_embedding.reshape(1, -1))[0][0]

    # Convert to a percentage rounded to two decimals
    similarity_percentage = round(similarity * 100, 2)

    # The elapsed time includes loading the model above.
    elapsed_time = time.time() - start_time
    print("Time Taken: {:.4f} seconds".format(elapsed_time))
    print(f"Similarity between documents: {similarity_percentage:.2f}%")


# Example usage
# Three alternative input pairs are kept here; only the .txt pair is active.
# pdf1_path = "/content/file1.pdf"
# pdf2_path = "/content/file2.pdf"

# compare_documents(pdf1_path, pdf2_path)


# NOTE: despite the pdf*_path names, these two paths point at plain-text files.
pdf1_path = "/content/Hist.txt"
pdf2_path = "/content/Moby.txt"

# Prints the timing and the similarity percentage; nothing is returned.
compare_documents(pdf1_path, pdf2_path)

# file1_path = "/content/file1.xls"
# file2_path = "/content/file2.xls"

# compare_documents(file1_path, file2_path)


Start Time : 1707904763.7720938
content fetched
Time Taken: 51.7560 seconds
Similarity between documents: 64.00%


In [None]:
import PyPDF2
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def extract_content(file_path, format):
    """Return the text of a PDF or plain-text file as a single string.

    Args:
        file_path: Path of the file to read.
        format: File type without the leading dot: "pdf" or "txt".

    Returns:
        str: For "pdf", the text of all pages concatenated in page order.
        For "txt", the file contents decoded as UTF-8.

    Raises:
        ValueError: If ``format`` is neither "pdf" nor "txt".
    """
    if format == "pdf":
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are read while the file is still open. `or ""` keeps the
            # join safe in case extraction yields None for a page.
            return "".join(page.extract_text() or "" for page in reader.pages)
    elif format == "txt":
        # An explicit encoding keeps the result independent of the platform's
        # default locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    else:
        # The exception message already names the offending format, so no
        # separate debug print is needed.
        raise ValueError(f"Unsupported file format: {format}")



def split_text_into_chunks(text, chunk_size=200):
    """Slice ``text`` into back-to-back pieces of ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece (default 200).

    Returns:
        list[str]: The pieces in their original order; the final piece holds
        whatever remains and may be shorter. Empty input gives an empty list.
    """
    offsets = range(0, len(text), chunk_size)
    return [text[offset:offset + chunk_size] for offset in offsets]


def encode_and_combine_chunks(chunks, model):
    """Encode all chunks with ``model`` and collapse them into a mean embedding.

    Args:
        chunks: Sequence of text chunks to embed; must not be empty.
        model: Object exposing ``encode(chunks)`` that returns one embedding
            per chunk (for example a SentenceTransformer).

    Returns:
        The mean of the chunk embeddings taken over axis 0.

    Raises:
        ValueError: If ``chunks`` is empty. The mean of zero embeddings would
            be NaN and only surface later as an obscure similarity error.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot embed a document with no text chunks")
    chunk_embeddings = model.encode(chunks)
    combined_embedding = np.mean(chunk_embeddings, axis=0)  # Average embeddings
    return combined_embedding


def compare_documents(file1_path, file2_path, chunk_size=100):
    """Return the cosine similarity of two documents as a percentage.

    Each file is read with ``extract_content``, split into ``chunk_size``
    character chunks and embedded with the "all-mpnet-base-v2" model. The
    chunk embeddings of each document are averaged and the two averages
    compared by cosine similarity.

    Args:
        file1_path: Path of the first document.
        file2_path: Path of the second document.
        chunk_size: Characters per chunk passed to the splitter (default 100).

    Returns:
        The similarity multiplied by 100 and rounded to two decimals. The
        character count of each document and the elapsed time are printed.

    Raises:
        ValueError: Propagated from ``extract_content`` when a file's
            extension is not a supported format.
    """
    # Local import so this cell does not depend on another cell's import block.
    import os

    start_time = time.time()
    # Alternative models tried during experimentation:
    # model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v2")
    # model = SentenceTransformer("castorini/aggretriever-distilbert")
    # model = SentenceTransformer("paraphrase-distilroberta-base-v2")
    # model = SentenceTransformer("BAAI/bge-base-en-v1.5")
    # Only one model is loaded; a second load that is immediately overwritten
    # would just waste time.
    model = SentenceTransformer("all-mpnet-base-v2")

    # Work out each file's format from its own extension. splitext only looks
    # at the final path component, so dotted directories and names such as
    # "unique.ids.txt" resolve correctly. The two files may differ in format.
    doc1_format = os.path.splitext(file1_path)[1].lstrip(".").lower()
    doc2_format = os.path.splitext(file2_path)[1].lstrip(".").lower()

    # Extract text from both documents
    doc1_content = extract_content(file1_path, doc1_format)
    doc2_content = extract_content(file2_path, doc2_format)

    # Split text into chunks
    chunk1 = split_text_into_chunks(doc1_content, chunk_size)
    chunk2 = split_text_into_chunks(doc2_content, chunk_size)

    # Report the size of each document in characters
    print(f"doc1_content : {len(doc1_content)}")
    print(f"doc2_content : {len(doc2_content)}")
    # Encode chunks and combine embeddings
    content1_embedding = encode_and_combine_chunks(chunk1, model)
    content2_embedding = encode_and_combine_chunks(chunk2, model)

    # Calculate cosine similarity; reshape turns each vector into a 1-row matrix
    similarity = cosine_similarity(content1_embedding.reshape(1, -1), content2_embedding.reshape(1, -1))[0][0]

    # Convert to a percentage rounded to two decimals
    similarity_percentage = round(similarity * 100, 2)

    # The elapsed time includes loading the model above.
    elapsed_time = time.time() - start_time
    print("Time Taken: {:.4f} seconds".format(elapsed_time))
    return similarity_percentage


# Driver: score several document pairs. Disabled pairs are kept as comments.
# file1_path = "/content/3page.txt"
# file2_path = "/content/1page.txt"
# print("pdf to txt  Score : ", compare_documents(file1_path, file2_path), "\n")

# Pair 1: two PDF files.
file1_path = "/content/file1.pdf"
file2_path = "/content/file1_removed.pdf"
print("pdf  Score : ", compare_documents(file1_path, file2_path), "\n")
# # Example usage
# file1_path = "/content/hello.txt"
# file2_path = "/content/hello2.txt"
# print("hello txt Score : ", compare_documents(file1_path, file2_path), "\n")

# file1_path = "/content/unique_file_ids 1.txt"
# file2_path = "/content/unique_file_ids 2.txt"
# print("long txt Score : ", compare_documents(file1_path, file2_path), "\n")

# Pair 2: two plain-text files. file1_path/file2_path are reassigned here.
file1_path = "/content/Hist.txt"
file2_path = "/content/Moby.txt"
print("Novel txt Score : ", compare_documents(file1_path, file2_path), "\n")




doc1_content : 24111
doc2_content : 12302
Time Taken: 3.8867 seconds
pdf  Score :  98.68 

doc1_content : 1283
doc2_content : 34424
Time Taken: 4.3826 seconds
Novel txt Score :  47.32 

