<a href="https://colab.research.google.com/github/parassetia889/docs-sematic-similarity/blob/main/similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers
!pip install -U sentence-transformers
!pip install PyPDF2
!pip install pandas
!pip install numpy
!pip install xlrd

In [None]:
import PyPDF2
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def extract_content(file_path, format):
    """Return the text content of a PDF or plain-text file.

    Args:
        file_path: Path of the file to read.
        format: File type without the leading dot, ``"pdf"`` or ``"txt"``.
            String values are matched case-insensitively.

    Returns:
        The file's text as a single string. For a PDF this is the extracted
        text of every page, concatenated in page order with no separator.

    Raises:
        ValueError: If ``format`` is neither ``"pdf"`` nor ``"txt"``.
    """
    # ``format`` shadows the builtin, but the name is part of the signature,
    # so it is kept and only normalised into a local here.
    kind = format.lower() if isinstance(format, str) else format

    if kind == "pdf":
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # ``or ""`` keeps a page that yields no text (None or empty)
            # from breaking the join; join avoids repeated string copies.
            return "".join(page.extract_text() or "" for page in reader.pages)

    if kind == "txt":
        # Explicit encoding so the result does not depend on the platform
        # locale's default.
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()

    raise ValueError(f"Unsupported file format: {format}")



def split_text_into_chunks(text, chunk_size=200):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` items.

    The pieces are plain slices taken at fixed offsets (0, chunk_size,
    2 * chunk_size, ...), so only the final piece may be shorter than
    ``chunk_size``. An empty ``text`` produces an empty list.
    """
    offsets = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in offsets]


def encode_and_combine_chunks(chunks, model):
    """Embed every chunk with ``model`` and average them into one vector.

    Args:
        chunks: Non-empty sequence of text chunks.
        model: Object exposing ``encode(chunks)`` that returns one embedding
            per chunk (``compare_documents`` passes a SentenceTransformer).

    Returns:
        The element-wise mean of the chunk embeddings, taken across chunks
        (axis 0).

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message: averaging zero embeddings would
    # otherwise produce NaN and only surface later as an obscure error.
    if len(chunks) == 0:
        raise ValueError("Cannot encode an empty list of chunks")

    chunk_embeddings = model.encode(chunks)
    return np.mean(chunk_embeddings, axis=0)


def _file_format(path):
    """Return the lowercase extension of ``path`` without the leading dot."""
    import os  # local import: os is not among the file's top-level imports

    return os.path.splitext(path)[1].lstrip(".").lower()


def compare_documents(file1_path, file2_path, chunk_size=100):
    """Return the semantic similarity of two documents as a percentage.

    Each document is read, cut into ``chunk_size``-character chunks, embedded
    chunk by chunk, and averaged into one vector; the score is the cosine
    similarity of the two averaged vectors. The lengths of both documents and
    the elapsed time (model loading included) are printed.

    Args:
        file1_path: Path of the first document (``.pdf`` or ``.txt``).
        file2_path: Path of the second document (``.pdf`` or ``.txt``).
        chunk_size: Number of characters per chunk.

    Returns:
        Cosine similarity multiplied by 100 and rounded to two decimals,
        as a builtin ``float``.

    Raises:
        ValueError: If either file's extension is not supported by
            ``extract_content``.
    """
    start_time = time.time()

    # Alternative models that can be swapped in:
    #   "sentence-transformers/msmarco-distilbert-base-v2"
    #   "all-mpnet-base-v2"
    #   "BAAI/bge-base-en-v1.5"
    model = SentenceTransformer("paraphrase-distilroberta-base-v2")

    # Resolve the format of each file on its own: the two documents need not
    # share a type, and a path may contain dots outside the extension.
    doc1_content = extract_content(file1_path, _file_format(file1_path))
    doc2_content = extract_content(file2_path, _file_format(file2_path))

    # Split text into chunks
    chunk1 = split_text_into_chunks(doc1_content, chunk_size)
    chunk2 = split_text_into_chunks(doc2_content, chunk_size)

    print(f"doc1_content : {len(doc1_content)}")
    print(f"doc2_content : {len(doc2_content)}")

    # Encode chunks and combine embeddings
    content1_embedding = encode_and_combine_chunks(chunk1, model)
    content2_embedding = encode_and_combine_chunks(chunk2, model)

    # cosine_similarity works on 2-D inputs, hence the (1, dim) reshape.
    similarity = cosine_similarity(
        content1_embedding.reshape(1, -1),
        content2_embedding.reshape(1, -1),
    )[0][0]

    # Convert to a builtin float before rounding so callers do not receive
    # a NumPy scalar.
    similarity_percentage = round(float(similarity) * 100, 2)

    elapsed_time = time.time() - start_time
    print("Time Taken: {:.4f} seconds".format(elapsed_time))
    return similarity_percentage

# Sample run; guarded so that importing this code does not load the model
# and read files as a side effect. A notebook cell runs as "__main__".
if __name__ == "__main__":
    file1_path = "/content/Hist.txt"
    file2_path = "/content/Moby.txt"
    print("Similarity Score : ", compare_documents(file1_path, file2_path), "\n")


