In [1]:
import requests
import io
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

# Function to preprocess text
def preprocess(text):
    # Replace newlines and tabs with spaces
    text = text.replace('\n', ' ').replace('\t', ' ')
    # Remove extra whitespaces
    text = ' '.join(text.split())
    return text

# Function to retrieve text from a PDF file
def extract_pdf_text(filename):
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text

# Function to retrieve text from a web page
def extract_web_text(url):
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()
    text = preprocess(text)
    return text

# Function to detect plagiarism between two texts
def detect_plagiarism(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    similarity = cosine_similarity(vectors)[0,1]
    if similarity > 0.8:
        return "Plagiarized", similarity
    else:
        return "Not Plagiarized", similarity

# Example usage
pdf_file = 'example.pdf'
pdf_text = extract_pdf_text(pdf_file)

web_url = 'https://www.gfg.org'
web_text = extract_web_text(web_url)

# Check for plagiarism
label, similarity = detect_plagiarism(pdf_text, web_text)
print(f"Label: {label}, Similarity: {similarity}")


Label: Not Plagiarized, Similarity: 0.015518659364925651
