# In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
import requests
import io
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

def preprocess(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines and tabs count as whitespace like any other, and leading and
    trailing whitespace is dropped.
    """
    # str.split() with no separator breaks on any whitespace run (spaces,
    # tabs, newlines) and discards empty pieces.
    words = text.split()
    return ' '.join(words)

def extract_pdf_text(filename):
    """Return the text of every page of a PDF file, concatenated in order.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text extracted from each page, joined in page order with no
        separator. A page that yields no text contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
        Any error raised by PyPDF2 while parsing the file propagates
        unchanged.
    """
    with open(filename, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the ``with`` block so the reader still has
        # an open file to pull page data from.
        # ``or ""`` keeps the join safe if a page yields no text at all
        # (e.g. an image-only page).
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)

def extract_web_text(url, timeout=10):
    """Download a web page and return its text with whitespace normalized.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the caller forever.

    Returns:
        The text produced by BeautifulSoup's ``get_text()`` for the page,
        passed through ``preprocess``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of comparing against an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return preprocess(soup.get_text())

def detect_plagiarism(text1, text2, threshold=0.8):
    """Label two texts as plagiarized or not from their TF-IDF similarity.

    Both texts are vectorized together with ``TfidfVectorizer`` and compared
    with cosine similarity.

    Args:
        text1: First text to compare.
        text2: Second text to compare.
        threshold: Similarity strictly above this value is labelled
            "Plagiarized". Defaults to 0.8.

    Returns:
        A ``(label, similarity)`` tuple where ``label`` is "Plagiarized" or
        "Not Plagiarized" and ``similarity`` is the cosine similarity as a
        float.

    Raises:
        Any error raised by the vectorizer (for instance when it cannot
        build a vocabulary from the inputs) propagates unchanged.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    # Entry [0, 1] of the pairwise matrix is text1 versus text2; convert the
    # NumPy scalar to a plain float for callers that display or serialize it.
    similarity = float(cosine_similarity(vectors)[0, 1])
    label = "Plagiarized" if similarity > threshold else "Not Plagiarized"
    return label, similarity

def check_plagiarism():
    """Handle the "Check Plagiarism" button: compare the PDF with the URL.

    Reads the PDF path and web URL from the module-level entry widgets,
    extracts text from both, runs ``detect_plagiarism`` and writes the
    outcome into ``result_label``. Problems are reported to the user through
    an error dialog rather than raised.
    """
    # Strip surrounding whitespace so blank-looking input is rejected here
    # and a stray space or newline from a paste does not break the path/URL.
    pdf_file = pdf_entry.get().strip()
    web_url = url_entry.get().strip()

    if not pdf_file or not web_url:
        messagebox.showerror("Error", "Please provide both a PDF file and a web URL.")
        return

    # This is the GUI's top-level boundary: any failure (missing file,
    # network error, parsing error) is shown to the user instead of
    # propagating into the Tk event loop.
    try:
        pdf_text = extract_pdf_text(pdf_file)
        web_text = extract_web_text(web_url)

        label, similarity = detect_plagiarism(pdf_text, web_text)

        result_label.config(text=f"Label: {label}, Similarity: {similarity}")
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")

def browse_file():
    """Handle the "Browse" button: pick a file and put its path in the entry.

    Opens a file-selection dialog and replaces the contents of ``pdf_entry``
    with the chosen path. If the dialog is cancelled the entry is left
    untouched.
    """
    filename = filedialog.askopenfilename()
    # A cancelled dialog returns an empty value; keep whatever path the
    # user had already entered instead of wiping it.
    if not filename:
        return
    pdf_entry.delete(0, tk.END)
    pdf_entry.insert(0, filename)

# --- Main window -----------------------------------------------------------
window = tk.Tk()
window.title("Plagiarism Checker")

# --- Widget creation (same order as they appear on screen) -----------------
# PDF row: caption, path entry, and a button that opens the file dialog.
pdf_label = tk.Label(window, text="PDF File:")
pdf_entry = tk.Entry(window, width=50)
browse_button = tk.Button(window, text="Browse", command=browse_file)

# URL row: caption and address entry.
url_label = tk.Label(window, text="Web URL:")
url_entry = tk.Entry(window, width=50)

# Action button and the label that receives the comparison result.
check_button = tk.Button(window, text="Check Plagiarism", command=check_plagiarism)
result_label = tk.Label(window, text="")

# --- Layout: stack everything vertically, top to bottom --------------------
pdf_label.pack()
pdf_entry.pack()
browse_button.pack()
url_label.pack()
url_entry.pack()
check_button.pack()
result_label.pack()

# --- Hand control to Tk until the window is closed -------------------------
window.mainloop()


# Output: Label: Not Plagiarized, Similarity: 0.015518659364925651
