In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

student_files = [doc for doc in os.listdir() if doc.endswith('.txt')]
student_notes = [open(_file, encoding='utf-8').read() for _file in student_files]

def vectorize(Text): return TfidfVectorizer().fit_transform(Text).toarray()
def similarity(doc1, doc2): return cosine_similarity([doc1, doc2])

vectors = vectorize(student_notes)
s_vectors = list(zip(student_files, vectors))
plagiarism_results = set()

def check_plagiarism():
    global s_vectors
    for student_a, text_vector_a in s_vectors:
        new_vectors = s_vectors.copy()
        current_index = new_vectors.index((student_a, text_vector_a))
        del new_vectors[current_index]
        for student_b, text_vector_b in new_vectors:
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            student_pair = sorted((student_a, student_b))
            score = (student_pair[0], student_pair[1], sim_score)
            plagiarism_results.add(score)
    return plagiarism_results

for data in check_plagiarism():
    print(data)


('doan_1.txt', 'doan_2.txt', 0.7479716320477551)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def vectorize(Text): 
    return TfidfVectorizer().fit_transform(Text).toarray()

def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])

# Đọc nội dung từ hai tệp tin văn bản
document1_text = open("doan_7.txt", encoding='utf-8').read()
document2_text = open("doan_8.txt", encoding='utf-8').read()

# Chuyển đổi đoạn văn thành vector
vectors = vectorize([document1_text, document2_text])

# So sánh tương đồng giữa hai đoạn văn
similarity_score = similarity(vectors[0], vectors[1])[0][1]

print("Tương đồng:", similarity_score)


Tương đồng: 0.12905569378720508


# Test với url

In [None]:
import requests
import nltk
from underthesea import word_tokenize
from bs4 import BeautifulSoup

# URL của trang web
url = "https://vnexpress.net/"

# Sử dụng requests để tải nội dung trang web
response = requests.get(url)

# Kiểm tra xem việc tải nội dung thành công hay không
if response.status_code == 200:
    # Lấy nội dung HTML của trang web
    html = response.text

    # Tạo một đối tượng BeautifulSoup để phân tích nội dung HTML
    soup = BeautifulSoup(html, "html.parser")

    # Trích xuất văn bản từ trang web và loại bỏ các đoạn trống
    text = '\n'.join([line.strip() for line in soup.stripped_strings])


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline

def vectorize(Text): 
    return TfidfVectorizer().fit_transform(Text).toarray()

def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])

def calculate_similarity_score_bert(text1, text2):
    # Sử dụng DistilBERT để tính toán độ tương đồng giữa hai đoạn văn
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=1)
    similarity_analyzer = pipeline('text-classification', model=model, tokenizer=tokenizer)

    # Tính toán độ tương đồng
    similarity_score = similarity_analyzer(text1, text2)[0]['score']

    return similarity_score

def check_plagiarism_with_tfidf_bert(text1, text2, threshold_tfidf=0.8, threshold_bert=0.8):
    # Vector hóa dữ liệu bằng TF-IDF
    vectors_tfidf = vectorize([text1, text2])

    # Tính toán độ tương đồng với TF-IDF
    similarity_score_tfidf = similarity(vectors_tfidf[0], vectors_tfidf[1])[0][1]

    # Tính toán độ tương đồng với DistilBERT
    similarity_score_bert = calculate_similarity_score_bert(text1, text2)

    print("Tương đồng TF-IDF:", similarity_score_tfidf)
    print("Tương đồng DistilBERT:", similarity_score_bert)

    # Kết hợp kết quả từ cả hai phương pháp
    combined_score = (similarity_score_tfidf + similarity_score_bert) / 2

    return combined_score >= max(threshold_tfidf, threshold_bert)

# Đọc nội dung từ hai tệp tin văn bản
document1_text = open("doan_1.txt", encoding='utf-8').read()
document2_text = open("doan_2.txt", encoding='utf-8').read()

# Kiểm tra đạo văn
is_plagiarized = check_plagiarism_with_tfidf_bert(document1_text, document2_text)
print("Is Plagiarized:", is_plagiarized)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Ignoring args : ('The rapid development of artificial intelligence (AI) has become a defining feature of our technological landscape. AI applications, ranging from virtual assistants to self-driving cars, have become an integral part of our daily lives. Machine learning, a key component of AI, empowers computers to learn and make decisions based on data, contributing to the advancement of various fields.\n\nNatural language processing (NLP) is an area where AI has shown remarkable progress. NLP algorithms enable machines to understand, interpret, and generate human-like language, facilitating improvements in language translation

Tương đồng TF-IDF: 0.7479716320477551
Tương đồng DistilBERT: 0.4788822531700134
Is Plagiarized: False
