In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
student_files = [doc for doc in os.listdir() if doc.endswith('.txt')]
student_notes = [open(_file, encoding='utf-8').read()
                 for _file in student_files]


In [3]:
student_files

['fatma.txt', 'john.txt', 'juma.txt', 'requirements.txt']

In [4]:
student_notes

['Life is all about doing your best in trying to\nfind what works out for you and taking most time in\ntrying to pursue those skills ',
 'Life is all about finding money and spending on luxury stuffs\nCoz this life is kinda short , trust ',
 'Life to me is about finding money and use it on things that makes you happy\ncoz this life is kinda short ',
 'scikit_learn==0.24.2\n']

In [6]:
def vectorize(Text): 
    return TfidfVectorizer().fit_transform(Text).toarray()
def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])

In [7]:
vectors = vectorize(student_notes)
s_vectors = list(zip(student_files, vectors))
plagiarism_results = set()

In [8]:
vectors

array([[0.        , 0.12202937, 0.15073054, 0.12202937, 0.19118258,
        0.        , 0.19118258, 0.19118258, 0.        , 0.19118258,
        0.        , 0.38236516, 0.12202937, 0.        , 0.        ,
        0.12202937, 0.        , 0.        , 0.        , 0.        ,
        0.19118258, 0.        , 0.19118258, 0.19118258, 0.        ,
        0.        , 0.19118258, 0.        , 0.        , 0.19118258,
        0.        , 0.        , 0.        , 0.19118258, 0.19118258,
        0.30146109, 0.        , 0.38236516, 0.        , 0.19118258,
        0.19118258, 0.15073054, 0.19118258],
       [0.        , 0.17671088, 0.21827309, 0.17671088, 0.        ,
        0.21827309, 0.        , 0.        , 0.21827309, 0.        ,
        0.        , 0.        , 0.35342175, 0.        , 0.21827309,
        0.35342175, 0.27685174, 0.        , 0.        , 0.21827309,
        0.        , 0.21827309, 0.        , 0.        , 0.        ,
        0.21827309, 0.        , 0.27685174, 0.27685174, 0.        ,
   

In [9]:
s_vectors

[('fatma.txt',
  array([0.        , 0.12202937, 0.15073054, 0.12202937, 0.19118258,
         0.        , 0.19118258, 0.19118258, 0.        , 0.19118258,
         0.        , 0.38236516, 0.12202937, 0.        , 0.        ,
         0.12202937, 0.        , 0.        , 0.        , 0.        ,
         0.19118258, 0.        , 0.19118258, 0.19118258, 0.        ,
         0.        , 0.19118258, 0.        , 0.        , 0.19118258,
         0.        , 0.        , 0.        , 0.19118258, 0.19118258,
         0.30146109, 0.        , 0.38236516, 0.        , 0.19118258,
         0.19118258, 0.15073054, 0.19118258])),
 ('john.txt',
  array([0.        , 0.17671088, 0.21827309, 0.17671088, 0.        ,
         0.21827309, 0.        , 0.        , 0.21827309, 0.        ,
         0.        , 0.        , 0.35342175, 0.        , 0.21827309,
         0.35342175, 0.27685174, 0.        , 0.        , 0.21827309,
         0.        , 0.21827309, 0.        , 0.        , 0.        ,
         0.21827309, 0.   

In [10]:
plagiarism_results

set()

In [11]:
def check_plagiarism():
    global s_vectors
    for student_a, text_vector_a in s_vectors:
        new_vectors = s_vectors.copy()
        current_index = new_vectors.index((student_a, text_vector_a))
        del new_vectors[current_index]
        for student_b, text_vector_b in new_vectors:
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            student_pair = sorted((student_a, student_b))
            score = (student_pair[0], student_pair[1], sim_score)
            plagiarism_results.add(score)
    return plagiarism_results


In [12]:
for data in check_plagiarism():
    print(data)

('fatma.txt', 'juma.txt', 0.20179089793739657)
('fatma.txt', 'requirements.txt', 0.0)
('juma.txt', 'requirements.txt', 0.0)
('fatma.txt', 'john.txt', 0.16228391831223246)
('john.txt', 'requirements.txt', 0.0)
('john.txt', 'juma.txt', 0.5713243251172899)
