In [9]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

# Collect every regular .txt file in the current directory. The names are
# sorted because os.listdir() returns entries in arbitrary order, and
# directories are skipped because open() would fail on them.
# NOTE(review): this also matches non-note files such as requirements.txt when
# one sits in the same directory -- confirm whether those should be excluded.
student_files = sorted(
    doc
    for doc in os.listdir()
    if doc.endswith('.txt') and os.path.isfile(doc)
)

# Ensure we have files to process
if not student_files:
    print("No .txt files found in the current directory.")
else:
    # Read each file inside a context manager so its handle is closed as soon
    # as the content has been loaded.
    student_notes = []
    for _file in student_files:
        with open(_file, encoding='utf-8') as note_file:
            student_notes.append(note_file.read())

    # Print the content of each file for debugging
    for file_name, note in zip(student_files, student_notes):
        print(f"Content of {file_name}:\n{note}\n")

    def vectorize(text):
        """Return a dense binary bag-of-words matrix for the given documents.

        ``text`` is the collection of document strings handed to
        ``CountVectorizer``. The vectorizer is created with ``binary=True``
        and its sparse output is converted to a dense array with
        ``toarray()``.
        """
        # fit_transform learns the vocabulary and encodes the documents in one
        # pass instead of processing the same corpus twice.
        return CountVectorizer(binary=True).fit_transform(text).toarray()

    def jaccard_similarity(doc1, doc2):
        """Score two document vectors with scikit-learn's ``jaccard_score``."""
        score = jaccard_score(y_true=doc1, y_pred=doc2, average='binary')
        return score

    # Encode every note as a binary vector and pair each row with its file name
    vectors = vectorize(student_notes)
    s_vectors = [*zip(student_files, vectors)]

    # Debug aid: show the encoded row for each file
    for file_name, row in s_vectors:
        print(f"Vector for {file_name}:\n{row}\n")

    # Collects the (file, file, score) tuples produced by check_similarity()
    similarity_results = set()

    def check_similarity():
        """Score every ordered pair of distinct entries in ``s_vectors``.

        Reads the module-level ``s_vectors`` list of ``(file name, vector)``
        tuples, prints the Jaccard similarity for each pairing, and adds a
        ``(name_1, name_2, score)`` tuple -- names in sorted order -- to the
        module-level ``similarity_results`` set.

        Returns:
            The ``similarity_results`` set itself.
        """
        for index_a, (student_a, text_vector_a) in enumerate(s_vectors):
            for index_b, (student_b, text_vector_b) in enumerate(s_vectors):
                # Skip the self-pair by position. Looking the entry up with
                # list.index() would compare tuples that hold NumPy arrays,
                # which is fragile and costs a copy plus a scan per entry.
                if index_a == index_b:
                    continue
                sim_score = jaccard_similarity(text_vector_a, text_vector_b)
                # Both (a, b) and (b, a) are visited; sorting the names lets
                # the set keep one entry per pair when the scores are equal.
                first_name, second_name = sorted((student_a, student_b))
                similarity_results.add((first_name, second_name, sim_score))
                print(f"Jaccard similarity between {student_a} and {student_b}: {sim_score}")
        return similarity_results

    # Print the similarity results. The results live in a set, whose iteration
    # order is arbitrary, so they are sorted to keep the report stable.
    results = check_similarity()
    if results:
        for data in sorted(results):
            print(data)
    else:
        print("No similarity results found.")


Content of fatma.txt:
Life is all about doing your best in trying to
find what works out for you and taking most time in
trying to pursue those skills 

Content of john.txt:
Life is all about finding money and spending on luxury stuffs
Coz this life is kinda short , trust 

Content of juma.txt:
Life to me is about finding money and use it on things that makes you happy
coz this life is kinda short 

Content of requirements.txt:
scikit_learn==0.24.2


Vector for fatma.txt:
[0 1 1 1 1 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 1 1 1 0
 1 0 1 1 1 1]

Vector for john.txt:
[0 1 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1
 0 0 0 0 0 0]

Vector for juma.txt:
[0 1 0 1 0 1 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0
 0 1 0 0 1 0]

Vector for requirements.txt:
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]

Jaccard similarity between fatma.txt and john.txt: 0.14705882352941177
Jaccard similarity be