In [36]:
import os

# The corpus lives in an "ngram" folder under the current working directory.
cwd = os.getcwd()
# os.listdir returns entries in arbitrary order; sort them so documents are
# always processed (and later compared pairwise) in a stable, name-sorted order.
doc_list = sorted(os.listdir(cwd + "/ngram"))


In [37]:
# Map each document id (the file name up to its first ".") to the first line
# of that file.
docs = {}
for doc in doc_list:
    # An explicit encoding keeps the result independent of the platform's
    # locale default.
    with open(cwd + "/ngram/" + doc, encoding="utf-8") as f:
        # readline() keeps the line terminator; strip it so only the sentence
        # text is stored.
        docs[doc.split(".")[0]] = f.readline().rstrip("\r\n")

docs

{'D1': 'I am Sam',
 'D2': 'Sam I am',
 'D3': 'I do not like green eggs and ham',
 'D4': 'I do not like them, Sam I am'}

In [38]:
import nltk

# Fetch NLTK's "punkt" data package. Presumably required by word_tokenize,
# which is used below -- TODO confirm for the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mssel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
from nltk.tokenize import word_tokenize

# Tokenize every document's sentence, keeping the document ids as keys.
after_tokenized = {
    doc_id: word_tokenize(text) for doc_id, text in docs.items()
}

after_tokenized

{'D1': ['I', 'am', 'Sam'],
 'D2': ['Sam', 'I', 'am'],
 'D3': ['I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham'],
 'D4': ['I', 'do', 'not', 'like', 'them', ',', 'Sam', 'I', 'am']}

In [40]:
from nltk.util import ngrams

def generate_ngrams(sentence, k):
    """Return the k-grams of ``sentence`` as a list of tuples.

    Args:
        sentence: The items to slide the window over; in this notebook it is
            called with a list of word tokens.
        k: Window size, i.e. the number of items per n-gram.

    Returns:
        A list holding every tuple yielded by ``nltk.util.ngrams`` for the
        given input, in the order they are produced.
    """
    grams = ngrams(sentence, k)
    return [*grams]

k_values = [1, 2, 3]

# For each window size, print a header followed by one line per document
# listing that document's k-grams, then a blank separator.
for k in k_values:
    lines = [f"K-Value: {k} -------------------"]
    lines.extend(
        f"{doc}: {generate_ngrams(tokenized_sentence, k)}"
        for doc, tokenized_sentence in after_tokenized.items()
    )
    print("\n".join(lines))
    print("\n")

K-Value: 1 -------------------
D1: [('I',), ('am',), ('Sam',)]
D2: [('Sam',), ('I',), ('am',)]
D3: [('I',), ('do',), ('not',), ('like',), ('green',), ('eggs',), ('and',), ('ham',)]
D4: [('I',), ('do',), ('not',), ('like',), ('them',), (',',), ('Sam',), ('I',), ('am',)]


K-Value: 2 -------------------
D1: [('I', 'am'), ('am', 'Sam')]
D2: [('Sam', 'I'), ('I', 'am')]
D3: [('I', 'do'), ('do', 'not'), ('not', 'like'), ('like', 'green'), ('green', 'eggs'), ('eggs', 'and'), ('and', 'ham')]
D4: [('I', 'do'), ('do', 'not'), ('not', 'like'), ('like', 'them'), ('them', ','), (',', 'Sam'), ('Sam', 'I'), ('I', 'am')]


K-Value: 3 -------------------
D1: [('I', 'am', 'Sam')]
D2: [('Sam', 'I', 'am')]
D3: [('I', 'do', 'not'), ('do', 'not', 'like'), ('not', 'like', 'green'), ('like', 'green', 'eggs'), ('green', 'eggs', 'and'), ('eggs', 'and', 'ham')]
D4: [('I', 'do', 'not'), ('do', 'not', 'like'), ('not', 'like', 'them'), ('like', 'them', ','), ('them', ',', 'Sam'), (',', 'Sam', 'I'), ('Sam', 'I', 'am')]

In [41]:
def jaccard_coefficient(set1: set, set2: set) -> float:
    """Compute the Jaccard coefficient |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: First set of items (k-gram tuples in this notebook).
        set2: Second set of items.

    Returns:
        A float in [0.0, 1.0]. When both sets are empty the ratio is 0/0;
        by convention two empty sets are treated as identical and 1.0 is
        returned instead of raising ``ZeroDivisionError``.
    """
    intersection = set1.intersection(set2)
    union = set1.union(set2)

    # An empty union means both inputs are empty (e.g. k exceeds the length
    # of both documents); avoid dividing by zero.
    if not union:
        return 1.0

    return len(intersection) / len(union)

In [58]:
k_values = [1, 2, 3]

# For every window size, compare each unordered pair of documents by the
# Jaccard coefficient of their k-gram sets.
for k in k_values:
    print(f"Jaccard coefficient for k = {k}----------")
    # Keep the document ids next to their k-gram sets so each pair is labelled
    # with the real ids instead of positions assumed to match "D1", "D2", ...
    names = list(after_tokenized)
    kgrams = [set(generate_ngrams(after_tokenized[name], k)) for name in names]

    # i < j visits each unordered pair exactly once.
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            jc = jaccard_coefficient(kgrams[i], kgrams[j])
            print(f"{names[i]} and {names[j]}: {jc}")
    print('\n')

Jaccard coefficient for k = 1----------
D1 and D2: 1.0
D1 and D3: 0.1
D1 and D4: 0.375
D2 and D3: 0.1
D2 and D4: 0.375
D3 and D4: 0.3333333333333333


Jaccard coefficient for k = 2----------
D1 and D2: 0.3333333333333333
D1 and D3: 0.0
D1 and D4: 0.1111111111111111
D2 and D3: 0.0
D2 and D4: 0.25
D3 and D4: 0.25


Jaccard coefficient for k = 3----------
D1 and D2: 0.0
D1 and D3: 0.0
D1 and D4: 0.0
D2 and D3: 0.0
D2 and D4: 0.14285714285714285
D3 and D4: 0.18181818181818182


