In [5]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import nltk

# Fetch the "punkt" sentence-tokenizer data. nltk.download() checks the local
# nltk_data directory first and skips the transfer when the package is already
# up to date (presumably required by nltk.word_tokenize used later on).
# NOTE(review): recent NLTK releases are reported to look for "punkt_tab"
# instead of "punkt" - confirm against the installed NLTK version.
nltk.download("punkt")

# Fetch the averaged-perceptron POS tagger model (presumably what nltk.pos_tag
# loads). This is the last expression of the cell, so its return value is what
# the notebook displays as the cell output.
# NOTE(review): recent NLTK releases may expect "averaged_perceptron_tagger_eng"
# - confirm against the installed NLTK version.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
def get_pos_tags(sentences):
    """
    Tokenize and POS-tag every sentence.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, in input order; each entry
        is whatever nltk.pos_tag returns for that sentence's word tokens.
    """
    # Tokenize first, then tag; one tagged result per sentence.
    return [nltk.pos_tag(nltk.word_tokenize(text)) for text in sentences]

In [8]:
def get_grams(pos_tagged_sentences):
    """
    Build POS-tag 1-grams and 2-grams for each tagged sentence.

    Args:
        pos_tagged_sentences: iterable of sentences, where each sentence is an
            iterable of (word, tag) pairs.

    Returns:
        A dictionary keyed by "Sentence N" (1-based, in input order). Each value
        is a dictionary with:
            "1-grams": list of the sentence's tags, in order;
            "2-grams": list of (tag, next_tag) tuples for every pair of
                adjacent tokens (empty for sentences with fewer than two tokens).
    """
    grams_dict = {}
    for i, sentence in enumerate(pos_tagged_sentences):
        tags = [tag for _word, tag in sentence]
        # Pair each tag with its positional successor. Looking the token up by
        # value (list.index) would return the first occurrence and produce
        # wrong or extra bigrams whenever a (word, tag) pair repeats.
        bigrams = list(zip(tags, tags[1:]))
        grams_dict[f"Sentence {i+1}"] = {"1-grams": tags, "2-grams": bigrams}
    return grams_dict

In [43]:
def jaccard_similarity(set1, set2):
    """
    Compute the Jaccard similarity of two sets.

    Args:
        set1: first set.
        set2: second set.

    Returns:
        A tuple (jaccard_value, intersection_count, union_count), where
        jaccard_value is intersection_count / union_count, or 0 when the
        union is empty.
    """
    union_size = len(set1 | set2)
    # Count the members of set1 that also appear in set2.
    shared = sum(1 for member in set1 if member in set2)
    if not union_size:
        return 0, shared, union_size
    return shared / float(union_size), shared, union_size

In [44]:
def calculate_jaccard(grams_dict):
    """
    Print the Jaccard similarity of the 1-gram and 2-gram sets for every
    ordered pair of distinct sentences.

    Args:
        grams_dict: mapping whose values are dictionaries with "1-grams" and
            "2-grams" lists (the structure built by get_grams).

    Returns:
        None. For each ordered pair (so both "1 vs. 2" and "2 vs. 1") the
        similarity value plus the union and intersection counts are printed.
        Sentences are numbered by their 1-based position in grams_dict.
    """
    entries = list(grams_dict.values())
    for i, left in enumerate(entries):
        for j, right in enumerate(entries):
            # A sentence is never compared with itself.
            if i == j:
                continue

            # Duplicate n-grams collapse: similarity is computed on sets.
            score_1, common_1, total_1 = jaccard_similarity(
                set(left["1-grams"]), set(right["1-grams"])
            )
            score_2, common_2, total_2 = jaccard_similarity(
                set(left["2-grams"]), set(right["2-grams"])
            )

            print(f"Jaccard Similarity (Sentence {i+1} vs. Sentence {j+1}):")
            print(f"\t1-grams: {score_1}")
            print(f"\tUnion 1-gram: {total_1}, Intersection 1-gram: {common_1}")
            print(f"\t2-grams: {score_2}")
            print(f"\tUnion 2-gram: {total_2}, Intersection 2-gram: {common_2}")
            print("-------------------------------------------------------")

In [45]:
# Define the input sentences to compare.
# NOTE: "SOCRATES" is deliberately written in all capitals. Per the original
# author's observation, when the name is written as "Socrates" at the beginning
# of a sentence the tagger labels it NNS instead of NNP; the all-caps spelling
# is a workaround to get the proper-noun tag.
sentences = ["All men are mortal", "SOCRATES is a man", "SOCRATES is mortal"]

# Tokenize and POS-tag each sentence (one list of tagged tokens per sentence).
pos_tagged_sentences = get_pos_tags(sentences)

# Build the POS-tag 1-grams and 2-grams, keyed by "Sentence N".
grams_dict = get_grams(pos_tagged_sentences)

# Show the tag 1-grams and 2-grams of every sentence, followed by a separator.
print("POS Tags and Grams:")
for key, value in grams_dict.items():
    print(f"{key}:")
    print(f"\t1-grams: {value['1-grams']}")
    print(f"\t2-grams: {value['2-grams']}")
    print(
        "---------------------------------------------------------------------------------"
    )

# Print the Jaccard similarity (with union/intersection counts) for every
# ordered pair of distinct sentences.
calculate_jaccard(grams_dict)

POS Tags and Grams:
Sentence 1:
	1-grams: ['DT', 'NNS', 'VBP', 'JJ']
	2-grams: [('DT', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ')]
---------------------------------------------------------------------------------
Sentence 2:
	1-grams: ['NNP', 'VBZ', 'DT', 'NN']
	2-grams: [('NNP', 'VBZ'), ('VBZ', 'DT'), ('DT', 'NN')]
---------------------------------------------------------------------------------
Sentence 3:
	1-grams: ['NNP', 'VBZ', 'JJ']
	2-grams: [('NNP', 'VBZ'), ('VBZ', 'JJ')]
---------------------------------------------------------------------------------
Jaccard Similarity (Sentence 1 vs. Sentence 2):
	1-grams: 0.14285714285714285
	Union 1-gram: 7, Intersection 1-gram: 1
	2-grams: 0.0
	Union 2-gram: 6, Intersection 2-gram: 0
-------------------------------------------------------
Jaccard Similarity (Sentence 1 vs. Sentence 3):
	1-grams: 0.16666666666666666
	Union 1-gram: 6, Intersection 1-gram: 1
	2-grams: 0.0
	Union 2-gram: 5, Intersection 2-gram: 0
----------------------------------