In [2]:
lines = open('cricketbat.txt').read()

In [3]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(lines)
tokens = [token.lower() for token in tokens]

In [4]:
BAD_CHARS = [';', ':', '!', "*", '<', '>','#','?','@','p',',','.','(',')']
no_bad_chars = list(filter(lambda token: token not in BAD_CHARS, tokens))

In [5]:
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))
no_stop_words = list(filter(lambda token: token not in STOP_WORDS, no_bad_chars))

In [6]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate

stemmer = PorterStemmer()
stem = [stemmer.stem(token) for token in no_stop_words]


lemmatizer = WordNetLemmatizer()
lemma = [lemmatizer.lemmatize(token) for token in no_stop_words]

print(tabulate(zip(no_stop_words, stem, lemma), headers=['WORD', 'STEM', 'LEMMA']))

lemmatizer.lemmatize()

WORD              STEM              LEMMA
----------------  ----------------  ----------------
blade             blade             blade
cricket           cricket           cricket
bat               bat               bat
wooden            wooden            wooden
block             block             block
generally         gener             generally
flat              flat              flat
striking          strike            striking
face              face              face
ridge             ridg              ridge
reverse           revers            reverse
back              back              back
concentrates      concentr          concentrate
wood              wood              wood
middle            middl             middle
ball              ball              ball
generally         gener             generally
hit               hit               hit
bat               bat               bat
traditionally     tradit            traditionally
made              made              made
will

TypeError: lemmatize() missing 1 required positional argument: 'word'

In [None]:
from nltk import pos_tag
pos_tags = list(pos_tag(no_stop_words))
print(pos_tags[:5])

word, tags = zip(*pos_tags)
print(tabulate(zip(no_stop_words, stem, lemma, tags), headers=['WORD', 'STEM', 'LEMMA', 'POS_TAG']))

In [None]:
from collections import Counter

uniGram = Counter(tokens)
biGram = Counter(zip(tokens, tokens[1:]))

def calculate_probability(sentence, uniGram, biGram):
    output_probability = 1
    
    for word1, word2 in zip(sentence, sentence[1:]):
        probability = biGram[(word1, word2)] / uniGram[word1]
        output_probability *= probability
    
    return output_probability

In [None]:
from nltk import RegexpParser

chunker = RegexpParser("""
    NP: {<DT>?<JJ>*<NN>}
    P: {<IN>}
    V: {<V>.*}
    PP: {<P> <NP>}
    VP: {<V> <NP|PP>*}
""")

  
output = chunker.parse(pos_tags)
output.draw()

In [None]:
from nltk.corpus import wordnet

word = "sample"

for synonym in wordnet.synsets(word):
    print(synonym)
    print(synonym.definition())
    print(synonym.examples())

In [None]:
from nltk.corpus import wordnet

def preprocess(lines):
    tokens = word_tokenize(lines)
    tokens = [token.lower() for token in tokens]
    
    no_bad_chars = list(filter(lambda token: token not in BAD_CHARS, tokens))
    no_stop_words = list(filter(lambda token: token not in STOP_WORDS, no_bad_chars))
    
    return no_stop_words

def get_word_lemmas(tokens):
    word_lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return word_lemmas

def get_synonyms_lemma(word):
    synonyms = []

    for synonym in wordnet.synsets(word):
        synonyms += [lemma.name() for lemma in synonym.lemmas()]

    return synonyms

def get_word_and_syn_lemmas(tokens):
    word_and_syn_lemmas = []    # Should contain the lemma of the word and its synonyms

    for word in tokens:
        word_and_syn_lemmas.append(lemmatizer.lemmatize(word))   # Adding the lemma of the word
        word_and_syn_lemmas.extend(get_synonyms_lemma(word))     # Adding the lemma of all the synonyms of the word
    
    return word_and_syn_lemmas

def words_simlilarity_score(word1, word2):
    word1 = word1 + ".n.01"
    word2 = word2 + ".n.01"

    try:
        w1 = wordnet.synset(word1)
        w2 = wordnet.synset(word2)
        return w1.wup_similarity(w2)
    except:
        return 0

In [None]:
query_sentence = "Will be given by the user"
file_1 = open('cricketbat.txt').read() 
file_2 = open('vampirebat.txt').read()

tokens1 = preprocess(file_1) 
tokens2 = preprocess(file_2) 
tokens3 = preprocess(query_sentence)


word_lemmas_1 = get_word_lemmas(tokens1)
word_syn_lemmas_1 = get_word_and_syn_lemmas(tokens1)

word_lemmas_2 = get_word_lemmas(tokens2)
word_syn_lemmas_2 = get_word_and_syn_lemmas(tokens2)

word_lemmas_3 = get_word_lemmas(tokens3)
word_syn_lemmas_3 = get_word_and_syn_lemmas(tokens3)

exact_word_match13 = 0
exact_word_match23 = 0
for word3 in word_lemmas_3:
    for word1 in word_lemmas_1:
        exact_word_match13 += 1 if word1 == word3 else 0

    for word2 in word_lemmas_2:
        exact_word_match23 += 1 if word2 == word3 else 0

similarity_score13 = 0
similarity_score23 = 0
for word3 in word_lemmas_3:
    for word1 in word_lemmas_1:
        similarity_score13 += words_simlilarity_score(word3, word1)

    for word2 in word_lemmas_2:
        similarity_score23 += words_simlilarity_score(word3, word2)

file1_score = exact_word_match13 + similarity_score13
file2_score = exact_word_match23 + similarity_score23

if file1_score > file2_score: print("The query sentence belongs to file1")
else: print("The query sentence belongs to file2")

What follows is for HMM

In [None]:
import nltk 
import math
from itertools import chain

TRAIN_PERCENT = 0.7

training_corpus = nltk.corpus.brown
tagged_sents = training_corpus.tagged_sents()

X, y = [], []
for sentence in tagged_sents:
    x_, y_ = list(zip(*sentence))
    X += x_
    y += y_

train_size = math.ceil(TRAIN_PERCENT * len(X))

X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:]
y_test = y[train_size:]

In [None]:
train_vocab = frozenset(chain([s for s in X_train]))

In [None]:
from pomegranate import State,HiddenMarkovModel,DiscreteDistribution
model = HiddenMarkovModel(name="HMM")

In [None]:
from collections import Counter, defaultdict

def get_unigram(sequences):
    return Counter(chain(sequences))

def get_bigram(sequences):
    bigrams = defaultdict(int)
    
    for sequence in sequences:
        for i in zip(sequence, sequence[1:]): 
            bigrams[i] += 1
    
    return bigrams

In [None]:
unique_tags = set(y)
tag_unigrams = get_unigram(y_train)
tag_bigrams = get_bigram(y_train)

emission_counts = defaultdict(lambda: defaultdict(int))
for word, tag in zip(X_train, y_train):
    emission_counts[tag][word] += 1

emissions_distribution = {}
states = dict()
for tag in unique_tags:
    for word in emission_counts[tag]:
        emissions_distribution[word] = emission_counts[tag][word] / tag_unigrams[tag]
    tag_emissions = DiscreteDistribution(emissions_distribution)
    tag_state = State(tag_emissions, name=tag)
    states[tag]=tag_state

model.add_states([elt for elt in states.values()])

In [None]:
start_state_count=defaultdict(int)
for sentence in tagged_sents:
    start_state_count[sentence[0][1]] +=1

tag_starts_sum=sum(start_state_count.values())
for tag in unique_tags:    
    prob = start_state_count[tag] / tag_starts_sum
    model.add_transition(model.start, states[tag], prob)

In [None]:
stop_state_count=defaultdict(int)
for sentence in tagged_sents:
    stop_state_count[sentence[-1][1]] +=1

tag_stop_sum=sum(stop_state_count.values())
for tag in unique_tags:    
    prob = stop_state_count[tag] / tag_stop_sum
    model.add_transition(states[tag], model.end, prob)

In [None]:
for t1, t2 in tag_bigrams.keys():
    # Compute the transition probability P(t2|t1)=C(t1,t2) / C(t1)
    if tag_unigrams[t1] <= 0: continue
    prob = tag_bigrams[(t1,t2)] / tag_unigrams[t1]
    model.add_transition(states[t1], states[t2] , prob)
model.bake()
_, state_path = model.viterbi(X_test)
output_sequence=[state[1].name for state in state_path[1:-1]]