In [None]:
import nltk
import numpy as np
import pandas as pd

In [None]:
# Fetch the NLTK 'treebank' corpus (a no-op when it is already present) and
# load its sentences; each sentence is a list of (word, tag) pairs.
nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()

sentence_total = len(tagged_sentences)
print("Tagged sentences: ", sentence_total)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Tagged sentences:  3914


# Hidden Markov Model

In [None]:
from nltk.probability import LidstoneProbDist
from nltk.tag import hmm


# Train on the first 80% of the corpus only. The evaluation cell scores the
# tagger on tagged_sentences[round(len(tagged_sentences) * 0.8):]; fitting on
# the whole corpus would let the model memorise those very sentences and
# inflate the reported accuracy.
hmm_split_index = round(len(tagged_sentences) * 0.8)
hmm_training_sentences = tagged_sentences[:hmm_split_index]

trainer = hmm.HiddenMarkovModelTrainer()
# Lidstone smoothing (gamma = 0.1) keeps a small probability mass for events
# that never occurred in training, so held-out words the model has not seen do
# not receive a zero emission probability under every tag.
tagger = trainer.train_supervised(
    hmm_training_sentences,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
)

print(tagger)

<HiddenMarkovModelTagger 46 states and 12408 output symbols>


In [None]:
# Evaluate the HMM tagger on the last 20% of the corpus: collect the gold tag
# sequence and the predicted tag sequence of every held-out sentence.
split_index = round(len(tagged_sentences) * 0.8)
held_out_sentences = tagged_sentences[split_index:]

y_test = [[tag for _, tag in sent] for sent in held_out_sentences]
predictions = [
    [tag for _, tag in tagger.tag([word for word, _ in sent])]
    for sent in held_out_sentences
]

In [None]:
def accuracy_of_model(y_test, y_pred, token_level=False):
    """Return the fraction of correct predictions.

    Args:
        y_test: sequence of gold items; in this notebook, one list of tags
            per sentence.
        y_pred: sequence of predicted items, aligned with ``y_test``.
        token_level: when False (default) an item counts as correct only if
            the whole item matches (``y_test[i] == y_pred[i]``), i.e. for
            tag lists the entire sentence must be tagged correctly. When
            True, individual tags are compared instead and the score is
            ``correct tags / gold tags``; a gold tag without a matching
            prediction counts as wrong.

    Returns:
        A float in [0, 1]; 0.0 when there is nothing to score.

    Raises:
        ValueError: if ``y_test`` and ``y_pred`` have different lengths.
    """
    if len(y_test) != len(y_pred):
        raise ValueError(
            f"y_test and y_pred differ in length: {len(y_test)} != {len(y_pred)}"
        )

    if token_level:
        total = sum(len(gold) for gold in y_test)
        correct = sum(
            gold_tag == pred_tag
            for gold, pred in zip(y_test, y_pred)
            for gold_tag, pred_tag in zip(gold, pred)
        )
    else:
        total = len(y_test)
        correct = sum(1 for gold, pred in zip(y_test, y_pred) if gold == pred)

    # Guard the empty case instead of dividing by zero.
    return correct / total if total else 0.0
# Share of held-out sentences whose complete tag sequence the HMM got right.
hmm_accuracy = accuracy_of_model(y_test, predictions)
print(hmm_accuracy)

0.6819923371647509


In [None]:
from nltk import word_tokenize

# word_tokenize needs the Punkt tokenizer data. NLTK 3.9+ loads it from the
# 'punkt_tab' package while older releases use 'punkt', so request both; with
# its default settings nltk.download reports an unavailable package instead of
# raising.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Tag a sentence typed by the user with the HMM tagger.
input_sent = input()
tokens = word_tokenize(input_sent)
tags = tagger.tag(tokens)
print(tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


I am Peter
[('I', 'PRP'), ('am', 'VBP'), ('Peter', 'NNP')]


# Conditional Random Field (CRF)

In [None]:
def features(sentence, index):
    """Build the feature dict for the token at position ``index``.

    sentence: [w1, w2, ...] as plain word strings. Tagged tokens
        ``(word, tag)`` are accepted as well; only the word is used, so a
        gold tag can never end up inside the features.
    index: the index of the word within ``sentence``.

    Returns a dict of string/bool features describing the word itself
    (casing, prefixes, suffixes, hyphen, digits), its position in the
    sentence, and its immediate neighbours.
    """
    def word_at(position):
        token = sentence[position]
        # Some callers pass tagged sentences. Indexing a (word, tag) tuple
        # like a string would make e.g. token[-1] the tag itself, leaking the
        # label into the features, so reduce such tokens to the word first.
        return token[0] if isinstance(token, tuple) else token

    word = word_at(index)
    last_index = len(sentence) - 1
    # Slices instead of word[0] / word[-1]: same result for non-empty words,
    # but an empty token yields '' rather than raising IndexError.
    first_char = word[:1]

    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else word_at(index - 1),
        'next_word': '' if index == last_index else word_at(index + 1),
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:]
    }

In [None]:
from nltk.tag.util import untag

# Hold out the tail of the corpus: the first 75% of the sentences are used for
# training, the remaining sentences for testing.
cutoff = int(len(tagged_sentences) * .75)
training_sentences, test_sentences = tagged_sentences[:cutoff], tagged_sentences[cutoff:]

def transform_to_dataset(tagged_sentences):
    """Convert tagged sentences into per-sentence feature and label lists.

    tagged_sentences: iterable of sentences, each a sequence of
        (word, tag) pairs.

    Returns (X, y): X[i] is the list of feature dicts (one per token,
    built by ``features``) for sentence i, and y[i] is the list of that
    sentence's tags in the same order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; calling untag() inside the
        # per-token comprehension rebuilt the same word list for every token.
        words = untag(tagged)
        X.append([features(words, index) for index in range(len(words))])
        y.append([tag for _, tag in tagged])

    return X, y

# Build the per-sentence feature lists (X) and tag lists (y) for both splits.
# Note that y_test is rebound here and from now on refers to this 25% split.
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

# Show the split sizes, then the features and gold tags of the first test sentence.
for shown in (len(X_train), len(X_test), X_test[0], y_test[0]):
    print(shown)


2935
979
[{'word': 'We', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'W', 'prefix-2': 'We', 'prefix-3': 'We', 'suffix-1': 'e', 'suffix-2': 'We', 'suffix-3': 'We', 'prev_word': '', 'next_word': 'can', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'can', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'c', 'prefix-2': 'ca', 'prefix-3': 'can', 'suffix-1': 'n', 'suffix-2': 'an', 'suffix-3': 'can', 'prev_word': 'We', 'next_word': 'understand', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'understand', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'u', 'prefix-2': 'un', 'prefix-3': 'und', 'suffix-1': 'd', 'suffix-2': 'nd', 'suffix-3': 'and', 'prev_word': 'can', 'next_word': 'and', 'has_hyphen': False, 'is_numeric': F

In [None]:
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import warnings

from sklearn_crfsuite import CRF

model = CRF()

try:
    model.fit(X_train, y_train)
except AttributeError as fit_error:
    # NOTE(review): presumably a workaround for an incompatibility between
    # sklearn_crfsuite and the installed scikit-learn - confirm whether it is
    # still needed. Stay best-effort, but report the error instead of
    # swallowing it silently.
    warnings.warn(f"CRF.fit raised AttributeError; continuing anyway: {fit_error!r}")
predictions = model.predict(X_test)

In [None]:
# Share of test sentences whose complete tag sequence the CRF got right.
crf_accuracy = accuracy_of_model(y_test, predictions)
print(crf_accuracy)

0.4024514811031665


In [None]:
# Read one sentence from the user and split it into word tokens.
input_sent = input()
tokens = word_tokenize(input_sent)

def pos_tag(sentence):
    """Tag a tokenized sentence with the CRF model.

    sentence: list of word strings.
    Returns a list of (word, predicted_tag) pairs in sentence order.
    """
    sentence_features = []
    for position in range(len(sentence)):
        sentence_features.append(features(sentence, position))
    # predict() takes a batch of sentences, so wrap the single sentence in a
    # list and take the first (only) result.
    predicted_tags = model.predict([sentence_features])[0]
    return list(zip(sentence, predicted_tags))

# Print the (word, tag) pairs the CRF predicts for the typed sentence.
crf_tagged_tokens = pos_tag(tokens)
print(crf_tagged_tokens)

I am Tom
[('I', 'PRP'), ('am', 'VBP'), ('Tom', 'NNP')]


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# One-hot encode the per-token feature dicts, then fit a multiclass logistic
# regression on them. `multi_class` is no longer passed: 'auto' is already the
# default, and passing the parameter explicitly is deprecated since
# scikit-learn 1.5.
vectorizer = DictVectorizer(sparse = True)
lr = LogisticRegression(solver = 'lbfgs')
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', lr)
])

# Unlike the CRF, this classifier is trained on individual tokens, so the
# sentences are flattened into one feature dict and one tag per token.
# NOTE(review): training_sentences holds (word, tag) pairs, so features()
# receives tagged tokens here - verify that features() reads only the word,
# otherwise the gold tag ends up inside the feature dict.
train_token_features = [
    features(sent, i)
    for sent in training_sentences
    for i in range(len(sent))
]
train_token_tags = [tag for sent in training_sentences for _, tag in sent]

pipeline.fit(train_token_features, train_token_tags)

Pipeline(steps=[('vectorizer', DictVectorizer()),
                ('classifier', LogisticRegression())])

In [None]:
# Predict one tag per token of the test split (flattened across sentences).
# NOTE(review): test_sentences holds (word, tag) pairs, so features() receives
# tagged tokens here - verify that features() reads only the word.
test_token_features = [
    features(sent, position)
    for sent in test_sentences
    for position in range(len(sent))
]
predictions = pipeline.predict(test_token_features)

In [None]:
from itertools import chain
# Flatten the per-sentence tag lists into one list with a tag per token, in
# the same order as the per-token predictions.
y_test_trans = [tag for sentence_tags in y_test for tag in sentence_tags]

In [None]:
import sklearn
# Import the metric explicitly: `import sklearn` alone does not guarantee that
# the sklearn.metrics submodule has been loaded.
from sklearn.metrics import accuracy_score

# Token-level accuracy of the logistic-regression tagger on the test split.
print(accuracy_score(y_test_trans, predictions))

1.0


In [None]:
# Tag a sentence typed by the user with the logistic-regression pipeline:
# tokenize it, build one feature dict per token, and print tokens and tags.
input_sent = input()
tokens = word_tokenize(input_sent)
X = [features(tokens, position) for position, _ in enumerate(tokens)]
print(tokens)
print(pipeline.predict(X))

I am very tired
['I', 'am', 'very', 'tired']
['PRP' 'DT' 'NN' 'NN']
