<a href="https://colab.research.google.com/github/neochoon/2024_UTS/blob/main/Week3_POSTagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## In this exercise, we will
- try an existing part-of-speech (POS) tagger from the Python library "nltk", and then
- create our own POS tagger.

In [None]:

import nltk
from nltk.tokenize import word_tokenize
# Tokenizer and tagger models used by word_tokenize() and nltk.pos_tag().
# Recent NLTK releases (3.9+) look these up under the names 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', while older releases use the legacy names,
# so fetch both sets to keep the notebook runnable on either.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)


In [None]:
text = word_tokenize("And now for something completely different")
# A notebook cell only displays its last expression, so print this first
# result explicitly; otherwise it is silently discarded.
print(nltk.pos_tag(text))

# Same word with different POS tags
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

# Your turn - which words can you think of that take different POS tags within the same sentence?

############################




In [None]:
# Now let's train our own POS tag classifier
# Download tagged text data
nltk.download('treebank')
# Each sentence is a sequence of (word, tag) pairs; the later cells slice this
# collection into training and test portions.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Check a sentence
print(tagged_sentences[0])

############################


In [None]:
# Now let's create features for each word.

def features(sentence, index):
    """Build the feature dictionary describing one token of a sentence.

    Args:
        sentence: list of word strings, e.g. [w1, w2, ...].
        index: position in ``sentence`` of the word to describe.

    Returns:
        dict mapping feature names to values: the word itself, position flags,
        casing flags, prefixes/suffixes of up to three characters, the
        neighbouring words ('' at a sentence boundary) and a few shape flags.

    Raises:
        IndexError: if ``index`` is outside the sentence.
    """
    word = sentence[index]
    last = len(sentence) - 1
    # Slices instead of word[0] / word[-1] so that an empty token yields ''
    # rather than raising IndexError.
    first_char = word[:1]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last,
        # Note: also True for tokens starting with a digit or punctuation,
        # because upper() leaves such characters unchanged.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first changes under lower().
        'capitals_inside': word[1:].lower() != word[1:]
    }

import pprint
# Pretty-print the feature dictionary computed for the third token ('a').
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2)) # index == 2

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}

# Try different indices

############################

# Small helper function to strip the tags from our tagged corpus and feed it to our classifier:
def untag(tagged_sentence):
    """Return only the words of a tagged sentence, discarding each tag."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words


# Split the dataset for training and testing
# The first 75% of the sentences (in corpus order, no shuffling) are used for
# training and the remaining 25% for testing.
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

print(len(training_sentences))   # 2935
print(len(test_sentences))      # 979

# Transform the list of sentences to a list of features
def transform_to_dataset(tagged_sentences):
    """Convert tagged sentences into parallel feature and label lists.

    Args:
        tagged_sentences: iterable of sentences, each a sequence of
            (word, tag) pairs.

    Returns:
        (X, y) where X[i] is the feature dict of the i-th token (tokens are
        taken sentence by sentence, in order) and y[i] is that token's tag.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence rather than once per token.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)

    return X, y

# Per-token feature dicts (X) and their tags (y) for the training portion.
X, y = transform_to_dataset(training_sentences)



In [None]:
# We are now ready to train a classifier.

###### Now you have to click the "Run" button above ######
###### Do NOT copy & paste the below ######

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Number of training tokens to fit on. Kept small so the cell can be re-run
# quickly; raise it for a better tagger (training then takes a fair bit longer).
N_TRAIN_SAMPLES = 100
# Fixed seed so that repeated runs build the same tree and report the same accuracy.
RANDOM_STATE = 42

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE))
])

clf.fit(X[:N_TRAIN_SAMPLES], y[:N_TRAIN_SAMPLES])

print("Training completed")

# Evaluate on the held-out sentences.
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))


In [None]:

# Now you can use your classifier to tag any tokenized sentence, such as "I am studying NLP at UTS."
def my_pos_tag(sentence):
    """Tag a tokenized sentence with the trained classifier ``clf``.

    Args:
        sentence: list of word strings.

    Returns:
        list of (word, tag) tuples, one per input word, in order. An empty
        sentence yields an empty list.
    """
    # Nothing to predict; skip the classifier call for an empty input.
    if not sentence:
        return []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    # Pair each word with its predicted tag (converted to a plain str) instead
    # of returning the word list and the tag array as two separate objects.
    return [(word, str(tag)) for word, tag in zip(sentence, tags)]

# Tokenize a sample sentence and print what our own tagger returns for it.
my_text = word_tokenize("I am studying NLP at UTS.")
print(list(my_pos_tag(my_text)))