# HMM POS tagger for German, NLP assignment 2, FHNW FS18

## Read and prepare data

In [1]:
# map Stuttgart-Tübingen tagset to universal tagset
stts_to_univ = {
'ADJA': 'ADJ',
'ADJD': 'ADJ',
'ADV': 'ADV',
'APPR': 'ADP',
'APPRART': 'ADP',
'APPO': 'ADP',
'APZR': 'ADP',
'ART': 'DET',
'CARD': 'NUM',
'FM': 'X',
'ITJ': 'X',
'KOUI': 'CONJ',
'KOUS': 'CONJ',
'KON': 'CONJ',
'KOKOM': 'CONJ',
'NN': 'NOUN',
'NE': 'NOUN',
'PDS': 'PRON',
'PDAT': 'PRON',
'PIS': 'PRON',
'PIAT': 'PRON',
'PIDAT': 'PRON',
'PPER': 'PRON',
'PPOSS': 'PRON',
'PPOSAT': 'PRON',
'PRELS': 'PRON',
'PRELAT': 'PRON',
'PRF': 'PRON',
'PWS': 'PRON',
'PWAT': 'PRON',
'PWAV': 'PRON',
'PAV': 'PRON',
'PTKZU': 'PRT',
'PTKNEG': 'PRT',
'PTKVZ': 'PRT',
'PTKANT': 'PRT',
'PTKA': 'PRT',
'TRUNC': 'X',
'VVFIN': 'VERB',
'VVIMP': 'VERB',
'VVINF': 'VERB',
'VVIZU': 'VERB',
'VVPP': 'VERB',
'VAFIN': 'VERB',
'VAIMP': 'VERB',
'VAINF': 'VERB',
'VAPP': 'VERB',
'VMFIN': 'VERB',
'VMINF': 'VERB',
'VMPP': 'VERB',
'XY': 'X',
'$,': '.',
'$.': '.',
'$(': '.'
}

In [2]:
def readfile(filename):
    with open(filename, 'r') as f:
        lines = f.readlines() # sentences are split by newline
        
    tagged_sents = []
    for i, tagged_sent in enumerate(lines):
        tagged_sents.append([])
        for pair in tagged_sent.split(';'): # tagged words are split by ;
            splitted = pair.split('/') # word and tag are split by /
            if (len(splitted) == 2): # making sure we only get actual word-tag pairs
                word = splitted[0].strip().lower()
                tag = stts_to_univ[splitted[1].strip()] # map the stts tag to a universal tag
                tagged_sents[i].append((word, tag))
    
    return tagged_sents

In [4]:
tagged_sents = readfile("POS_German_train.txt")
tagged_sents_minitest = readfile("POS_German_minitest.txt")

In [5]:
from sklearn.model_selection import train_test_split

# split data so we have additional testdata (not only sentences_minitest)
tagged_sents_train, tagged_sents_test = train_test_split(tagged_sents, test_size=0.2, random_state=9000)

## Calculate emission probabilities P(word | tag)

In [7]:
import itertools
from nltk import FreqDist, ConditionalFreqDist

tagged_words = list(itertools.chain(*tagged_sents_train)) # flatten

emissions = ConditionalFreqDist((tag, word) for word, tag in tagged_words)

"""
tag_freqs = FreqDist(tag  for word, tag in tagged_words)
word_freqs = FreqDist(word for word, tag in tagged_words)
tag_probs =  dict(map(lambda pair: (pair[0], pair[1]/tag_freqs.N()), tag_freqs.items()))
word_probs = dict(map(lambda pair: (pair[0], pair[1]/word_freqs.N()), word_freqs.items()))

cond_tag_freqs = ConditionalFreqDist(tagged_words)
cond_tag_probs = {} # P(tag | word)
for word, tag_fdist in cond_tag_freqs.items():
    cond_tag_probs[word] = dict(map(lambda pair: (pair[0], pair[1]/tag_fdist.N()), tag_fdist.items()))

# using bayes law to get P(word | tag)
cond_word_probs = {} # P(word | tag)
for tag in tag_freqs:
    cond_word_probs[tag] = {}
    for word in word_freqs:
        if tag in cond_tag_probs[word]:
            cond_word_probs[tag][word] = cond_tag_probs[word][tag] * word_probs[word] / tag_probs[tag]

# make sure all the sums are (approx.) 1
for c in cond_word_probs.items():
    print(c[0] + ":\t" + str(sum(b for a, b in c[1].items())))
"""
            


'\ntag_freqs = FreqDist(tag  for word, tag in tagged_words)\nword_freqs = FreqDist(word for word, tag in tagged_words)\ntag_probs =  dict(map(lambda pair: (pair[0], pair[1]/tag_freqs.N()), tag_freqs.items()))\nword_probs = dict(map(lambda pair: (pair[0], pair[1]/word_freqs.N()), word_freqs.items()))\n\ncond_tag_freqs = ConditionalFreqDist(tagged_words)\ncond_tag_probs = {} # P(tag | word)\nfor word, tag_fdist in cond_tag_freqs.items():\n    cond_tag_probs[word] = dict(map(lambda pair: (pair[0], pair[1]/tag_fdist.N()), tag_fdist.items()))\n\n# using bayes law to get P(word | tag)\ncond_word_probs = {} # P(word | tag)\nfor tag in tag_freqs:\n    cond_word_probs[tag] = {}\n    for word in word_freqs:\n        if tag in cond_tag_probs[word]:\n            cond_word_probs[tag][word] = cond_tag_probs[word][tag] * word_probs[word] / tag_probs[tag]\n\n# make sure all the sums are (approx.) 1\nfor c in cond_word_probs.items():\n    print(c[0] + ":\t" + str(sum(b for a, b in c[1].items())))\n'

In [13]:
print("Example: P(einfach | ADJ) = " + str(emissions['ADJ'].freq('einfach')))

P(einfach | ADJ) = 0.0003172454633898735


## Calculate transition probabilities P(tag_i | tag_j)

In [10]:
transitions = ConditionalFreqDist()
for tagged_sent in tagged_sents_train[:500]:
    transitions += ConditionalFreqDist([('START', tagged_sent[0][1])])
    for i in range(1, len(tagged_sent)):
        transitions += ConditionalFreqDist([(tagged_sent[i - 1][1], tagged_sent[i][1])])

In [11]:
# visualize transition probabilities
titlerow = []
titlerow.append('{0:5}'.format(''))
for tag in list(transitions):
    titlerow.append('{0:5}'.format(tag))

print(titlerow)
    
for tag1 in list(transitions):
    row = []
    row.append('{0:5}'.format(tag1))
    for tag2 in list(transitions):
        row.append('{:>.3f}'.format(transitions[tag1].freq(tag2)) )
    print(row)

['     ', 'START', 'NOUN ', 'CONJ ', 'PRON ', 'DET  ', 'PRT  ', 'VERB ', '.    ', 'ADV  ', 'ADJ  ', 'ADP  ', 'NUM  ', 'X    ']
['START', '0.000', '0.214', '0.058', '0.154', '0.208', '0.004', '0.012', '0.050', '0.112', '0.034', '0.138', '0.008', '0.008']
['NOUN ', '0.000', '0.126', '0.047', '0.017', '0.084', '0.033', '0.214', '0.212', '0.028', '0.039', '0.187', '0.012', '0.001']
['CONJ ', '0.000', '0.259', '0.003', '0.203', '0.203', '0.016', '0.057', '0.009', '0.073', '0.079', '0.076', '0.022', '0.000']
['PRON ', '0.000', '0.264', '0.013', '0.076', '0.095', '0.023', '0.177', '0.041', '0.081', '0.090', '0.129', '0.010', '0.000']
['DET  ', '0.000', '0.674', '0.000', '0.012', '0.001', '0.002', '0.000', '0.030', '0.008', '0.243', '0.023', '0.005', '0.001']
['PRT  ', '0.000', '0.000', '0.013', '0.013', '0.013', '0.033', '0.344', '0.404', '0.060', '0.066', '0.053', '0.000', '0.000']
['VERB ', '0.000', '0.057', '0.029', '0.120', '0.104', '0.015', '0.112', '0.376', '0.060', '0.027', '0.087', '0

## Use hidden Markov model to determine POS

In [514]:
## TODO maybe implement my own ;P

from nltk.tag.hmm import HiddenMarkovModelTrainer, HiddenMarkovModelTagger
from nltk.probability import LidstoneProbDist

tag_set = set([tag for sentence in sentences for word, tag in sentence])
word_set = set([word for sentence in sentences for word, tag in sentence])

trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_set)) # make lists so items can potentionally be appended

tagger = trainer.train_supervised(sentences_train, estimator=lambda fd, bins: LidstoneProbDist(fd, .1, bins))

In [515]:
tagger.test(sentences_test) #, verbose=True --> TAKES FOREVER, but super high accuracy

KeyboardInterrupt: 

In [516]:
tagger.test(sentences_minitest[:100]) #, verbose=True --> TAKES FOREVER, but super high accuracy

accuracy over 1812 tokens: 96.52


In [517]:
tagger.tag("Der schnelle Fuchs springt über den faulen Zaun".split())

[('Der', 'DET'),
 ('schnelle', 'ADJ'),
 ('Fuchs', 'NOUN'),
 ('springt', 'VERB'),
 ('über', 'ADP'),
 ('den', 'DET'),
 ('faulen', 'ADJ'),
 ('Zaun', 'NOUN')]