# HMM POS tagger for German, NLP assignment 2, FHNW FS18

## Read and prepare data

In [368]:
# map Stuttgart-Tübingen tagset to universal tagset
stts_to_univ = {
'ADJA': 'ADJ',
'ADJD': 'ADJ',
'ADV': 'ADV',
'APPR': 'ADP',
'APPRART': 'ADP',
'APPO': 'ADP',
'APZR': 'ADP',
'ART': 'DET',
'CARD': 'NUM',
'FM': 'X',
'ITJ': 'X',
'KOUI': 'CONJ',
'KOUS': 'CONJ',
'KON': 'CONJ',
'KOKOM': 'CONJ',
'NN': 'NOUN',
'NE': 'NOUN',
'PDS': 'PRON',
'PDAT': 'PRON',
'PIS': 'PRON',
'PIAT': 'PRON',
'PIDAT': 'PRON',
'PPER': 'PRON',
'PPOSS': 'PRON',
'PPOSAT': 'PRON',
'PRELS': 'PRON',
'PRELAT': 'PRON',
'PRF': 'PRON',
'PWS': 'PRON',
'PWAT': 'PRON',
'PWAV': 'PRON',
'PAV': 'PRON',
'PTKZU': 'PRT',
'PTKNEG': 'PRT',
'PTKVZ': 'PRT',
'PTKANT': 'PRT',
'PTKA': 'PRT',
'TRUNC': 'X',
'VVFIN': 'VERB',
'VVIMP': 'VERB',
'VVINF': 'VERB',
'VVIZU': 'VERB',
'VVPP': 'VERB',
'VAFIN': 'VERB',
'VAIMP': 'VERB',
'VAINF': 'VERB',
'VAPP': 'VERB',
'VMFIN': 'VERB',
'VMINF': 'VERB',
'VMPP': 'VERB',
'XY': 'X',
'$,': '.',
'$.': '.',
'$(': '.'
}

In [486]:
def readfile(filename):
    with open(filename, 'r') as f:
        lines = f.readlines() # sentences are split by newline
        
    tagged_sents = []
    for i, tagged_sent in enumerate(lines):
        tagged_sents.append([])
        for pair in tagged_sent.split(';'): # tagged words are split by ;
            splitted = pair.split('/') # word and tag are split by /
            if (len(splitted) == 2): # making sure we only get actual word-tag pairs
                word = splitted[0].strip().lower()
                tag = stts_to_univ[splitted[1].strip()] # map the stts tag to a universal tag
                tagged_sents[i].append((word, tag))
    
    return tagged_sents

In [487]:
tagged_sents = readfile("POS_German_train.txt")
tagged_sents_minitest = readfile("POS_German_minitest.txt")

In [488]:
from sklearn.model_selection import train_test_split

# split data so we have additional testdata (not only sentences_minitest)
tagged_sents_train, tagged_sents_test = train_test_split(tagged_sents, test_size=0.2, random_state=9000)

In [489]:
tagged_sents_train[:3]

[[('paris', 'NOUN'), ('.', '.')],
 [('wenn', 'CONJ'),
  ('ich', 'PRON'),
  ('die', 'DET'),
  ('bewegung', 'NOUN'),
  ('des', 'DET'),
  ('films', 'NOUN'),
  ('nicht', 'PRT'),
  ('spuere', 'VERB'),
  (',', '.'),
  ('kann', 'VERB'),
  ('ich', 'PRON'),
  ('nicht', 'PRT'),
  ('schreiben', 'VERB'),
  ('.', '.')],
 [('``', '.'),
  ('unsere', 'PRON'),
  ('division', 'NOUN'),
  ('ist', 'VERB'),
  ('genauso', 'ADV'),
  ('volksverbunden', 'ADJ'),
  ('wie', 'CONJ'),
  ('andere', 'PRON'),
  ('teile', 'NOUN'),
  ('der', 'DET'),
  ('armee', 'NOUN'),
  ("''", '.'),
  (',', '.'),
  ('erfaehrt', 'VERB'),
  ('helmut', 'NOUN'),
  ('kohl', 'NOUN'),
  ('aus', 'ADP'),
  ('erster', 'ADJ'),
  ('hand', 'NOUN'),
  (',', '.'),
  ('wie', 'PRON'),
  ('der', 'DET'),
  ('redetext', 'NOUN'),
  ('des', 'DET'),
  ('oberst', 'NOUN'),
  ('zeigt', 'VERB'),
  ('.', '.')]]

## Calculate emission probabilities P(word | tag)

In [527]:
import itertools
from nltk import FreqDist, ConditionalFreqDist

tagged_words = list(itertools.chain(*tagged_sents_train)) # flatten

tag_freqs = FreqDist(tag  for word, tag in tagged_words)
word_freqs = FreqDist(word for word, tag in tagged_words)
tag_probs =  dict(map(lambda pair: (pair[0], pair[1]/tag_freqs.N()), tag_freqs.items()))
word_probs = dict(map(lambda pair: (pair[0], pair[1]/word_freqs.N()), word_freqs.items()))

cond_tag_freqs = ConditionalFreqDist(tagged_words)
cond_tag_probs = {} # P(tag | word)
for word, tag_fdist in cond_tag_freqs.items():
    cond_tag_probs[word] = dict(map(lambda pair: (pair[0], pair[1]/tag_fdist.N()), tag_fdist.items()))

# using bayes law to get P(word | tag)
cond_word_probs = {} # P(word | tag)
for tag in tag_freqs:
    cond_word_probs[tag] = {}
    for word in word_freqs:
        if tag in cond_tag_probs[word]:
            cond_word_probs[tag][word] = cond_tag_probs[word][tag] * word_probs[word] / tag_probs[tag]

In [528]:
# make sure all the sums are (approx.) 1
for c in cond_word_probs.items():
    print(c[0] + ":\t" + str(sum(b for a, b in c[1].items())))

NOUN:	1.0000000000003337
.:	1.0
CONJ:	0.9999999999999998
PRON:	0.9999999999999977
DET:	0.9999999999999999
PRT:	0.9999999999999997
VERB:	1.0000000000000764
ADV:	1.0000000000000095
ADJ:	0.99999999999976
ADP:	1.000000000000001
NUM:	0.9999999999999829
X:	1.000000000000001


## Calculate transition probabilities P(tag_i | tag_j)

In [544]:
cfd = nltk.ConditionalFreqDist()
for tagged_sent in tagged_sents_train[:50]:
    cfd += nltk.ConditionalFreqDist([('<S>', tagged_sent[0][1])])
    for i in range(1, len(tagged_sent)):
        cfd += nltk.ConditionalFreqDist([(tagged_sent[i - 1][1], tagged_sent[i][1])])

transition_probs = {}
for tag in list(cfd):
    transition_probs[tag] = {}
    for tag2 in list(cfd):
        transition_probs[tag][tag2] = 0.0
    for tag2 in list(cfd[tag]):
        transition_probs[tag][tag2] = float(cfd[tag][tag2])/float(cfd[tag].N())

In [545]:
# visualize transition probabilities
head = []
head.append('{0:5}'.format(''))
for tag in list(cfd):
    head.append('{0:5}'.format(tag))

print(head)
    
for tag1 in list(cfd):
    line = []
    line.append('{0:5}'.format(tag1))
    for tag in list(cfd):
        line.append('{:>.3f}'.format((transition_probs[tag1][tag])) )
    print(line)

['     ', '<S>  ', 'NOUN ', 'CONJ ', 'PRON ', 'DET  ', 'PRT  ', 'VERB ', '.    ', 'ADV  ', 'ADJ  ', 'ADP  ', 'NUM  ', 'X    ']
['<S>  ', '0.000', '0.280', '0.080', '0.080', '0.200', '0.000', '0.000', '0.120', '0.060', '0.040', '0.100', '0.040', '0.000']
['NOUN ', '0.000', '0.123', '0.051', '0.018', '0.087', '0.040', '0.192', '0.228', '0.029', '0.072', '0.156', '0.004', '0.000']
['CONJ ', '0.000', '0.359', '0.000', '0.179', '0.205', '0.000', '0.026', '0.026', '0.051', '0.077', '0.077', '0.000', '0.000']
['PRON ', '0.000', '0.317', '0.017', '0.067', '0.117', '0.067', '0.200', '0.017', '0.083', '0.033', '0.067', '0.017', '0.000']
['DET  ', '0.000', '0.721', '0.000', '0.010', '0.000', '0.000', '0.000', '0.000', '0.010', '0.231', '0.019', '0.000', '0.010']
['PRT  ', '0.000', '0.000', '0.000', '0.000', '0.100', '0.000', '0.450', '0.300', '0.100', '0.050', '0.000', '0.000', '0.000']
['VERB ', '0.000', '0.060', '0.009', '0.095', '0.086', '0.009', '0.138', '0.405', '0.112', '0.009', '0.078', '0

## Use hidden Markov model to determine POS

In [514]:
## TODO maybe implement my own ;P

from nltk.tag.hmm import HiddenMarkovModelTrainer, HiddenMarkovModelTagger
from nltk.probability import LidstoneProbDist

tag_set = set([tag for sentence in sentences for word, tag in sentence])
word_set = set([word for sentence in sentences for word, tag in sentence])

trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_set)) # make lists so items can potentionally be appended

tagger = trainer.train_supervised(sentences_train, estimator=lambda fd, bins: LidstoneProbDist(fd, .1, bins))

In [515]:
tagger.test(sentences_test) #, verbose=True --> TAKES FOREVER, but super high accuracy

KeyboardInterrupt: 

In [516]:
tagger.test(sentences_minitest[:100]) #, verbose=True --> TAKES FOREVER, but super high accuracy

accuracy over 1812 tokens: 96.52


In [517]:
tagger.tag("Der schnelle Fuchs springt über den faulen Zaun".split())

[('Der', 'DET'),
 ('schnelle', 'ADJ'),
 ('Fuchs', 'NOUN'),
 ('springt', 'VERB'),
 ('über', 'ADP'),
 ('den', 'DET'),
 ('faulen', 'ADJ'),
 ('Zaun', 'NOUN')]