# HMM POS tagger, NLP assignment 2, FHNW FS18

## Read and prepare data

In [248]:
# Lookup table from Stuttgart-Tübingen (STTS) tags to universal tags.
# It is spelled out as (universal tag, STTS tags) groups and flattened into a
# plain dict keyed by STTS tag; 'X' appears in several groups so that the
# resulting key order stays the same as in a flat listing.
stts_to_univ = {
    stts_tag: universal_tag
    for universal_tag, stts_tags in (
        ('ADJ', ('ADJA', 'ADJD')),
        ('ADV', ('ADV',)),
        ('ADP', ('APPR', 'APPRART', 'APPO', 'APZR')),
        ('DET', ('ART',)),
        ('NUM', ('CARD',)),
        ('X', ('FM', 'ITJ')),
        ('CONJ', ('KOUI', 'KOUS', 'KON', 'KOKOM')),
        ('NOUN', ('NN', 'NE')),
        ('PRON', ('PDS', 'PDAT', 'PIS', 'PIAT', 'PIDAT',
                  'PPER', 'PPOSS', 'PPOSAT', 'PRELS', 'PRELAT',
                  'PRF', 'PWS', 'PWAT', 'PWAV', 'PAV')),
        ('PRT', ('PTKZU', 'PTKNEG', 'PTKVZ', 'PTKANT', 'PTKA')),
        ('X', ('TRUNC',)),
        ('VERB', ('VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
                  'VAFIN', 'VAIMP', 'VAINF', 'VAPP',
                  'VMFIN', 'VMINF', 'VMPP')),
        ('X', ('XY',)),
        ('.', ('$,', '$.', '$(')),
    )
    for stts_tag in stts_tags
}

In [249]:
def readfile(filename, tag_map=None):
    """Read a tagged corpus file into a list of (word, tag) sentences.

    Each line of the file is one sentence. Within a line, labeled words are
    separated by ';' and every labeled word has the form 'word/TAG'. Pieces
    that do not split into exactly two parts on '/' are ignored.

    Args:
        filename: Path of the corpus file to read.
        tag_map: Optional mapping from the tags used in the file to the tags
            that should appear in the result. Defaults to the module-level
            ``stts_to_univ`` table.

    Returns:
        A list of sentences, each a non-empty list of ``(word, mapped_tag)``
        tuples. Lines that yield no word-tag pair (e.g. blank lines) are
        skipped, so the result never contains empty sentences.

    Raises:
        KeyError: If a tag found in the file is missing from ``tag_map``.
        OSError: If the file cannot be opened.
    """
    if tag_map is None:
        tag_map = stts_to_univ

    labeled_sentences = []
    with open(filename, 'r') as f:
        for line in f:  # sentences are split by newline
            labeled_sentence = []
            for pair in line.split(';'):  # labeled words are split by ;
                splitted = pair.split('/')  # word and tag are split by /
                if len(splitted) != 2:
                    # not an actual word-tag pair
                    continue
                word = splitted[0].strip()
                tag = tag_map[splitted[1].strip()]  # map the file's tag to the target tagset
                labeled_sentence.append((word, tag))
            if labeled_sentence:
                labeled_sentences.append(labeled_sentence)

    # The result is the list built above (not some notebook-global variable).
    return labeled_sentences

In [250]:
# Load the training corpus and the small separate mini test set, in that order.
sentences, sentences_minitest = (
    readfile(corpus_file)
    for corpus_file in ("POS_German_train.txt", "POS_German_minitest.txt")
)

In [251]:
from sklearn.model_selection import train_test_split

# Keep 20% of the training corpus aside as extra test data (in addition to
# sentences_minitest); the fixed random_state makes the split repeatable.
sentences_train, sentences_test = train_test_split(
    sentences,
    test_size=0.2,
    random_state=9000,
)

In [252]:
# Peek at the first three training sentences as lists of (word, tag) tuples.
sentences_train[:3]

[[('Angesichts', 'ADP'),
  ('der', 'DET'),
  ('verstaerkten', 'ADJ'),
  ('Rolle', 'NOUN'),
  ('Deutschlands', 'NOUN'),
  ('in', 'ADP'),
  ('der', 'DET'),
  ('Weltwirtschaft', 'NOUN'),
  ('muessten', 'VERB'),
  ('auch', 'ADV'),
  ('die', 'DET'),
  ('globalen', 'ADJ'),
  ('Aspekte', 'NOUN'),
  ('der', 'DET'),
  ('Entwicklungshilfe', 'NOUN'),
  ('staerker', 'ADJ'),
  ('zum', 'ADP'),
  ('Tragen', 'NOUN'),
  ('kommen', 'VERB'),
  ('.', '.')],
 [('Interessant', 'ADJ'), ('.', '.')],
 [('Der', 'DET'),
  ('Eindruck', 'NOUN'),
  ('vom', 'ADP'),
  ('Ende', 'NOUN'),
  ('einer', 'DET'),
  ('Epoche', 'NOUN'),
  ('wird', 'VERB'),
  ('verstaerkt', 'VERB'),
  (',', '.'),
  ('nachdem', 'CONJ'),
  ('auch', 'ADV'),
  ('auf', 'ADP'),
  ('dem', 'DET'),
  ('Aushaengeschild', 'NOUN'),
  ('der', 'DET'),
  ('Genossenschaftswirtschaft', 'NOUN'),
  (',', '.'),
  ('dem', 'DET'),
  ('Einzelhandelskonzern', 'NOUN'),
  ('Konsum', 'NOUN'),
  (',', '.'),
  ('der', 'DET'),
  ('Pleitegeier', 'NOUN'),
  ('klebt', 'VERB'),

In [286]:
from nltk.tag.hmm import HiddenMarkovModelTrainer, HiddenMarkovModelTagger
from nltk.probability import LidstoneProbDist

# Collect every tag and every word form; they are handed to the trainer as its
# initial states and symbols.
# NOTE(review): both sets are built from all of `sentences`, i.e. including the
# held-out test split, not only from `sentences_train` -- confirm this is intended.
tag_set = {tag for sentence in sentences for _, tag in sentence}
word_set = {word for sentence in sentences for word, _ in sentence}

# pass lists so that items can potentially be appended
trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_set))


def _lidstone_estimator(fd, bins):
    """Build a Lidstone-smoothed distribution (gamma = 0.1) from a frequency distribution."""
    return LidstoneProbDist(fd, .1, bins)


tagger = trainer.train_supervised(sentences_train, estimator=_lidstone_estimator)

In [None]:
# Evaluate the tagger on the held-out 20% split.
# verbose=True is left off because it takes forever; author's note: super high accuracy.
tagger.test(sentences_test)

In [None]:
# Evaluate the tagger on the separate mini test set (POS_German_minitest.txt).
# verbose=True is left off because it takes forever; author's note: super high accuracy.
tagger.test(sentences_minitest)

In [288]:
# Sanity check: tag a whitespace-tokenised example sentence with the trained tagger.
tagger.tag("Der schnelle Fuchs springt über den faulen Zaun".split())

[('Der', 'DET'),
 ('schnelle', 'ADJ'),
 ('Fuchs', 'NOUN'),
 ('springt', 'VERB'),
 ('über', 'ADP'),
 ('den', 'DET'),
 ('faulen', 'ADJ'),
 ('Zaun', 'NOUN')]