# SymetryML Hidden Markov Model Part of Speech tagger Tutorial

This tutorial gives an example of how to use SymetryML to build a part-of-speech tagger using the Brown universal corpus.

Please make sure that:

* `sym-spark-assembly.jar` is present in the `/opt/symetry/lib/` folder
* `SMLPy4JGateway.py` is present in the `/opt/symetry/python` folder

In [None]:
import sys
sys.path.append("/opt/symetry/python")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import py4j.java_gateway as py4jjg
import sys
import time

import SMLPy4JGateway as smlgw

from collections import Counter, defaultdict, namedtuple, OrderedDict
import random

In [None]:
# Helper functions to load the Brown sentences and split them into train / test sets
# ============================================================
def read_data(filename):
    """Load a tagged corpus file into an ordered mapping of sentences.

    The file is read as chunks separated by an empty line. The first line of
    a chunk is the sentence key; every following line is a ``word<TAB>tag``
    pair.

    Blank or whitespace-only lines inside a chunk (for example the one left
    behind when the file ends with a single newline) are ignored. Chunks that
    have a key but no token lines are skipped, because they carry no data and
    cannot be unpacked into words and tags.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        An ``OrderedDict`` mapping each sentence key to a ``Sentence``
        namedtuple holding two parallel tuples, ``words`` and ``tags``.
        Keys appear in file order.
    """
    Sentence = namedtuple("sentences", "words tags")
    with open(filename, 'r') as f:
        chunks = f.read().split("\n\n")

    sentences = OrderedDict()
    for chunk in chunks:
        key, *token_lines = chunk.split("\n")
        if not key:
            # Chunk without a key line (e.g. the empty tail of the file).
            continue
        rows = [line.strip().split("\t")
                for line in token_lines if line.strip()]
        if not rows:
            continue
        # zip(*rows) transposes [(word, tag), ...] into (words, tags).
        sentences[key] = Sentence(*zip(*rows))
    return sentences
# ============================================================
def get_df_for_keys(in_keys, in_data):
    """Flatten the selected sentences into one (word, pos) DataFrame.

    Args:
        in_keys: Iterable of sentence keys, processed in the given order.
        in_data: Mapping from key to a pair whose first item is the sequence
            of words and whose second item is the sequence of tags.

    Returns:
        A ``pandas.DataFrame`` with columns ``word`` and ``pos`` holding one
        row per token, with the sentences concatenated in ``in_keys`` order.
    """
    rows = []
    for key in in_keys:
        sentence = in_data[key]
        tokens, tags = sentence[0], sentence[1]
        # Pair every word with the tag found at the same position.
        rows.extend((token, tags[idx]) for idx, token in enumerate(tokens))
    return pd.DataFrame(rows, columns=['word', 'pos'])

# ============================================================
# Shuffle the sentence keys and split the data into train / test DataFrames
def split_data(data, split=0.8, seed=1):
    """Shuffle the sentence keys and split the corpus into train and test.

    Args:
        data: Mapping from sentence key to its (words, tags) pair.
        split: Fraction of the sentences assigned to the training set.
        seed: Seed handed to ``random.seed`` before shuffling; pass ``None``
            to shuffle with the generator's current state.

    Returns:
        A 4-tuple ``(train_df, test_df, training_keys, testing_keys)`` where
        the DataFrames are built by ``get_df_for_keys`` and the key lists
        record which sentences ended up in each set.
    """
    shuffled = list(data.keys())
    if seed is not None:
        # Seeds the module-level generator so the shuffle is reproducible.
        random.seed(seed)
    random.shuffle(shuffled)

    cut = int(split * len(shuffled))
    training_keys, testing_keys = shuffled[:cut], shuffled[cut:]

    return (
        get_df_for_keys(training_keys, data),
        get_df_for_keys(testing_keys, data),
        training_keys,
        testing_keys,
    )

# Start the java gateway

* It is also possible to pass extra JVM parameters using the optional `jvm_options` parameter:
    * `jvm_options="-Xms2g -Xmx4g"`
* Normally `java_classpath` should point to `/opt/symetry/lib/*`, e.g.:
    * `java_classpath='/opt/symetry/lib/*'`

In [None]:
# Start the Py4J server.
# The returned handle is kept so that the clean-up cell can stop the server
# with kill_server().
gateway_server = smlgw.SMLPy4JGatewayServer(java_classpath='/opt/symetry/lib/*')

# Getting the Python client

In [None]:
# Delay to allow Py4J gateway server to be ready.
# NOTE(review): this is a fixed 2-second wait, not a readiness check — confirm
# it is long enough on slow hosts.
time.sleep(2)
gateway = smlgw.get_python_client()
# Shortcut to the JVM view; later cells reach Java classes through it
# (sml.DataFrame, sml.CoreUtil).
sml = gateway.jvm

# Create a local, unpersisted SML project. 

* To use SymetryML's Hidden Markov Model, we need a sequence project. This is achieved by passing either `21` or `"sequence"` as the project type in the following code block:

In [None]:
# Parameters of the project; they are passed positionally to createSMLProject below.
user = 'c1'
prj_name = 'smlPOSTagger'
prj_type = 'sequence'   # sequence project, 21 would also be valid
persist = False         # the project is not persisted

# `prj` is the project handle used by every following cell
# (learnSequence, buildHMMModel, getModel, deleteModel, clear).
prj = smlgw.createSMLProject(
    gateway,
    user,
    prj_name,
    prj_type,
    persist)

In [None]:
%%time
# Load the Brown corpus and split it into 2 datasets: train and test.
the_data = read_data("./brown-universal.txt")
# split_data also returns the sentence keys of each set; test_keys is used
# later to score the model.
train_df, test_df, train_keys, test_keys = split_data(the_data)
# Number of rows (one per token) in the training DataFrame.
print(len(train_df))

# Process/ingest the training dataframe into the SymetryML project

In [None]:
%%time
# NOTE(review): `order` is passed to learnSequence below; presumably the
# order of the sequence model — confirm against the SymetryML docs.
order = 2
# NOTE(review): presumably one attribute-type code per DataFrame column
# (word, pos) — confirm the meaning of "T".
attr_types = "T,T"
row_count = 0
# Not referenced again in the cells shown here.
ll = len(train_df)


# Convert the pandas DataFrame to SML JSON, load it into a Java-side
# DataFrame and feed that DataFrame to the project.
pdfJson = smlgw.pandas_df_to_sml_json(train_df, attr_types)
jdf = sml.DataFrame()
jdf.fromJSON(pdfJson)
prj.learnSequence(jdf, order)

# Count the rows using the size reported by the Java-side DataFrame.
row_count += jdf.getSize()

print("Rows processed: %d" % row_count)
# print("Learned dataset %s" % the_data)

# Building HMM Part of Speech Model.

In [None]:
%%time
# "pos" and "word" are the column names of the training DataFrame; they are
# passed to buildHMMModel as the hidden and observed attributes respectively.
hidden = "pos"
observed = "word"
# The same name is used later to fetch and to delete the model.
model_name = "hmmPOSModel"
# The return value is turned into a status string in the next cell.
rez = prj.buildHMMModel(hidden, observed, model_name)


In [None]:
#print(rez)
# NOTE(review): presumably converts the value returned by buildHMMModel into a
# human-readable status/error message — confirm against CoreUtil docs.
status_str = sml.CoreUtil.getErrorString(rez)
print(status_str)

# Predict Tag example with one sentence:

In [None]:

# Fetch the model built above; `hmmModel` is also read as a global by
# _accuracy() further down.
hmmModel = prj.getModel(model_name)
# The observation sequence is handed to predict() as a java.util.ArrayList.
obsj = gateway.jvm.java.util.ArrayList()
obs = ['She','had','the','opportunity','that','few','clever','women','can','resist'] 
for tok in obs:
    obsj.add(tok)
    
res = hmmModel.predict(obsj)
# Last expression of the cell, so the notebook displays it. _accuracy() reads
# the same 'seq' entry and parses it as one comma-separated record of tags.
res['seq']

# Compute accuracy of the HMM model on out-of-sample data

In [None]:
# helper function for accuracy calculation
from io import StringIO
import csv
def _get_pos_csv_(tt):
    """Parse a single comma-separated record into a list of fields.

    Args:
        tt: String holding at most one CSV record, e.g. ``"det,noun,verb"``.

    Returns:
        The list of fields of that record, or an empty list when ``tt``
        contains no record at all (empty string).

    Raises:
        Exception: If ``tt`` holds more than one CSV record.
    """
    rows = list(csv.reader(StringIO(tt), delimiter=','))
    if len(rows) > 1:
        raise Exception("More than one result in tt")
    # An empty input yields no rows; return an empty list instead of failing
    # on an unbound loop variable.
    return rows[0] if rows else []

def _accuracy(in_keys, in_data):
    """Compute the token-level tagging accuracy over the given sentences.

    Relies on the notebook globals ``gateway`` (Py4J client) and ``hmmModel``
    (the model to evaluate). A progress line is printed after each sentence.

    Args:
        in_keys: Sequence of sentence keys to evaluate.
        in_data: Mapping from key to a pair whose first item is the sequence
            of words and whose second item is the sequence of expected tags.

    Returns:
        Fraction of tokens whose predicted tag equals the lower-cased
        expected tag, as a float. Returns ``0.0`` when ``in_keys`` is empty
        or no token has been scored.

    Raises:
        Exception: If the model returns a different number of tags than the
            sentence has expected tags.
    """
    good_count = 0
    total_count = 0
    acc = 0.0  # returned unchanged when there is nothing to score
    tk = len(in_keys)

    for k, a_test_key in enumerate(in_keys, start=1):
        a_test = in_data[a_test_key]
        words = a_test[0]
        pos = a_test[1]

        # The observation sequence is handed to predict() as a Java list.
        obsj = gateway.jvm.java.util.ArrayList()
        for tok in words:
            obsj.add(tok)

        res = hmmModel.predict(obsj)
        hsa = _get_pos_csv_(res.get('seq'))   # hidden state array

        lhsa = len(hsa)
        lpos = len(pos)
        if lhsa != lpos:
            raise Exception("Part of Speech good value array len:%d != predicted array len:%d"
                           % (lpos, lhsa))

        # Expected tags are lower-cased before comparison; predictions are
        # compared exactly as returned by the model.
        good_count += sum(1 for good_value, pred_value in zip(pos, hsa)
                          if good_value.lower() == pred_value)
        total_count += lhsa

        # Guard against dividing by zero while no token has been scored yet.
        if total_count:
            acc = good_count / total_count
        print("prediction processed: %d of:%d pct:%f accuracy:%f \r" % (k, tk, (k / tk * 100), acc), end='')

    return acc

In [None]:
%%time
# Score the model on the held-out test sentences produced by split_data.
acc_test = _accuracy(test_keys, the_data)   
print("accuracy test %f" % acc_test)   

In [None]:
# Clean up SML...
# Remove the model first, then clear the project.
prj.deleteModel(model_name)
prj.clear()
# Shut down the Python client, then stop the gateway server started earlier.
gateway.shutdown()
gateway_server.kill_server()
